In [None]:
import json
import pandas as pd
import nltk
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
# Shared analyzer instance used by sentiment() further down.
# NOTE(review): presumably this needs the 'vader_lexicon' resource; on a fresh machine
# uncomment and run the download below once — TODO confirm whether it must run before this line.
sia = SentimentIntensityAnalyzer()
#nltk.download('vader_lexicon')

import os.path
import tarfile

import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
# Module-level NLP resources read by preprocess_text().
porter = PorterStemmer()
# NOTE(review): if this is a plain list (as NLTK's words() normally returns), every
# `word not in stop_words` check scans it; a set would be faster if nothing relies on list behavior.
stop_words = stopwords.words('english')

# Events covered by the dates analysed below (October 2018):
# - October 31st – 7 pm – Village Halloween Parade – 6th Avenue at Canal Street, between 6:30 and 8:30
# - October 13th – Diwali at Times Square
# - October 8th – Columbus Day Parade – Fifth Avenue, from 44th to 72nd
# - October 10th – Slice Out Hunger / pizza party – St. Anthony's Church / 155 Sullivan


# Days of interest, turned into zero-padded YYYY-MM-DD folder names.
year_month = "2018-10-"
dates = ["8", "10", "13", "31"]
folders = [year_month + date.zfill(2) for date in dates]
print(folders)

['2018-10-08', '2018-10-10', '2018-10-13', '2018-10-31']


In [None]:
# Name of a single hourly dump; nothing else in the visible cells reads fileName.
fileName = "twitter_us_2018-10-01_00h.json"

# Disabled loader for the file above. Note that json.load expects one JSON document,
# whereas the processing loop further down parses these files line by line.
# with open(fileName, encoding="utf8") as file:
#     #json_file = file.read()
#     tweets = json.load(file)

In [None]:
def bag_of_words(df_col): #tweets_df['text']
    """Count token occurrences for every entry of a Series.

    Entries that are exactly of type ``str`` are split on single spaces before
    counting; any other entry is handed to ``Counter`` unchanged (for example
    an already tokenised list).

    Returns a Series of ``collections.Counter`` objects, one per entry.
    """
    def count_tokens(entry):
        if type(entry) == str:
            tokens = entry.split(" ")
        else:
            tokens = entry
        return Counter(tokens)

    return df_col.apply(count_tokens)

def sentiment(df_col): #tweets_df['text']
    """Score every entry of ``df_col`` with the module-level analyzer ``sia``.

    Returns a Series holding, per entry, whatever ``sia.polarity_scores``
    returns; the helpers in this notebook read its 'pos', 'neg' and 'neu' keys.
    """
    scores = df_col.apply(sia.polarity_scores)
    return scores

def segregate_sentiment(nltk_sentiment):
    """Split a Series of polarity-score mappings into one Series per component.

    Each entry of ``nltk_sentiment`` must support lookup by the keys 'pos',
    'neg' and 'neu'.

    Returns a ``(pos, neg, neu)`` tuple of Series, in that order.
    """
    pos_sentiment, neg_sentiment, neu_sentiment = (
        # Bind the key as a default so each lambda keeps its own component.
        nltk_sentiment.apply(lambda scores, key=key: scores[key])
        for key in ("pos", "neg", "neu")
    )
    return pos_sentiment, neg_sentiment, neu_sentiment

def classify_sentiment(x, threshold=0.2):
    """Map one polarity-score mapping to a coarse sentiment label.

    Parameters
    ----------
    x : mapping
        Must support lookup by the keys 'pos', 'neg' and 'neu'.
    threshold : float, default 0.2
        Value a component has to strictly exceed to decide the label.

    Returns
    -------
    str
        "Positive", "Negative" or "Neutral" for the first component (checked in
        that order) whose score is above ``threshold``; "Mixed" when none is.
    """
    # Order is significant: when several components exceed the threshold the
    # earliest one in this sequence wins (e.g. pos=0.3, neg=0.6 -> "Positive").
    for key, label in (("pos", "Positive"), ("neg", "Negative"), ("neu", "Neutral")):
        if x[key] > threshold:
            return label
    return "Mixed"

def classify(df_col):
    """Label every polarity-score entry of ``df_col`` using ``classify_sentiment``.

    Returns a Series of the labels, one per entry.
    """
    labels = df_col.apply(classify_sentiment)
    return labels

def preprocess_text(text):
    """Turn one text into a list of stemmed keywords.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenise with ``word_tokenize``, drop tokens found
    in ``stop_words`` and stem what is left with ``porter``.

    Returns the list of stemmed tokens in their original order.
    """
    lowered = text.lower()
    # A translation table with only a "delete" set removes the punctuation characters.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    keywords = []
    for token in word_tokenize(without_punctuation):
        if token in stop_words:
            continue
        keywords.append(porter.stem(token))
    return keywords

def preprocess(df_col):
    """Run ``preprocess_text`` on every entry of ``df_col``.

    Returns a Series with one keyword list per entry.
    """
    keyword_lists = df_col.apply(preprocess_text)
    return keyword_lists

In [None]:
def process_df(tweets_df):
    """Build the per-tweet summary frame written out by the processing loop.

    Reads the 'id', 'text', 'lang', 'retweet_count' and 'coordinates' columns
    of ``tweets_df`` and derives two more from 'text': a 'Sentiment' label
    (``sentiment`` followed by ``classify``) and an 'Extracted Keywords' list
    (``preprocess``).

    Returns a new DataFrame with those seven columns, in the order listed below.
    """
    columns = {
        'id': tweets_df['id'],
        'text': tweets_df['text'],
        'Sentiment': classify(sentiment(tweets_df['text'])),
        'Extracted Keywords': preprocess(tweets_df['text']),
        'lang': tweets_df['lang'],
        'retweet_count': tweets_df['retweet_count'],
        'coordinates': tweets_df['coordinates'],
    }
    return pd.DataFrame(columns)

In [None]:
# Two-digit hour labels "00" .. "23" used in the hourly file names.
hours = [str(hr).zfill(2) for hr in range(24)]

def check_files_or_create(file_paths):
    """Extract the ``<file_path>.tar.gz`` archive of every path that has one.

    For each entry of ``file_paths`` the archive ``file_path + ".tar.gz"`` is
    looked up; when it exists it is unpacked into the first '/'-separated
    component of ``file_path``. Entries without an archive are skipped silently.

    Parameters
    ----------
    file_paths : iterable of str
        '/'-separated paths, given without the ``.tar.gz`` suffix.
    """
    for file_path in file_paths:
        archive_path = file_path + ".tar.gz"
        if not os.path.isfile(archive_path):
            continue
        folder = file_path.split('/')[0]
        # The context manager closes the archive even when extraction fails part-way.
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use it on archives from a trusted source (newer Python versions
        # accept filter="data" to harden this).
        with tarfile.open(archive_path) as my_tar:
            my_tar.extractall(folder)  # Extract here
    
# Minimum occurrences (exclusive) a keyword needs to appear in the hourly / daily summaries.
HOURLY_MIN_COUNT = 15
DAILY_MIN_COUNT = 100


def _keyword_counts(frame, min_count):
    """Count keywords per sentiment label, keeping only counts strictly above ``min_count``."""
    counts = (
        frame.set_index('Sentiment')['Extracted Keywords']
        .apply(pd.Series)   # one column per keyword position
        .stack()            # back to a single long Series; drops the padding NaNs
        .groupby(level=0)   # level 0 is the Sentiment label
        .value_counts()
    )
    return counts[counts > min_count]


# For every day folder: unpack the hourly archives, process each hourly file into
# an Excel sheet plus an hourly keyword summary, then write a daily keyword summary.
for folder in folders:
    file_paths = [folder + "/" + "twitter_us_" + folder + "_" + hour + "h.json" for hour in hours]
    print(file_paths)
    check_files_or_create(file_paths)

    output_dir = "output/" + folder
    # exist_ok replaces the separate existence check that used to run for every file.
    os.makedirs(output_dir, exist_ok=True)

    hourly_frames = []
    for file in file_paths:
        file_name = file.split("/")[1]
        file_stem = file_name.split(".")[0]
        # The JSON files are read from <folder>/data/<folder>/<file name>
        # (presumably the layout the archives unpack to — confirm against the data).
        json_file = folder + "/data/" + folder + "/" + file_name

        # One JSON document per line; the with-block closes the handle, and
        # blank lines are skipped instead of crashing json.loads.
        with open(json_file, encoding="utf8") as handle:
            tweets = [json.loads(line) for line in handle if line.strip()]
        tweets_df = pd.DataFrame(tweets)
        processed_df = process_df(tweets_df)
        processed_df.to_excel(output_dir + "/" + file_stem + ".xlsx")
        hourly_frames.append(processed_df)

        grouping = _keyword_counts(processed_df, HOURLY_MIN_COUNT)
        grouping.to_excel(output_dir + "/hourly_" + file_stem + ".xlsx")

    # DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
    # on every call; collect the hourly frames and concatenate once instead.
    daily_df = pd.concat(hourly_frames)
    daily = _keyword_counts(daily_df, DAILY_MIN_COUNT)
    daily.to_excel(output_dir + "/daily.xlsx")
#   daily_df[hr] = bag_of_words(processed_df['Extracted Keywords'])
#   daily_df.to_excel("output/" + folder + "/hourly.xlsx")

['2018-10-08/twitter_us_2018-10-08_00h.json', '2018-10-08/twitter_us_2018-10-08_01h.json', '2018-10-08/twitter_us_2018-10-08_02h.json', '2018-10-08/twitter_us_2018-10-08_03h.json', '2018-10-08/twitter_us_2018-10-08_04h.json', '2018-10-08/twitter_us_2018-10-08_05h.json', '2018-10-08/twitter_us_2018-10-08_06h.json', '2018-10-08/twitter_us_2018-10-08_07h.json', '2018-10-08/twitter_us_2018-10-08_08h.json', '2018-10-08/twitter_us_2018-10-08_09h.json', '2018-10-08/twitter_us_2018-10-08_10h.json', '2018-10-08/twitter_us_2018-10-08_11h.json', '2018-10-08/twitter_us_2018-10-08_12h.json', '2018-10-08/twitter_us_2018-10-08_13h.json', '2018-10-08/twitter_us_2018-10-08_14h.json', '2018-10-08/twitter_us_2018-10-08_15h.json', '2018-10-08/twitter_us_2018-10-08_16h.json', '2018-10-08/twitter_us_2018-10-08_17h.json', '2018-10-08/twitter_us_2018-10-08_18h.json', '2018-10-08/twitter_us_2018-10-08_19h.json', '2018-10-08/twitter_us_2018-10-08_20h.json', '2018-10-08/twitter_us_2018-10-08_21h.json', '2018-10-

In [None]:
# Show the keyword counts left in `grouping` by the last hourly file handled in the loop above.
# NOTE(review): the saved output lists counts below the loop's `> 15` filter, so it looks
# stale (produced before that filter existed) — re-run to confirm.
grouping

Sentiment           
Negative   traffic      10
           new           8
           halloween     7
           york          7
           delay         6
                        ..
Positive   🤠             1
           🤴🏼👸🏼          1
           🥘             1
           🥞             1
           🦇             1
Length: 4817, dtype: int64