In [24]:
import boto3
import botocore
import os
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pandas as pd
import string
import seaborn as sns
import re
import random
import nltk
import sys
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
# Gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.corpora import Dictionary
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Data 

Selecting the Top 500,000 based on Retweets

In [20]:
# Load the tweet dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='April_Data')

In [21]:
# Keep one row per distinct tweet text, then rank by retweet count and
# retain the 500,000 highest-ranked rows.
df = df.drop_duplicates(subset="Tweet").reset_index(drop=True)
df1 = df.sort_values(by='Retweets', ascending=False).head(500000)

## Preprocessing


In [22]:
import re
def find_retweeted(tweet):
    '''Extract the Twitter handles of retweeted users.

    A handle counts as retweeted when it directly follows "RT" and one
    whitespace character. Returns a list of handles including the leading "@".
    '''
    # Raw string: "\s" is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape-sequence warning on newer Pythons.
    return re.findall(r'(?<=RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

def find_mentioned(tweet):
    '''Extract the Twitter handles mentioned in the tweet.

    Handles that directly follow "RT" and one whitespace character are treated
    as retweet sources and skipped. Returns a list of handles including the
    leading "@".
    '''
    # Raw string avoids the invalid "\s" string escape of the non-raw literal.
    return re.findall(r'(?<!RT\s)(@[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

def find_hashtags(tweet):
    '''Extract hashtags (including the leading "#") from the tweet.

    A hashtag must start with a letter and be at least two characters long
    after the "#"; purely numeric tags such as "#1" are not returned.
    '''
    # Raw string for consistency with the other regex helpers.
    return re.findall(r'(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)
def find_links(tweet):
    '''Extract http/https URLs from the tweet.

    A URL runs from the scheme up to the next whitespace character.
    '''
    # Raw string: "\S" is not a valid Python string escape in a normal literal.
    return re.findall(r'(https?://\S+)', tweet)


In [23]:
# Derive one list-valued column per entity type from the tweet text.
for column_name, extractor in (
    ('retweeted', find_retweeted),
    ('mentioned', find_mentioned),
    ('hashtags', find_hashtags),
    ('links', find_links),
):
    df1[column_name] = df1['Tweet'].apply(extractor)

In [25]:
def clean_text(text):
    """Strip Twitter markup from a tweet and return the remaining plain text.

    Removes, in order: http/https links, @handles, #hashtags, digit runs, the
    standalone retweet marker "RT" and colons. Whitespace (including line
    breaks) is then collapsed to single spaces and trimmed.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN from an empty CSV cell)
        yield an empty string instead of raising.

    Returns
    -------
    str
        The cleaned tweet.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'https?://\S+', '', text)  # links
    # One pattern covers both retweeted ("RT @x") and plainly mentioned handles.
    text = re.sub(r'@[A-Za-z]+[A-Za-z0-9-_]+', '', text)
    # Hashtags go before digits so that "#covid19" is removed as a whole.
    text = re.sub(r'#[A-Za-z]+[A-Za-z0-9-_]+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    # Word boundaries keep words that merely contain "RT" (SMART, ALERT) intact.
    text = re.sub(r'\bRT\b', ' ', text)
    text = text.replace(':', ' ')
    # Normalise whitespace last: the substitutions above leave gaps behind.
    return re.sub(r'\s+', ' ', text).strip()
# Overwrite the tweet text with its cleaned form.
df1['Tweet'] = df1['Tweet'].map(clean_text)

In [26]:
def stem(text):
    """Return the stem of a single token using the shared Snowball stemmer.

    ``stemmer`` is not created anywhere else in the notebook, so it is built
    on first use; an already-defined global ``stemmer`` is reused untouched.
    """
    global stemmer
    if 'stemmer' not in globals():
        # Assumes the tweets are English — TODO confirm.
        stemmer = SnowballStemmer('english')
    return stemmer.stem(text)
    
def preprocess(text):
    """Tokenise ``text`` with gensim and return the stems of the kept tokens.

    A token is kept when it is longer than three characters and is not in
    gensim's STOPWORDS set.
    """
    stop_set = gensim.parsing.preprocessing.STOPWORDS
    return [
        stem(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_set
    ]

## Visualisation and Analysis on the Cleaned Tweets

In [None]:
# One list of stemmed tokens per cleaned tweet.
docs = df1['Tweet'].apply(preprocess)

In [None]:
# Wrap the token lists in a single-column frame (the column keeps the Series name).
df2 = docs.to_frame()

In [None]:
# Frequency of every token across all tweets.
word_freq = FreqDist(
    token
    for tokens in df2['Tweet']
    for token in tokens
)

Top 20 unigrams

In [None]:
# Tabulate the token frequencies and plot the 20 most common tokens.
df_word_freq = pd.DataFrame.from_dict(word_freq, orient='index', columns=['count'])
top20w = df_word_freq.sort_values('count', ascending=False).head(20)
plt.style.use('dark_background')
plt.figure(figsize=(8,6))
# seaborn >= 0.12 no longer accepts x/y positionally, so name them explicitly.
sns.barplot(x=top20w['count'], y=top20w.index)
plt.title('Top 20 words')
plt.show()

Top 20 Bigrams

In [None]:
# ``d_fq`` was used here without being defined anywhere in the notebook.
# Count adjacent token pairs within each preprocessed tweet; the keys are
# 2-tuples, which the ' '.join below turns into "word1 word2" labels.
d_fq = FreqDist(bigram for tokens in df2.Tweet for bigram in nltk.bigrams(tokens))
bgdf_d = pd.DataFrame.from_dict(d_fq, orient='index', columns=['count'])
bgdf_d.index = bgdf_d.index.map(lambda x: ' '.join(x))
plt.figure(figsize=(16,7))
plt.subplot(121)
bgdf_d = bgdf_d.sort_values('count',ascending=False)
# seaborn >= 0.12 requires x and y to be passed by keyword.
sns.barplot(x=bgdf_d.head(20)['count'], y=bgdf_d.index[:20], color='pink')
plt.title('Top bigrams in the tweets')
plt.show()

Cleaning and Visualising Locations

In [None]:
# Locations that occur at least 10 times, and the tweets posted from them.
raw_loc = df1['Location'].value_counts()
top_loc = raw_loc.index[raw_loc >= 10].tolist()
top_only = df1.loc[df1['Location'].isin(top_loc)]

In [None]:
# Ordered (pattern, label) rules: the first pattern found in a location wins,
# so cities are listed before the states and countries that may also appear
# in the same string. Patterns are case-sensitive substring searches.
_LOCATION_RULES = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r'New York|NYC', 'New York'),
        (r'London', 'London'),
        (r'Mumbai', 'Mumbai'),
        # "Washington" plus a capital "D" and a capital "C" anywhere in the string.
        (r'(?s)^(?=.*Washington)(?=.*D)(?=.*C)', 'Washington DC'),
        (r'San Francisco', 'San Francisco'),
        (r'Los Angeles', 'Los Angeles'),
        (r'Seattle', 'Seattle'),
        (r'Chicago', 'Chicago'),
        (r'Toronto', 'Toronto'),
        (r'Sacramento', 'Sacramento'),
        (r'Atlanta', 'Atlanta'),
        (r'California', 'California'),
        (r'Florida', 'Florida'),
        (r'Texas', 'Texas'),
        (r'United States|USA', 'USA'),
        (r'United Kingdom|UK|Britain', 'UK'),
        (r'Canada', 'Canada'),
        # "Indiana" / "Indianapolis" contain "India" but are not India.
        (r'India(?!na)', 'India'),
        (r'Kenya', 'Kenya'),
        (r'Nigeria', 'Nigeria'),
        (r'Australia', 'Australia'),
        (r'Indonesia', 'Indonesia'),
    )
)


def clean_loc(x):
    """Map a free-text location string to a canonical place name.

    Parameters
    ----------
    x : str
        Location text.

    Returns
    -------
    str or None
        'None' for the literal string 'None'; 'World' for 'Earth',
        'Worldwide' or 'Everywhere'; the label of the first matching rule in
        ``_LOCATION_RULES``; otherwise ``x`` itself when it is listed in the
        global ``top_loc``; otherwise ``None``.
    """
    if x == 'None':
        return 'None'
    if x in ('Earth', 'Worldwide', 'Everywhere'):
        return 'World'
    for pattern, label in _LOCATION_RULES:
        if pattern.search(x):
            return label
    if x in top_loc:
        return x
    return None
    
# Stringify every location first (missing values become text) and canonicalise it.
df1['location_clean'] = df1['Location'].map(lambda loc: clean_loc(str(loc)))

In [None]:
# Horizontal count plot restricted to the 25 most frequent cleaned locations.
plt.figure(figsize=(9, 6))
top25_locations = df1['location_clean'].value_counts().head(25).index
sns.countplot(y=df1['location_clean'], order=top25_locations)
plt.title('Top 25 locations')
plt.show()

In [None]:
# NOTE: ``df_clean`` is an alias of ``df1`` (no copy is made), so the in-place
# de-duplication below removes the duplicate rows from ``df1`` as well.
df_clean = df1
df_clean.drop_duplicates(subset=['Tweet'], inplace=True)
# Single-column frame holding only the tweet text.
data_text = df_clean.loc[:, ['Tweet']]

In [None]:
# ``stop_words`` was never defined in the notebook; build it from the NLTK
# stopword corpus. Assumes the tweets are English — TODO confirm.
stop_words = set(stopwords.words('english'))

data_text = data_text.astype('str')
# Replace each tweet string with its list of non-stopword tokens. Assigning the
# whole column avoids the chained ``.iloc[idx]['Tweet'] = ...`` write, which
# pandas does not guarantee to propagate back to ``data_text``, and removes
# the row-by-row Python loop.
data_text['Tweet'] = data_text['Tweet'].map(
    lambda tweet: [word for word in tweet.split(' ') if word not in stop_words]
)

In [None]:
# Pull the first (only) column out as a plain Python list, one entry per tweet.
text = data_text.iloc[:, 0].tolist()

In [None]:
# Re-join each token list into a single space-separated string.
texts = list(map(' '.join, text))

In [None]:
# Topic-count setting; it is not referenced again in the visible cells.
num_topics = 10

## Utilising IBM Watson for Topic Modelling on Top 10k Tweets

In [None]:
# Restrict the Watson analysis to the first 10,000 rows of the frame.
df1 = df1.head(10000)

Extracting Information Using Watson

In [None]:
def get_sample(df, nums, size=50):
    """Return the batch of ``size`` consecutive rows of ``df`` starting at ``nums``.

    Parameters
    ----------
    df : sliceable
        Any object supporting ``df[start:stop]`` (DataFrame, list, ...).
    nums : int
        Position of the first row of the batch.
    size : int, default 50
        Batch length; the default matches the previously hard-coded value.

    Returns
    -------
    The slice ``df[nums:nums + size]``; shorter when fewer rows remain.
    """
    return df[nums:nums + size]
def _dominant_emotion(response):
    """Return the name of the highest-scoring document emotion in ``response``."""
    scores = response["emotion"]["document"]["emotion"]
    # Select by key rather than by position so the label cannot be mismatched
    # if the service returns the emotions in a different order.
    return max(scores, key=scores.get)


def perform(df_try):
    """Analyse every tweet in ``df_try`` with the NLU service and tabulate the results.

    ``service``, ``Features`` and the option classes used below are not defined
    in the visible cells; they must exist in the notebook namespace before
    this function is called.

    Parameters
    ----------
    df_try : pandas.DataFrame
        Frame with a "Tweet" column.

    Returns
    -------
    pandas.DataFrame
        One row per successfully analysed tweet with the columns Tweet,
        Language, Sentiment, Emotion, Keyword and Categories. Tweets for which
        the service call fails are reported on stdout and skipped. Missing
        response fields fall back to 'en' / 'neutral' / 'sadness' / '----' /
        'Unknown' respectively.
    """
    columns = ["Tweet", "Language", "Sentiment", "Emotion", "Keyword", "Categories"]
    records = []

    for tweet in df_try["Tweet"]:
        try:
            # NOTE(review): DocumentSentimentResults looks like a response
            # model rather than a request option; SentimentOptions is probably
            # what was intended — confirm against the ibm-watson SDK.
            response = service.analyze(
                text=tweet,
                features=Features(
                    sentiment=DocumentSentimentResults(),
                    emotion=EmotionOptions(),
                    keywords=KeywordsOptions(),
                    categories=CategoriesOptions(),
                ),
            ).get_result()
        except Exception:
            # Best effort: a failed call skips the tweet. ``Exception`` (not a
            # bare except) lets KeyboardInterrupt stop a long run.
            print("Error in Tweet: ", tweet)
            continue

        try:
            lan = response["language"]
            sent = response["sentiment"]["document"]["label"]
        except (KeyError, TypeError):
            lan = 'en'
            sent = 'neutral'

        try:
            emot = _dominant_emotion(response)
        except (KeyError, TypeError, ValueError):
            emot = "sadness"

        try:
            word = response["keywords"][0]["text"]
        except (KeyError, IndexError, TypeError):
            word = '----'

        try:
            cat = response['categories'][0]['label']
        except (KeyError, IndexError, TypeError):
            cat = 'Unknown'

        records.append({"Tweet": tweet, "Language": lan, "Sentiment": sent,
                        "Emotion": emot, "Keyword": word, "Categories": cat})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(records, columns=columns)

def clean_simple(x):
    """Lower-case and strip an iterable of tweets, returning the distinct results.

    For each tweet: lower-case it; skip it when it is a truncated retweet
    (starts with the word "rt" and ends with "..."); drop a leading "rt"
    marker; replace @handles, URLs and every character other than ASCII
    letters, digits, space and tab with a space; collapse whitespace.

    Parameters
    ----------
    x : iterable of str

    Returns
    -------
    set of str
        The distinct cleaned tweets (unordered).
    """
    # Raw string: "\w", "\/" and "\S" are not valid Python string escapes.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
    cleaned = set()
    for tw in x:
        tw = tw.lower()
        # Require a word boundary so ordinary words that merely start with
        # "rt" (e.g. "rtx") are not mistaken for the retweet marker.
        is_retweet = re.match(r'rt\b', tw) is not None
        if is_retweet and tw.endswith('...'):
            continue
        if is_retweet:
            tw = tw[2:]
        cleaned.add(' '.join(noise.sub(' ', tw).split()))
    return cleaned

def analyse_tweet(df):
    """Return a one-column frame of the distinct, simplified tweets in ``df``.

    Only the "Tweet" column is read; the input frame is not modified. Missing
    values are ignored. The tweets are passed through ``clean_simple`` and
    returned in sorted order so that repeated runs give the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Tweet" column.

    Returns
    -------
    pandas.DataFrame
        Single "Tweet" column holding the cleaned, de-duplicated tweets.
    """
    tweets = df["Tweet"].dropna().drop_duplicates()
    # clean_simple returns an unordered set. The earlier code took
    # ``list(set)[1:]``, which silently discarded one arbitrary tweet per call
    # and produced a run-dependent order; keep every tweet and sort instead.
    cleaned = sorted(clean_simple(tweets))
    return pd.DataFrame(cleaned, columns=["Tweet"])

def remove_short(df):
    """Drop, in place, every row whose 'Length' value is below 30.

    The frame passed in is modified and also returned.
    """
    too_short = df.index[df['Length'] < 30]
    df.drop(index=too_short, inplace=True)
    return df

def extract_information(df):
    """Clean the tweets in ``df`` and run the Watson analysis over them in batches.

    This is the single definition of the function; an earlier variant without
    the length filter and the "Categories" column was silently shadowed by
    this one and has been removed.

    Steps: simplify and de-duplicate the tweets with ``analyse_tweet``, discard
    tweets shorter than 30 characters, then send the remainder to ``perform``
    in batches of 50 obtained through ``get_sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``analyse_tweet``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-batch ``perform`` results (each batch keeps
        its own 0-based index). An empty frame with the columns Tweet,
        Language, Sentiment, Emotion, Keyword and Categories is returned when
        no tweet survives the filtering.
    """
    print("Converting Data into suitable format...")
    df = analyse_tweet(df)

    print("Removing tweets with characters below 30.")
    df = df[df["Tweet"].map(len) >= 30]

    print("Data format complete!")

    columns = ["Tweet", "Language", "Sentiment", "Emotion", "Keyword", "Categories"]
    total = df.shape[0]
    results = []
    for nums in range(0, total, 50):
        print("Getting results:", (nums/total)*100, "%...")
        df_mini = get_sample(df, nums).reset_index(drop=True)
        results.append(perform(df_mini))
    print("Analysis complete!")

    if not results:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(results, axis=0)

In [None]:
def main(dataframe, chunk_size=100):
    """Run ``extract_information`` over ``dataframe`` chunk by chunk.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Tweets to analyse.
    chunk_size : int, default 100
        Number of rows handed to ``extract_information`` per call.

    Returns
    -------
    pandas.DataFrame
        All chunk results stacked, with a fresh 0..n-1 index. An empty frame
        with the columns Tweet, Language, Sentiment, Emotion, Keyword and
        Categories is returned for empty input.
    """
    columns = ["Tweet", "Language", "Sentiment", "Emotion", "Keyword", "Categories"]
    chunks = []
    # range() stops before ``size``; the previous ``i <= size`` condition ran
    # one extra, empty chunk whenever the row count was a multiple of the
    # chunk size.
    for start in range(0, dataframe.shape[0], chunk_size):
        print("TWEET CHUNK", start, "TO", start + chunk_size)
        chunks.append(extract_information(dataframe[start:start + chunk_size]))
        print()
    if not chunks:
        return pd.DataFrame(columns=columns)
    # reset_index returns a new frame; its result was previously discarded.
    return pd.concat(chunks, axis=0).reset_index(drop=True)

In [None]:
# First 5,000 rows: the batch analysed and saved in the cells below.
df_test = df1.head(5000)

In [None]:
# Run the chunked Watson analysis over the selected batch.
res = main(dataframe=df_test)

In [None]:
# Keep only rows detected as English, renumber them, and write them to disk
# (to_csv also writes the index and a header row by default).
res = res.loc[res['Language'] == 'en'].reset_index(drop=True)
res.to_csv('5000_1')

Similarly, the results for the next 5,000 tweets were saved to a second CSV file (`5000_2`).

## Visualisation and Analysis on Watson Data

In [None]:
# Both result files share one layout; 'Temp' receives the saved index column.
watson_columns = ['Temp', 'Tweet', 'Language', 'Sentiment', 'Emotion', 'Keyword', 'Categories']
df = pd.read_csv('5000_1', names=watson_columns)
df_2 = pd.read_csv('5000_2', names=watson_columns)

In [None]:
# The placeholder index column is not needed for the analysis.
df = df.drop(columns='Temp')
df_2 = df_2.drop(columns='Temp')

In [None]:
# Stack both batches, then remove every row labelled 0. Because explicit
# column names were supplied when reading, row 0 of a file is presumably its
# CSV header line read in as data.
df = pd.concat([df, df_2]).drop(index=0)

In [None]:
# Row counts per emotion label, most frequent first.
df_emotion = (
    df.groupby(['Emotion'])
    .count()
    .sort_values(by='Tweet', ascending=False)
)

In [None]:
# Bar chart of tweet counts per emotion.
plt.style.use('dark_background')
plt.figure(figsize=(8,6))
# seaborn >= 0.12 requires x and y to be passed by keyword.
sns.barplot(x=df_emotion['Tweet'], y=df_emotion.index)
plt.title('Emotion Analysis')
plt.show()

In [None]:
# Split every category label into its "/"-separated path segments.
cat = [label.split('/') for label in df['Categories']]

In [None]:
# Labels such as "/health and fitness/..." start with "/", so segment 1 is the
# top-level category. A label without "/" (e.g. the 'Unknown' fallback written
# by perform) has a single segment; use it instead of raising IndexError.
categories = [parts[1] if len(parts) > 1 else parts[0] for parts in cat]

In [None]:
# Attach the top-level category to each row and count rows per category,
# most frequent first.
df['Main Categories'] = categories
df_cat = (
    df.groupby(['Main Categories'])
    .count()
    .sort_values(by='Tweet', ascending=False)
)

In [None]:
# Bar chart of tweet counts per top-level category.
plt.style.use('dark_background')
plt.figure(figsize=(8,6))
# seaborn >= 0.12 requires x and y to be passed by keyword.
sns.barplot(x=df_cat['Tweet'], y=df_cat.index)
plt.title('Topics discussed')
plt.show()

In [None]:
# Count the second-level topics under "health and fitness".
health_categories = {}
for parts in cat:
    # A usable entry has at least ['', 'health and fitness', <subtopic>].
    # Checking the length up front replaces the bare ``except`` and also
    # guards the ``[1]`` lookup, which used to raise IndexError for labels
    # without "/".
    if len(parts) > 2 and parts[1] == 'health and fitness':
        subtopic = parts[2]
        health_categories[subtopic] = health_categories.get(subtopic, 0) + 1

In [None]:
# One row per health subtopic, ordered by how often it occurs.
df_health = pd.DataFrame(
    data=list(health_categories.values()),
    index=list(health_categories.keys()),
    columns=['Count'],
).sort_values(by='Count', ascending=False)

In [None]:
# Bar chart of tweet counts per health subtopic.
plt.figure(figsize=(10,6))
# seaborn >= 0.12 requires x and y to be passed by keyword.
sns.barplot(x=df_health['Count'], y=df_health.index)
plt.title('Health Topics discussed')
plt.show()