<h1><center>Process Module - Sentiment Analysis on the Twitter Data </center></h1> 

## This file constructs the module which we import in Projectrun.ipynb file to run our application

## Importing necessary libraries

In [44]:
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

#### Importing pandas for dataframes

In [45]:
import pandas as pd

In [46]:
from os import listdir
from os.path import isfile, join

In [47]:
import geopy
import json    

In [48]:
import matplotlib.pyplot as pie_plot
import matplotlib.pyplot as plt

In [49]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
# Shared geocoder used by FindLocation. Nominatim's usage policy asks every
# client to identify itself, and recent geopy releases refuse to build the
# geocoder without an explicit user_agent.
geolocator = Nominatim(user_agent="twitter_sentiment_analysis")

In [50]:
import copy
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

In [51]:
from PIL import Image
import random
def grey_color_func( font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback that paints every word in a random dark grey.

    All arguments are accepted only so the function matches the callback
    signature it is used with (see plot_wordcloud); none of them influences
    the result.

    Returns:
        An ``hsl(...)`` colour string with zero hue and saturation and a
        lightness drawn uniformly from 3..23 percent.
    """
    lightness = random.randint(3, 23)
    return "hsl(0, 0%%, %d%%)" % lightness

In [52]:
from wordcloud import WordCloud, STOPWORDS

# Load the mask image used to shape the word cloud (see plot_wordcloud).
# The context manager closes the underlying file once the pixel data has
# been copied into the NumPy array.
with Image.open('twittermask.jpg') as mask_image:
    image_mask = np.array(mask_image)

In [53]:
import nltk
# Fetch the VADER lexicon resource (prints a status message when it is already up to date).
# Presumably required by SentimentIntensityAnalyzer, used in SentimentAnalysis - confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Shakthi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [54]:
import sys

In [55]:
from operator import itemgetter
from collections import OrderedDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from nltk import pos_tag
from nltk import ngrams
# Fetch the stop-word corpus read by getWholeString via stopwords.words(...).
nltk.download('stopwords')
# Fetch the 'punkt' resource; presumably backs word_tokenize - confirm.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shakthi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shakthi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
# Fetch the perceptron tagger resource; presumably backs pos_tag used in getWholeString - confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shakthi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [57]:
from bokeh.io import output_file, output_notebook, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, LogColorMapper, BasicTicker, ColorBar,
    Range1d, PanTool, WheelZoomTool, BoxSelectTool,BoxZoomTool
)
from bokeh.models.mappers import ColorMapper, LinearColorMapper
from bokeh.palettes import Viridis5
from bokeh.io import export_png

In [58]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display

## Loading the access tokens to connect to twitter

In [59]:
# Read the Twitter OAuth credentials from the local JSON file.
with open('auth_dict.json','r') as f:
    twtr_auth = json.load(f)

# Unpack the four credentials into individually named strings so the
# authentication code that follows stays readable.
(access_token,
 access_token_secret,
 consumer_key,
 consumer_secret) = (
    twtr_auth[field]
    for field in ('token', 'token_secret', 'consumer_key', 'consumer_secret')
)

In [60]:
#Authenticating using the previously loaded access tokens
# Build the OAuth handler from the consumer key pair, then attach the
# access token pair loaded from auth_dict.json.
auth = tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
# Module-wide API client used by CreateDataFrame.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait out rate
# limits itself, which would make limit_handled's own sleep a fallback - confirm.
api=tweepy.API(auth,wait_on_rate_limit=True)

## Importing the category list file to the dataframe

In [61]:
# Raw category word lists: one column per category.
cat_df = pd.read_csv('Categories.csv',encoding='ISO-8859-1')

# Lower-cased copy used by Classification for word lookups.
# The previous loop reset its insert position to 0 on every iteration, so each
# column was inserted in front of the earlier ones and the column order came
# out reversed; applying the transform column-wise keeps the CSV's order.
category_df = cat_df.apply(lambda column: column.str.lower())

## Code to handle rate limit errors

In [62]:
def limit_handled(cursor):
    """Yield items from ``cursor``, pausing when Twitter reports a rate limit.

    Args:
        cursor: any object exposing a ``next()`` method that returns the next
            item, raises ``StopIteration`` when exhausted, and may raise
            ``tweepy.RateLimitError``.

    Yields:
        Each item returned by ``cursor.next()``, in order.
    """
    # Local import: `time` is not imported at module level in this file, so
    # the rate-limit branch previously failed with NameError.
    import time

    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # Exhausted cursor. Inside a generator a leaked StopIteration is
            # converted to RuntimeError (PEP 479), so finish explicitly.
            return
        except tweepy.RateLimitError:
            print("Limit Exceeds. Sleeping now for 15 mins....")
            time.sleep(15 * 60)
            print("Trying again")
            continue
        yield item

## Creating dataframe for the input passed using pandas

In [63]:
def _clean_tweet_text(raw_text):
    """Strip URLs, @mentions, the retweet marker, #hashtags and non-ASCII characters."""
    text = re.sub(r"http\S+", "", raw_text)
    text = re.sub(r"@\S+", "", text)
    # Word boundaries keep the retweet marker from being cut out of the middle
    # of upper-case words such as "SMART".
    text = re.sub(r"\bRT\b", "", text)
    text = re.sub(r"#\S+", "", text)
    return "".join(char for char in text if ord(char) < 128)


def CreateDataFrame(hashtag, max_tweets=2000):
    """Search Twitter for ``hashtag`` and save the cleaned tweets to ``<hashtag>.csv``.

    Only English tweets are kept, and a tweet is skipped when its cleaned text
    was already collected. The CSV has two columns: ``tweets`` (cleaned text)
    and ``location`` (the author's profile location).

    Args:
        hashtag: search query; also used as the CSV file name stem.
        max_tweets: stop after collecting this many distinct tweets.

    Returns:
        True once the CSV has been written.
    """
    prevmaxid = 0
    seen_texts = set()   # O(1) duplicate check instead of scanning the list
    tweet = []
    location = []
    search = tweepy.Cursor(api.search, q=hashtag, tweet_mode='extended', lang='en').items()
    for status in limit_handled(search):
        # Only accept ids lower than the last accepted one.
        if prevmaxid != 0 and status.id >= prevmaxid:
            continue
        if status.lang != "en":
            continue
        text = _clean_tweet_text(status.full_text)
        if text in seen_texts:
            continue
        seen_texts.add(text)
        tweet.append(text)
        location.append(status.user.location)
        prevmaxid = status.id
        if len(tweet) >= max_tweets:
            break
    data = pd.DataFrame({"tweets": tweet, "location": location})
    data.to_csv(hashtag+'.csv',index=False)
    return True

## Function to load csv files of already collected tweets and related data

In [64]:
def LoadFromCsv(hashtag):
    """Load ``<hashtag>.csv`` and discard rows lacking a location or a tweet.

    Args:
        hashtag: file name stem; ``'.csv'`` is appended.

    Returns:
        DataFrame with a fresh 0..n-1 index containing only rows whose
        ``location`` and ``tweets`` values are both present.
    """
    loaded = pd.read_csv(hashtag + '.csv', engine="python")
    with_location = loaded.dropna(subset=['location'], how='all')
    with_text = with_location.dropna(subset=['tweets'], how='all')
    return with_text.reset_index(drop=True)

## Function to perform sentiment analysis on the loaded dataframe

In [65]:
def SentimentAnalysis(dfnoloc):
    """Label every tweet as 'positive' or 'negative' using VADER.

    Args:
        dfnoloc: DataFrame with a ``tweets`` column. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and two columns: ``tweets``
        (rows with missing text dropped) and ``category``, which is
        'positive' when the compound score is greater than 0 and 'negative'
        otherwise (a score of exactly 0 therefore counts as negative).
    """
    # Selecting with a list yields a new frame, so no deep copy of the input
    # is needed to keep the caller's data untouched.
    senti_df = dfnoloc[['tweets']].dropna().reset_index(drop=True)
    analyseSentiment = SentimentIntensityAnalyzer()
    # Score each row once. The earlier per-row np.where scan over the whole
    # column was quadratic and produced the same labels, because identical
    # texts always receive identical scores.
    senti_df['category'] = [
        'positive' if analyseSentiment.polarity_scores(text)['compound'] > 0.0 else 'negative'
        for text in senti_df['tweets']
    ]
    return senti_df

## Merger function to join the dataframe containing locations with the categorised data

In [66]:
def MergeDf(dfnoloc,sentiment_df):
    """Attach sentiment labels to the tweets and keep one row per location.

    Note that ``dfnoloc`` itself gains a ``category`` column as a side effect.

    Args:
        dfnoloc: DataFrame with a ``location`` column; must have as many rows
            as ``sentiment_df``.
        sentiment_df: DataFrame whose ``category`` column holds the labels,
            in the same row order as ``dfnoloc``.

    Returns:
        A new DataFrame with a fresh index, holding the first row seen for
        each distinct location.
    """
    labels = sentiment_df['category'].tolist()
    dfnoloc['category'] = labels
    one_per_location = dfnoloc.drop_duplicates(subset=['location'], keep='first')
    return one_per_location.reset_index(drop=True)

## Function to find the latitude and longitude from each tweet's location, since by default a lot of tweets lack this information

### Also removing duplicates to comply with geopy library restriction

In [67]:
def FindLocation(mergeddf_noloc, max_lookups=300):
    """Geocode the ``location`` column into latitude/longitude lists.

    Lookups are best effort: if the geocoder fails unexpectedly, the
    coordinates gathered so far are returned, so the result may be shorter
    than the input (UpdateLocation trims the frame to match).

    Args:
        mergeddf_noloc: DataFrame with a ``location`` column.
        max_lookups: maximum number of rows (from the top) to geocode.

    Returns:
        Tuple ``(lat, lon)`` of equal-length lists. An entry is the string
        "NA" in both lists when the lookup yielded no usable result.
    """
    lat=[]
    lon=[]
    # Slice up front instead of iterating over every remaining row once the
    # cap has been reached.
    for place in mergeddf_noloc['location'].iloc[:max_lookups]:
        try:
            location = geolocator.geocode(place,timeout=30)
            # Read both attributes before appending so the lists stay aligned.
            (a,b)=(location.latitude, location.longitude)
            lat.append(a)
            lon.append(b)
        except AttributeError:
            # geocode returned nothing usable for this place.
            lat.append("NA")
            lon.append("NA")
        except Exception as error:
            # Keep the partial-result behaviour, but say why we stopped and
            # let KeyboardInterrupt/SystemExit propagate (a bare except
            # swallowed them silently).
            print("Geocoding stopped early after %d lookups: %r" % (len(lat), error))
            return (lat,lon)
    return (lat,lon)

## Function to update the locations based on longitude and latitude

In [68]:
def UpdateLocation(mergeddf_noloc,coordinates):
    """Return the geocoded rows with ``latitude``/``longitude`` columns added.

    Only as many leading rows as there are latitude values are kept, so a
    coordinate list that is shorter than the frame trims the frame. The input
    frame is left untouched.

    Args:
        mergeddf_noloc: source DataFrame.
        coordinates: ``(lat, lon)`` tuple of equal-length lists.

    Returns:
        New DataFrame holding the first ``len(lat)`` rows plus the two
        coordinate columns.
    """
    lat, lon = coordinates
    geocoded_rows = mergeddf_noloc.iloc[:len(lat)]
    return geocoded_rows.assign(latitude=lat, longitude=lon)

## Function to run all the functions regarding data collection and do all the processing in single functionality

In [69]:
def dataprocess(hashtag):
    """Run the whole collection pipeline for one hashtag.

    Steps: download tweets to ``<hashtag>.csv``, reload them, classify the
    sentiment, keep one row per location, geocode those locations and write
    the result to ``<hashtag>withloc.csv``. Progress is printed along the way.

    Args:
        hashtag: search term, also used as the file name stem.
    """
    # Nothing to process unless the download step reports success.
    if CreateDataFrame(hashtag) != True:
        return

    tweets_df = LoadFromCsv(hashtag)
    print("loaded data set for sentiment analysis")

    labelled_df = SentimentAnalysis(tweets_df)
    print("Sentiment analysis completed successfully")

    per_location_df = MergeDf(tweets_df, labelled_df)
    located_df = UpdateLocation(per_location_df, FindLocation(per_location_df))
    print("Dataset updated successfully with classification and coordinates")

    located_df.to_csv(hashtag+'withloc.csv',index=False)
    print("data processing completed")

## Function to generate the results of the Sentiment Analysis using the previously saved datafiles

In [70]:
def Analysis(filename):
    """Produce all result plots from a previously saved ``<filename>.csv``.

    The nouns extracted from the tweets feed the category bar graph and the
    word cloud; the frame itself feeds the map scatter plot. Each plotting
    helper receives ``filename`` to name its output.

    Args:
        filename: CSV file name without the ``.csv`` extension.
    """
    located_df = pd.read_csv(filename + '.csv', engine="python")
    nouns = getWholeString(located_df)

    # nouns -> top n-grams -> category membership -> per-category percentages
    category_percentages = categoryWisePercent(Classification(getFrequencies(nouns)))

    plot_bargraph(category_percentages, filename)
    plot_scatterplot(located_df, filename)
    plot_wordcloud(nouns, filename)

## Function that takes the tweets as input and converts them into one continuous sequence of words

### It then generates tokens by removing the stopwords and filtering out the punctuation

In [71]:
def getWholeString(senti_df):
    """Collect the nouns found across all tweets, in order of appearance.

    Each tweet is tokenized; tokens are lower-cased and kept only when they
    are purely alphabetic, longer than one character and not an English stop
    word. The surviving tokens of all tweets are POS-tagged as one sequence
    and only those tagged as nouns are returned.

    Args:
        senti_df: DataFrame with a ``tweets`` column. It is not modified.

    Returns:
        List of lower-case noun strings (duplicates preserved).
    """
    # The corpus file id is lower-case; 'English' fails to resolve on
    # case-sensitive file systems.
    stop_words = set(stopwords.words('english'))
    all_text = []
    # Skip missing tweets: the tokenizer cannot handle NaN.
    for tweet_text in senti_df['tweets'].dropna():
        for token in word_tokenize(str(tweet_text)):
            # Lower-case before the stop-word test; the stop-word list is
            # lower-case, so "The"/"And" used to slip through.
            word = token.lower()
            if word.isalpha() and len(word) > 1 and word not in stop_words:
                all_text.append(word)

    # take only the nouns
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return [word for word, pos in pos_tag(all_text) if pos in noun_tags]

## Function to categorize the nouns text generated from previous function and getting the frequency of the words

In [72]:
def getFrequencies(all_nouns_text, top_n=50):
    """Tabulate the most frequent single words, bigrams and trigrams.

    Args:
        all_nouns_text: sequence of word strings, in text order.
        top_n: how many of the most common entries to keep per column.

    Returns:
        DataFrame with columns ``Single`` (strings), ``Bigrams`` and
        ``Trigrams`` (lists of 2 and 3 strings), each ordered from most to
        least frequent. Columns shorter than the longest one are padded with
        NaN.
    """
    tokens = list(all_nouns_text)

    def top_ngrams(size):
        # Sliding windows of `size` consecutive tokens, as tuples.
        windows = zip(*(tokens[offset:] for offset in range(size)))
        return [list(gram) for gram, _ in Counter(windows).most_common(top_n)]

    all_single = [token for token, _ in Counter(tokens).most_common(top_n)]

    # Build from a dict of Series so the row index is the union of all three.
    # Inserting the columns one by one aligned each later column to the first
    # column's index, which silently dropped n-grams whenever there were
    # fewer distinct single words than distinct bigrams/trigrams.
    return pd.DataFrame({
        'Single': pd.Series(all_single, dtype=object),
        'Bigrams': pd.Series(top_ngrams(2), dtype=object),
        'Trigrams': pd.Series(top_ngrams(3), dtype=object),
    })

## Function to get the token/word count for wordcloud generation

### This function also categorizes the words!

In [73]:
def Classification(topCount_df):
    """Look up every word of the frequency table in the category word lists.

    Args:
        topCount_df: DataFrame whose cells are a word (str), a list of words,
            or a filler value such as NaN (ignored). It is not modified.

    Returns:
        Tuple ``(word_count_dict, word_category_dict)``, both keyed by word in
        order of first appearance (row by row, left to right):
        ``word_count_dict[word]`` is the number of ``category_df`` columns
        containing the word and ``word_category_dict[word]`` lists those
        column names in ``category_df`` column order.
    """
    # create dictionaries for tracking word count & categories
    word_count_dict = {}
    word_category_dict = {}
    # itertuples reads each cell directly; the earlier `iloc[row][col]` relied
    # on positional integer indexing of a Series, which pandas deprecates.
    for row in topCount_df.itertuples(index=False, name=None):
        for cell in row:
            if isinstance(cell, str):
                words = [cell]
            elif isinstance(cell, list):
                words = cell
            else:
                continue
            for word in words:
                if word not in word_count_dict:
                    word_count_dict[word] = 0
                    word_category_dict[word] = []

    for column in category_df.columns:
        # One set per category instead of rescanning the column for each word.
        vocabulary = set(category_df[column].dropna())
        for word in word_count_dict:
            if word in vocabulary:
                word_count_dict[word] += 1
                word_category_dict[word].append(column)
    return (word_count_dict,word_category_dict)

## Function to get the percentage of words, classified based on emotions

In [74]:
def categoryWisePercent(frequency):
    """Summarise how the classified words are spread over the categories.

    Args:
        frequency: tuple ``(word_count_dict, word_category_dict)`` as returned
            by Classification. Neither dictionary is modified.

    Returns:
        DataFrame with one row per category, in order of first appearance,
        and the columns ``Category``, ``Words`` (number of words assigned to
        the category) and ``Percentage`` (that number as a share of all
        word-to-category assignments, 0-100). Empty when no word matched any
        category.
    """
    (word_count_dict,word_category_dict)=frequency

    # Tally in a plain dict (insertion-ordered) rather than updating the frame
    # cell by cell: `.at` is a scalar accessor and must not be given a list of
    # row positions, and the tally needs no DataFrame until the end.
    words_per_category = {}
    for word in word_count_dict:
        for category in word_category_dict[word]:
            words_per_category[category] = words_per_category.get(category, 0) + 1

    word_totals = list(words_per_category.values())
    total_word_count = sum(word_totals)

    percent_df = pd.DataFrame(
        {'Category': list(words_per_category), 'Words': word_totals},
        columns=['Category', 'Words'],
    )
    # With no categories the list is empty, so total_word_count == 0 never
    # reaches the division. The column is created as float from the start.
    percent_df['Percentage'] = [(count / total_word_count) * 100 for count in word_totals]
    return percent_df

## Function to generate the wordcloud 

In [75]:
def plot_wordcloud(wholestring,filename):
    """Render a mask-shaped word cloud and save it as ``<filename>.png``.

    Args:
        wholestring: iterable of word strings; each word's number of
            occurrences (after stripping surrounding spaces) is passed to the
            cloud as its frequency.
        filename: output file name without the ``.png`` extension.
    """
    # Tally the words by hand, preserving first-seen order.
    wordfreq = {}
    for raw_word in wholestring:
        word = raw_word.strip(" ")
        wordfreq[word] = wordfreq.get(word, 0) + 1

    cloud = WordCloud(
        stopwords=STOPWORDS,
        width=1600,
        height=800,
        background_color = 'white',
        max_words=200,
        colormap="Dark2",
        mask=image_mask,
    )
    cloud.generate_from_frequencies(frequencies=wordfreq,max_font_size=1600)

    plt.figure(figsize=(20,10), facecolor='k')
    # Grey-recoloured words first, then the mask image drawn faintly on top.
    plt.imshow(cloud.recolor(color_func=grey_color_func, random_state=3))
    plt.imshow(image_mask, alpha=0.3)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.savefig(filename+'.png', facecolor='k', bbox_inches='tight')
    plt.show()


## Function to show the categorised emotions in a bar graph

In [76]:
def plot_bargraph(percent_df,filename):
    """Draw the per-category percentages as a bar chart and save it.

    The chart is written to ``<filename>_bar.png`` and then shown.

    Args:
        percent_df: DataFrame with ``Category`` and ``Percentage`` columns
            (see categoryWisePercent). It is not modified.
        filename: output file name stem.
    """
    # The former "explode" bookkeeping was a pie-chart leftover: its result
    # was never used, and max() on an empty frame raised ValueError before
    # anything was drawn. It has been removed.
    labels = list(percent_df['Category'])
    values = list(percent_df['Percentage'])
    y_pos = np.arange(len(labels))
    colors=["#79d279","#ff4d4d","#99ccff"]

    # Explicit figure/axes so the bars never land on a stale figure left open
    # by an earlier plot.
    fig, axes = plt.subplots()
    axes.set_ylim([0,100])
    axes.bar(y_pos, values, align='center', alpha=0.5,color=colors)
    axes.set_xticks(y_pos)
    axes.set_xticklabels(labels)
    axes.set_ylabel('Percentage')
    axes.set_title('Emotions')
    fig.savefig(filename+'_bar.png')
    plt.show()
    plt.close(fig)

## Function to generate the scatter plot based on the polarity and location of the tweets

In [77]:
def plot_scatterplot(tempdf,name,api_key=None):
    """Plot tweet locations on a Google map, coloured by sentiment.

    Positive tweets are drawn in blue ("#0000cc"), everything else in red
    ("#e60000"). The plot is exported to ``<name>_map.png`` and then shown in
    the notebook.

    Args:
        tempdf: DataFrame with ``category``, ``latitude`` and ``longitude``
            columns.
        name: output file name stem.
        api_key: Google Maps API key. When omitted, the
            ``GOOGLE_MAPS_API_KEY`` environment variable is used, and only if
            that is unset does the legacy embedded key apply.
    """
    import os  # local import: `os` itself is not imported at module level

    map_options = GMapOptions( map_type="terrain",zoom=1)

    plot = GMapPlot(
        x_range=Range1d(), y_range=Range1d(), map_options=map_options
    )
    plot.title.text = "Hey look! It's a scatter plot on a map!"

    # For GMaps to function, Google requires you obtain and enable an API key:
    #
    #     https://developers.google.com/maps/documentation/javascript/get-api-key
    #
    # SECURITY(review): a key committed to source is effectively public. The
    # literal below is kept only so existing callers keep working; rotate it
    # and supply the replacement through `api_key` or GOOGLE_MAPS_API_KEY,
    # then delete this fallback.
    plot.api_key = (
        api_key
        or os.environ.get("GOOGLE_MAPS_API_KEY")
        or "AIzaSyDpIUAZYB_QZ_ePte4xa27HJzFtZYd0tVk"
    )

    colorvalue = [
        "#0000cc" if cat == "positive" else "#e60000"
        for cat in tempdf.category.tolist()
    ]

    source = ColumnDataSource(
        data=dict(
            lat=tempdf.latitude.tolist(),
            lon=tempdf.longitude.tolist(),
            color=colorvalue
        )
    )
    circle = Circle(x="lon", y="lat",size=6, fill_color={'field': 'color'}, fill_alpha=0.5, line_color=None)
    plot.add_glyph(source, circle)
    plot.add_tools(PanTool(), WheelZoomTool(), BoxZoomTool())
    output_notebook()
    export_png(plot, filename=name+'_map.png')
    show(plot)

## Function to call the previously defined functions for data collection and data analysis

In [78]:
def mainprocess(hashtag):
    """Collect, process and analyse tweets for the term held by a widget.

    Args:
        hashtag: object exposing a ``value`` attribute (e.g. a text widget)
            that contains the search term.
    """
    # Read the widget once: collection can run for a long time, and a second
    # read could return an edited value and point Analysis at a file that was
    # never produced.
    term = str(hashtag.value)
    dataprocess(term)
    Analysis(term+'withloc')