In [27]:
import time
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
# from textblob import TextBlob
import re
# import cld2

#### Define function 

In [28]:
import pickle
import os

#define a function that loads pickled lists; it returns a list from a pickled one
def load_pickle(path):
    """Load a pickled object from ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the pickle file.

    Returns
    -------
    object
        Whatever object was stored in the pickle (this notebook stores
        lists), or an empty list when ``path`` is not an existing regular
        file.

    Notes
    -----
    SECURITY: ``pickle.load`` can execute arbitrary code while
    deserialising. Only use this on pickle files you produced yourself.
    """
    # A missing file (or a directory) is treated as "no data" rather than an error.
    if not os.path.isfile(path):
        return []
    with open(path, 'rb') as fp:
        return pickle.load(fp)

#### Set Working Directories

In [29]:
# Import only the name that is actually used instead of a wildcard import,
# so it is obvious where `Path` comes from and the namespace stays clean.
from pathlib import Path

# Directory the notebook kernel is currently running in.
current_dir = Path.cwd()
# First parent of the working directory; the data paths used in this
# notebook are built relative to it.
Par1_dir = current_dir.parents[0]
Par1_dir

WindowsPath('C:/Users/USER/Desktop/CASS/Modules/2 - Term II/AI Project')

#### Load the list of locations resulting from Spacy's NER

In [30]:
# Pickle holding the location history that is analysed below.
loc_hist_path = Par1_dir.joinpath(
    'Python Env', 'DataFrames', 'AI_datasets', 'locations_ai_2013-01_2019-06.pkl'
)
# Deserialise the history via the notebook's pickle helper.
loc_hist = load_pickle(loc_hist_path)
# loc_hist

#### Detect the spikes in Locations per month

Identify locations whose number of mentions has increased significantly compared with the previous month:

- count the frequency of each location for 2 consecutive months x0 and x1
- if a location's mentions in x1 are more than 12 times its mentions in x0 (and it has more than 5 mentions in x1), add it to the list
- store the results as strings in the format "location: 1300%"; each month has a separate list of these strings

In [31]:
from collections import Counter

# A location is flagged as "spiking" when BOTH strict comparisons hold:
SPIKE_MIN_RATIO = 12     # current/previous mention ratio must exceed 12 (i.e. > 1200%)
SPIKE_MIN_MENTIONS = 5   # mentions in the current month must exceed 5

gpe_change = []

# Compare every month with the one directly before it.
# Each loc_hist entry is indexed as [0] -> date label, [1] -> iterable of
# location mentions (presumably a 'YYYY-MM-...' string and a list of
# strings; verify against the code that wrote the pickle).
for prev_entry, curr_entry in zip(loc_hist, loc_hist[1:]):
    # mention frequency of each location in the two months
    prev_counts = Counter(prev_entry[1])
    curr_counts = Counter(curr_entry[1])

    monthly_gpe_change = []
    for gpe, curr_n in curr_counts.items():
        # Locations absent from the previous month are skipped: there is
        # no baseline to compute a ratio against.
        if gpe not in prev_counts:
            continue
        gpec = curr_n / prev_counts[gpe]
        # Logical `and` (not bitwise `&`) for combining the two conditions.
        if gpec > SPIKE_MIN_RATIO and curr_n > SPIKE_MIN_MENTIONS:
            # Stored as the ratio expressed in percent, e.g. 13x -> "1300%".
            monthly_gpe_change.append(gpe + ": " + str(int(gpec * 100)) + "%")

    # Label the result with the first 7 characters of the later month's date.
    gpe_change.append([curr_entry[0][:7], monthly_gpe_change])

In [32]:
# gpe_change

Note: make sure to change the file name at the end of city_df_path

In [33]:
#make monthly cities data into a dataframe
# Make the monthly spike data into a DataFrame: column 0 holds the month
# label and becomes the index, column 1 holds that month's list of spike strings.
# The rows are handed to pandas directly: wrapping the ragged [str, list]
# rows in np.array() is an error on NumPy >= 1.24 (inhomogeneous shape).
# Explicit integer column labels keep the same 0/1 layout as before.
city_df = pd.DataFrame(gpe_change, columns=[0, 1]).set_index(0)
# Keep only the first row for each month label.
city_df = city_df.loc[~city_df.index.duplicated(keep='first')]
#output results to disk
city_df_path = Par1_dir / 'Python Env' /'DataFrames' / 'AI_datasets' / 'city_df_ai_2012-12-30_2019-07-01.csv'
# city_df.to_csv(city_df_path)

#### Import the cleaned location data

- the location data generated above is not clean, therefore it needs to be saved to disk
- cleaned manually using software such as Excel
- then imported again to be used

In [34]:
# Re-read the manually cleaned spike table and index it by its '0' column.
city_df_cleaned_path = Par1_dir.joinpath(
    'Python Env', 'DataFrames', 'AI_datasets', 'city_df_ai_2012-12-30_2019-07-01_cleaned.csv'
)
city_df = pd.read_csv(city_df_cleaned_path).set_index('0')

#### Import saved weekly Sentiment analysis results

In [40]:
# CSV with the saved weekly sentiment-analysis results.
filename = Par1_dir.joinpath(
    'Python Env', 'DataFrames', 'AI_datasets', 'weekly_sa_df_ai_2012-12-30_2019-07-01.csv'
)

# Load it as a pandas DataFrame and preview the first three rows.
weekly_df = pd.read_csv(filename)
weekly_df.head(3)

Unnamed: 0,wc,SA_pos,SA_neu,SA_neg
0,2012-12-30,46.3,43.6,10.1
1,2013-01-06,41.7,44.2,14.1
2,2013-01-13,42.7,45.5,11.8


#### Add the results of city_df to the weekly_df

To do so, some preprocessing is required: the monthly results need to be duplicated for every week that falls within that month.

In [36]:
#turn the monthly data into weekly data to fit with the SA weekly DF
# Expand the monthly data to weekly rows so it lines up with the weekly SA frame:
# each `wc` value is matched on its first 7 characters against the city_df
# index; rows without a match get an empty string.
cities_weekly = [
    city_df.loc[str(date)[:7]].item() if str(date)[:7] in city_df.index else ""
    for date in weekly_df.wc
]

# Parse the `wc` strings ('%Y-%m-%d') into datetime.date objects.
datetime_weeks = []
for week in weekly_df.wc:
    datetime_weeks.append(datetime.strptime(str(week), '%Y-%m-%d').date())

In [44]:
# Spot-check entries 5-9 of the weekly list built above.
cities_weekly[5:10]

["['Newyork: 1300%', 'US: 54000%', 'Edmonton: 1300%']",
 "['Newyork: 1300%', 'US: 54000%', 'Edmonton: 1300%']",
 "['Newyork: 1300%', 'US: 54000%', 'Edmonton: 1300%']",
 "['Newyork: 1300%', 'US: 54000%', 'Edmonton: 1300%']",
 "[ 'Ukraine: 1300%', 'Paris: 4800%', 'Galway: 1500%', 'Chengdu: 2400%', 'Syria: 1433%']"]

In [47]:
#add cities and top5 topics to DF
# Attach three columns to the weekly frame: the spike text per week, the
# parsed week date, and a constant 100 column.
weekly_df['cities'] = np.asarray(cities_weekly)
weekly_df['datetime'] = np.asarray(datetime_weeks)
weekly_df['SA_tot'] = [100] * len(weekly_df)

In [48]:
# Preview the first three rows, now including the cities, datetime and SA_tot columns.
weekly_df.head(3)

Unnamed: 0,wc,SA_pos,SA_neu,SA_neg,cities,datetime,SA_tot
0,2012-12-30,46.3,43.6,10.1,,2012-12-30,100
1,2013-01-06,41.7,44.2,14.1,,2013-01-06,100
2,2013-01-13,42.7,45.5,11.8,,2013-01-13,100


In [38]:
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource,LinearColorMapper,BasicTicker, PrintfTickFormatter, ColorBar
# Configure Bokeh to display plots inline in the notebook.
output_notebook()


In [39]:
from bokeh.models import Legend
TOOLS = 'save,pan,box_zoom,reset,wheel_zoom'

# Figure canvas: datetime x-axis, linear y-axis.
p = figure(
    title="Sentiment Analysis and Top Locations for AI Tweets",
    y_axis_type="linear",
    x_axis_type='datetime',
    plot_height=600,
    tools=TOOLS,
    plot_width=1200,
)
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = 'Percent of Weekly Data(%)'

# Black "Total" line drawn from the SA_tot column; its hover tool shows the
# week and that week's cities text.
line = p.line(x='datetime', y='SA_tot', line_color='black', line_width=2, source=weekly_df)

# Separate legend holding only the "Total" entry, attached on the right side.
legend = Legend(items=[("Total", [line])], location=(-98, 370))
p.add_layout(legend, 'right')

hover = HoverTool(tooltips=[('Date', '@wc'), ('Cities', '@cities')], renderers=[line])
p.add_tools(hover)

# One thin line per sentiment column (SA_pos / SA_neu / SA_neg), each with
# its own hover tool showing the week and the y value under the cursor.
sentiments = [
    ('pos', 'green', 'Positive'),
    ('neu', 'blue', 'Neutral'),
    ('neg', 'red', 'Negative'),
]
for suffix, colour, label in sentiments:
    line = p.line(x='datetime', y='SA_' + suffix, line_color=colour, line_width=1,
                  source=weekly_df, legend=label)
    hover = HoverTool(tooltips=[('Date', '@wc'), ('Sentiment', '$y')], renderers=[line])
    p.add_tools(hover)


# output_path = Par1_dir / 'Pout' / 'line_chart_AI_2019-06.html'
# output_file(output_path , title="Line Chart")
show(p)