## Introduction

Imports

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from fastai import *
from fastai.text.all import *
from datetime import datetime

set_seed(42)

Set paths and create dataframes for the sentiment and US politicians datasets

In [None]:
# Kaggle input directories for the three datasets used in this notebook.
input_root = Path('../input')
us_path = input_root/'us-politicians-twitter-dataset'
sent_path = input_root/'sentiment140'
tweets_path = input_root/'ustweetssent'

# The sentiment140 CSV ships without a header row, so column names are supplied here.
sent_columns = ['sentiment', 'ID', 'Date', 'Flag', 'User', 'text']

us_df = pd.read_csv(us_path/'dataset.csv', encoding='latin-1')
sent_df = pd.read_csv(
    sent_path/'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=sent_columns,
)
# This file is the output of the `to_csv('us_tweets_sent_v2.csv', encoding='utf-8', ...)`
# cell further down, so it has to be decoded as UTF-8; reading it as latin-1
# silently garbles every non-ASCII character in the tweet text.
tweets_df = pd.read_csv(tweets_path/'us_tweets_sent_v2.csv', encoding='utf-8', parse_dates=['time'])

Display the first 5 rows of each dataset.

In [None]:
us_df.head()

In [None]:
sent_df.head()

In [None]:
sent_df.shape

In [None]:
sent_df['sentiment'].value_counts()

The sentiment140 dataset (sent_df) has 1.6 million rows of data. When training with this set, the memory limit on Kaggle was reached, so a random sample of 800 000 tweets has been collected. Unwanted columns have been dropped and positive sentiment has been changed from a value of 4 to 1, and negative from 0 to -1.

In [None]:
# Draw the 800k-row working sample and keep only the label and the tweet text.
sent_df_sample = (
    sent_df
    .sample(n=800000)
    .drop(columns=['ID', 'Date', 'Flag', 'User'])
)
# Relabel: 4 (positive) -> 1, 0 (negative) -> -1.
sent_df_sample['sentiment'] = sent_df_sample['sentiment'].replace({4: 1, 0: -1})

In [None]:
sent_df_sample['sentiment'].value_counts()

## Language Model

Fine-tune a language model pretrained on WikiText-103 on the sentiment tweets dataset. The language model learns to predict the next word of a sentence. Its encoder is then reused to train a sentiment classifier. This methodology is known as the ULMFiT approach, and improves accuracy.

In [None]:
dls_lm = TextDataLoaders.from_df(sent_df_sample, text_col='text', is_lm=True)

In [None]:
dls_lm.show_batch(max_n=2)

Create model

In [None]:
learn = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.3, metrics=[accuracy, Perplexity()]).to_fp16()

Fit for one epoch whilst frozen.

In [None]:
learn.fit_one_cycle(1, 2e-2)

Unfreeze model and train for 4 epochs, which was found to be the optimal number from previous tests. Save weights.

In [None]:
learn.unfreeze()
learn.fit_one_cycle(4, 2e-3)

In [None]:
learn.save_encoder('finetuned_800k')

## Classification Model

In [None]:
# Tokenise with the language model's vocabulary and sequence length so the
# fine-tuned encoder can be loaded into the classifier.
tweet_text_block = TextBlock.from_df('text', seq_len=dls_lm.seq_len, vocab=dls_lm.vocab)

sentiment_datablock = DataBlock(
    blocks=(tweet_text_block, CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('sentiment'),
    splitter=RandomSplitter(),
)
dls_class = sentiment_datablock.dataloaders(sent_df_sample, bs=64)

dls_class.show_batch(max_n=2)

Copy the previously trained weights into the `models` directory, which is where fastai expects to find them.

In [None]:
if not os.path.exists('models'):
        os.makedirs('models')
!cp '../input/tweets-languagemodel/finetuned_comb4.pth' 'models/finetuned_comb4.pth'
!cp '../input/tweets-languagemodel/classifier.pth' 'models/classifier.pth'

Create model and load encoder weights from language model.

In [None]:
learn = text_classifier_learner(dls_class, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()

In [None]:
learn.load_encoder('finetuned_800k')

In [None]:
learn.fit_one_cycle(1, 2e-2)

In [None]:
# Gradual unfreezing, step 1: freeze_to(-2) is expected to leave the last two
# parameter groups trainable (see fastai's Learner.freeze_to).
learn.freeze_to(-2)
# One epoch with a learning-rate slice from 1e-2/2.6**4 up to 1e-2.
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

In [None]:
# Gradual unfreezing, step 2: open one more parameter group and halve the
# upper learning rate (5e-3 instead of 1e-2).
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

In [None]:
# Final step: unfreeze the whole model and train for two epochs at the lowest
# learning-rate range used in this schedule (up to 1e-3).
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

Final accuracy of 83.6% achieved. Ideally should be higher, especially when compared to models trained on IMDb datasets, but I have not been able to find a dataset of tweets with three labels of positive, negative and neutral, with an adequate size for training.

In [None]:
learn.save('classifier_800k')

In [None]:
# NOTE(review): this loads 'classifier' (the weights copied into models/ from
# the tweets-languagemodel input dataset), not the 'classifier_800k' weights
# saved in the previous cell, so the training done above is replaced.
# Presumably intentional for sessions that skip training - confirm.
learn.load('classifier')

## Collecting Tweets from US Politicians

In [None]:
!pip install tweepy
import tweepy

In [None]:
# Twitter API credentials are read from environment variables so that no secret
# is stored in the notebook. The keys that used to be hardcoded in this cell
# have been published with the notebook and must be revoked and regenerated.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
access_key = os.environ.get('TWITTER_ACCESS_KEY')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET')
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')

# Only warn (rather than raise): the analysis sections further down work from
# the saved CSV and do not need API access.
_missing_credentials = [
    env_name
    for env_name, value in [
        ('TWITTER_CONSUMER_KEY', consumer_key),
        ('TWITTER_CONSUMER_SECRET', consumer_secret),
        ('TWITTER_ACCESS_KEY', access_key),
        ('TWITTER_ACCESS_SECRET', access_secret),
    ]
    if not value
]
if _missing_credentials:
    print('Twitter credentials not set: ' + ', '.join(_missing_credentials)
          + ' - the tweet collection cells will fail until they are provided.')

In [None]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [None]:
# tweepy 4 renamed TweepError to TweepyException; use whichever this install provides.
_TweepyError = getattr(tweepy, 'TweepyException', None) or tweepy.TweepError

# Collect up to 40 recent tweets per politician as plain records and build the
# DataFrame once at the end (row-by-row DataFrame.append is quadratic and no
# longer exists in pandas 2).
tweet_records = []
for username in us_df['Twitter_username']:
    try:
        statuses = api.user_timeline(screen_name=username, count=40, tweet_mode="extended")
    except _TweepyError:
        print("Failed to run the command on that user, Skipping...")
        # Actually skip: without this the loop would reuse the previous user's
        # tweets under this username (or hit a NameError on the first user).
        continue
    tweet_records.extend(
        {'Twitter_username': username, 'text': status.full_text, 'time': status.created_at}
        for status in statuses
    )

us_tweets_df = pd.DataFrame(tweet_records, columns=['Twitter_username', 'text', 'time'])
us_tweets_df.head()

In [None]:
us_tweets_df.to_csv('us_tweets', encoding='utf-8', index=False)

## Analysing tweets

Applying inference to the newly collected dataset, with the pre-trained sentiment classifier.

Testing on a batch of 100 tweets, the model took about 16 seconds to classify them all. This suggests that it would take about 15000 seconds to classify all the tweets, or more than 4 hours - let's go...

In [None]:
us_tweets_df.head()

In [None]:
pred_dl = dls_class.test_dl(us_tweets_df['text'])

In [None]:
preds = learn.get_preds(dl=pred_dl)

In [None]:
preds

`preds` contains the final activations (probabilities) for each class. The predicted class is the one with the largest probability. 
0 = negative sentiment, 1 = positive sentiment.

In [None]:
# preds[0] holds the per-class activations; argmax over the last dimension
# gives the index of the strongest class for every tweet (0 or 1).
us_tweets_df['sentiment'] = preds[0].argmax(dim=-1)

In [None]:
us_tweets_df.head()

In [None]:
us_tweets_df['sentiment'].value_counts()

In [None]:
us_tweets_df.to_csv('us_tweets_sent_v2.csv', encoding='utf-8', index=False)

`tweets_df` holds the same data as `us_tweets_df`: the DataFrame was reloaded from the saved CSV in a new session, hence the new name.

In [None]:
tweets_df.head(1)

In [None]:
tweets_df['date'] = tweets_df['time'].dt.date

In [None]:
tweets_df.head(1)

In [None]:
print(tweets_df.date.min())
print(tweets_df.date.max())

In [None]:
tweets_df['date'].iloc[0]

In [None]:
# Lower bound of the analysis window: 1 December 2020, as a plain date so it
# compares against the values in the 'date' column.
compare_date = datetime(2020, 12, 1).date()
compare_date

In [None]:
tweets_df_21 = tweets_df.loc[tweets_df['date'] >= compare_date]

In [None]:
# Distinct calendar dates in the filtered tweets, in ascending order; used for
# the x-axis ticks of the plot below. De-duplicating before sorting makes the
# extra up-front sort unnecessary.
dates = sorted(set(tweets_df_21['date']))
dates

In [None]:
# Mean predicted sentiment per day, restricted to tweets on or after compare_date.
# Grouping the filtered frame directly yields the same groups as grouping the
# full frame by the filtered 'date' series (unmatched rows had NaN keys and
# were dropped), but states the intent.
daily_sentiment = tweets_df_21.groupby('date')['sentiment'].mean()

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(daily_sentiment)
# Label every fifth date; rotation must be numeric (a string such as '60' is
# rejected by current matplotlib).
plt.xticks(dates[::5], rotation=60)
plt.minorticks_on()
# The visibility flag is passed positionally: its keyword was renamed from
# `b` to `visible`, so this form works on both old and new matplotlib.
plt.grid(True, which='major', color='#666666', linestyle='-')
plt.grid(True, which='minor', color='#666666', linestyle=':')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.show()

In [None]:
# Join each tweet with its author's profile via the shared Twitter username,
# discard rows with any missing value, and restore the username as a column.
tweets_by_user = tweets_df.set_index('Twitter_username', drop=True)
politicians_by_user = us_df.set_index('Twitter_username', drop=True)
Both_DFs = (
    pd.merge(tweets_by_user, politicians_by_user, left_index=True, right_index=True)
    .dropna()
    .reset_index()
)

In [None]:
Both_DFs.head()

In [None]:
Both_DFs['Political_party'].value_counts()

In [None]:
# Keep only the four parties compared in the chart below.
parties_of_interest = [
    'Democratic Party',
    'Republican Party',
    'Green Party of the United States',
    'Libertarian Party',
]
df_x = Both_DFs.loc[Both_DFs['Political_party'].isin(parties_of_interest)]

In [None]:
# Bar chart of the mean predicted sentiment per political party.
fig, ax = plt.subplots(figsize=(20, 10))
party_sentiment = df_x.groupby('Political_party')['sentiment'].mean()
party_sentiment.plot.bar(rot=0, ax=ax)
plt.show()

In [None]:
# Bar chart of the mean predicted sentiment per value of the 'Sex' column.
fig, ax = plt.subplots(figsize=(20, 10))
sex_sentiment = Both_DFs.groupby('Sex')['sentiment'].mean()
sex_sentiment.plot.bar(rot=0, ax=ax)
plt.show()

In [None]:
# Mean predicted sentiment per age, drawn as unconnected markers.
fig, ax = plt.subplots(figsize=(20, 10))
age_sentiment = Both_DFs.groupby('Age')['sentiment'].mean()
age_sentiment.plot(linestyle='', marker='o', rot=0, ax=ax)
plt.show()

In [None]:
# Bar chart of the mean predicted sentiment per birthplace; the tick labels are
# turned vertical afterwards so they stay readable.
fig, ax = plt.subplots(figsize=(20, 10))
birthplace_means = Both_DFs.groupby('Birthplace')['sentiment'].mean()
birthplace_means.plot.bar(rot=0, ax=ax)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
from mpl_toolkits.basemap import Basemap

In [None]:
from itertools import chain

def draw_map(m, scale=0.2):
    """Render a shaded-relief background with a faint white lat/lon grid.

    Parameters
    ----------
    m : map object
        Must provide ``shadedrelief``, ``drawparallels`` and ``drawmeridians``
        (a Basemap instance in this notebook).
    scale : float, optional
        Forwarded to ``m.shadedrelief``.
    """
    m.shadedrelief(scale=scale)

    # Both calls return a dictionary; the first element of every value is the
    # collection of line artists drawn for that parallel/meridian.
    parallels = m.drawparallels(np.linspace(-90, 90, 13))
    meridians = m.drawmeridians(np.linspace(-180, 180, 13))

    # Restyle every grid line: solid, white, mostly transparent.
    for grid in (parallels, meridians):
        for entry in grid.values():
            for grid_line in entry[0]:
                grid_line.set(linestyle='-', alpha=0.3, color='w')

In [None]:
birthplace_sent = Both_DFs.groupby(Both_DFs['Birthplace'])["sentiment"].mean()

In [None]:
birthplace_sent

In [None]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='nlp')

In [None]:
def geolocate(city=None, country=None):
    """Return ``(latitude, longitude)`` for a place, or ``np.nan`` if the lookup fails.

    Parameters
    ----------
    city : str, optional
        When given, the query sent to the geocoder is ``"<city>,<country>"``.
    country : str, optional
        Used on its own as the query when ``city`` is None.

    Returns
    -------
    tuple of (float, float) or float
        The coordinates reported by the module-level ``geolocator``, or
        ``np.nan`` when the query cannot be built, the geocoder raises, or no
        match is found.
    """
    try:
        query = country if city is None else str(city + ',' + country)
        loc = geolocator.geocode(query)
        if loc is None:
            # The geocoder found no match for the query.
            return np.nan
        return (loc.latitude, loc.longitude)
    except Exception:
        # Best-effort lookup: any ordinary failure yields NaN. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long geocoding run can still be interrupted.
        return np.nan

In [None]:
# Geocode every birthplace in the index of birthplace_sent. The two lists stay
# aligned with that index so they can be plotted against the sentiment values.
lat = []
long = []
for country in birthplace_sent.index:
    coords = geolocate(country=country)
    if isinstance(coords, tuple):
        place_lat, place_long = coords
    else:
        # geolocate returns a scalar NaN when the lookup fails; indexing it
        # would raise TypeError, so record NaN for both coordinates instead.
        place_lat = place_long = np.nan
    lat.append(place_lat)
    long.append(place_long)

In [None]:
# World map of the mean sentiment per birthplace, drawn as scaled markers.
fig = plt.figure(figsize=(30, 15), edgecolor='w')

# Cylindrical projection covering the whole globe (-90..90 lat, -180..180 lon).
m = Basemap(projection='cyl', resolution=None,
            llcrnrlat=-90, urcrnrlat=90,
            llcrnrlon=-180, urcrnrlon=180)

# One marker per birthplace; marker area is (mean sentiment * 70) squared.
# NOTE(review): cmap is given without a `c` argument, so it presumably has no
# effect on the marker colour - confirm whether colouring by sentiment was intended.
m.scatter(long, lat, latlon=True, s=(birthplace_sent*70)**2,
          cmap='Reds', alpha=0.5)

# Empty scatters act as legend entries showing the marker size for the
# reference sentiment values 0.25, 0.5 and 1.
for a in [0.25, 0.5, 1]:
    plt.scatter([], [], c='k', alpha=0.5, s=(a*70)**2,
                label=str(a))
plt.legend(scatterpoints=1, frameon=False,
           labelspacing=5, loc='lower left', borderpad = 5);

# Add the shaded-relief background and the grid lines.
draw_map(m)