In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from textwrap import wrap
import seaborn as sns
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, which also hides
# deprecation warnings raised by the libraries used below — consider a narrower filter.
warnings.filterwarnings("ignore")
#print(os.listdir("../input"))
import datetime as dt
import matplotlib.colors

# fastai
from fastai.text.all import *

# set dpi for better visuals (global matplotlib defaults, applied to every figure below)

plt.rcParams['figure.dpi'] = 150
plt.rcParams["font.family"] = "monospace"

# **NLP with FastAI**

I cannot recommend the FastAI course highly enough. It's a great entry point into Neural Networks and Deep Learning in general. Many of the explanations I provide below are snippets from the course.

You can find more information here:

https://www.fast.ai/

and the course here:

https://course.fast.ai/

I recommend using Google Colab for the course, rather than a Jupyter notebook.

# **Project: Vaccine Sentiment Analysis**

# Project Plan

The purpose of this notebook is to practice using FastAI. To do so, I will perform **vaccine sentiment analysis**.


**I will be following the FastAI course workbook**, lecture 10, but will apply it to the Covid vaccine tweets dataset.

**Many of the explanations I give will be directly taken from the lecture notes.** Please do follow the links above to find out more about the course.

I will also demonstrate how we can **generate our own tweets** using FastAI.

Next, I will visualise if/how vaccine sentiment has changed over time. For this, I will pay particular interest to Johnson & Johnson and Oxford/AstraZeneca, as both have had some negative press lately, related to very rare side effects.


# References

Other than the FastAI course linked above, a fantastic notebook that follows the same course is:

https://www.kaggle.com/twhelan/covid-19-vaccine-sentiment-analysis-with-fastai/

This notebook really helped me out and inspired me to make my own version. Check it out if you have the chance.


In [None]:
# Root of the Kaggle input directory; the dataset CSVs below are read relative to it.
# `Path` is not imported explicitly — it is in scope through the fastai star import.
path = Path('/kaggle/input/')
# Show what is available directly under the input directory.
path.ls()

# Loading the data

In [None]:
# Vaccine tweets: the data whose sentiment we want to predict (no labels yet).
vaccine_tweets = pd.read_csv(path/'all-covid19-vaccines-tweets/vaccination_all_tweets.csv')
# General tweets with sentiment labels: used further down to train the classifier.
tweets = pd.read_csv(path/'complete-tweet-sentiment-extraction-data/tweet_dataset.csv')

# A quick overview of our dataset

In [None]:
# (rows, columns) of the raw vaccine tweets
vaccine_tweets.shape

In [None]:
# Peek at a single row to see which columns the vaccine dataset provides
vaccine_tweets.head(1)

In [None]:
# First rows of the labelled sentiment dataset
tweets.head(3)

In [None]:
# Number of missing values per column in the labelled dataset
tweets.isnull().sum()

In [None]:
# Code via https://www.kaggle.com/garyongguanjie/comments-analysis
def de_emojify(inputString):
    """Return `inputString` with every non-ASCII character (emojis included) removed."""
    return inputString.encode('ascii', 'ignore').decode('ascii')

# Code via https://www.kaggle.com/pawanbhandarkar/generate-smarter-word-clouds-with-log-likelihood
def tweet_cleaner(df, text_col='text'):
    """Strip Twitter noise from a text column and drop tweets left without content.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets. It is not modified; a cleaned copy is returned.
    text_col : str, default 'text'
        Name of the column that holds the tweet text.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` (original index labels kept) in which `text_col` has had
        handles, URLs, non-ASCII characters and hashtags removed, in that order,
        and the untouched text is preserved in a new `orig_text` column.
        Rows whose text is missing, or empty / whitespace-only after cleaning,
        are dropped.
    """
    import re  # local import: do not rely on a star import elsewhere leaking `re`

    # Missing text cannot be cleaned; copying keeps the caller's frame untouched.
    cleaned = df.dropna(subset=[text_col]).copy()
    cleaned['orig_text'] = cleaned[text_col]

    def scrub(text):
        text = re.sub(r'@[^\s]+', '', text)   # twitter handles
        text = re.sub(r"http\S+", "", text)   # URLs
        text = de_emojify(text)               # emojis / any other non-ASCII character
        return re.sub(r'\B#\S+', '', text)    # hashtags (after the ASCII pass, as before)

    cleaned[text_col] = cleaned[text_col].apply(scrub)

    # A tweet made only of handles/URLs/hashtags leaves the separating spaces behind,
    # not '', so decide emptiness on the stripped text (the stored text is not stripped).
    has_content = cleaned[text_col].str.strip() != ''
    return cleaned[has_content]

In [None]:
# Rename columns for usability / aesthetics. The function above refers to 'text' column too, which we can overwrite, but this looks nicer
# This will also allow us to append the data later
tweets = tweets[['old_text','new_sentiment']].rename(columns={'old_text':'text','new_sentiment':'sentiment'})

# Add in a column in our df for sentiments - for now this will be a nan value, this is what we will predict!
vaccine_tweets['sentiment'] = np.nan

# Run our data through the functions we created above
# NOTE(review): `tweets` and `vaccine_tweets` are reassigned to their cleaned versions, so this
# cell is not idempotent — re-running it without reloading the CSVs fails on the first line,
# because the 'old_text' / 'new_sentiment' columns no longer exist.
tweets = tweet_cleaner(tweets)
vaccine_tweets = tweet_cleaner(vaccine_tweets)

# Preparing our data for FastAI

In [None]:
# Language-model corpus: the labelled tweets followed by the (unlabelled) vaccine tweets.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` builds the same frame and,
# like `append`, keeps the index labels of both inputs.
df_lm = pd.concat([tweets[['text', 'sentiment']], vaccine_tweets[['text', 'sentiment']]])
# Classifier data: only the rows that actually carry a sentiment label.
df_clas = df_lm.dropna(subset=['sentiment'])
print(len(df_lm), len(df_clas))

In [None]:
# Class balance of the labelled data
df_clas['sentiment'].value_counts()

# Creating the Language Model

In [None]:
# DataLoaders for language-model training (is_lm=True) over every tweet in df_lm;
# 10% of the rows are held out for validation.
# NOTE(review): no seed is passed, so the train/validation split presumably differs between runs.
dls_lm = TextDataLoaders.from_df(df_lm,
                                 text_col='text',
                                 is_lm=True, 
                                 valid_pct=0.1)

Let's look at what we've created so far

In [None]:
# Display up to three samples from a language-model batch
dls_lm.show_batch(max_n=3)

In [None]:
# Language-model learner on the AWD_LSTM architecture, tracking accuracy and perplexity,
# switched to half precision with to_fp16().
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, 
    metrics=[accuracy, Perplexity()]).to_fp16()

# Now let's fit the model

We'll fine tune this later, but for now I want to see where we are initially.

This is often a good idea, as once you have a baseline you can ask yourself a number of questions:

 * Is this score good enough already?

 * Do we have time to train more models?

 * Is optimization necessary?

These are all questions that may or may not provide direction in your specific domain.

A fantastic course relevant to my point above is **"Structuring Machine Learning Projects"** by **Andrew Ng** on Coursera.

It focuses on the practical realities of ML tools & products, optimization, and setting a baseline, and it is a fantastic resource.

Here's a link:

https://www.coursera.org/learn/machine-learning-projects

In [None]:
# Baseline: one epoch with a maximum learning rate of 2e-2
learn.fit_one_cycle(1, 2e-2)

In [None]:
# Checkpoint the baseline model under the name '1epoch'
learn.save('1epoch')

# Improving the model

For the model above, I ran FastAI's default learning rate.

To try to improve the model, we can seek to **optimize the learning rate**, rather than just taking the default option.

In [None]:
# Make all layers trainable, then run the learning-rate finder to choose a rate for fine-tuning
learn.unfreeze()
learn.lr_find()

# Interpreting the learner

Ideally, we want to select a learning rate where the loss is still decreasing, but not too close to where it begins to increase.

The plot above appears pretty horizontal, until a sharp increase at approx. 10^-1.

It looks as though there might be a slight decrease between 10^-4 and 10^-3 - but it is extremely slight. I will select 10^-4 for my new learning rate, but I don't expect much of an improvement on the model above.

In [None]:
# Five more epochs on the unfrozen model with the learning rate picked from the finder plot
learn.fit_one_cycle(5, 1e-4)

So, by changing the learning rate **we managed to improve our model** fairly significantly!

The loss on both the training set & validation set is still decreasing too, so more epochs might have led to an even higher accuracy score.

In [None]:
# Full fine-tuned language model
learn.save('model2')

# Encoder only; it is loaded into the classifier further down via load_encoder('model_finetuned')
learn.save_encoder('model_finetuned')

Above, I saved the full model, and then saved all of our model except the final layer that converts activations to probabilities of picking each token in our vocabulary. 

The model not including the final layer is called the **encoder**

**This completes this phase of the text classification process**: fine-tuning the language model. 

We can now use it to fine-tune a classifier for our tweets.

But first, an **interesting detour...**

# Synthetic Text Generation with AI

Things get even more interesting now. 

Since our model is attempting to predict the next word of a sentence, we can actually use the model to **generate our own synthetic sentences** or, in this case, tweets.

Feel free to **try it out...**

In [None]:
# Generate synthetic tweets: each sample starts from the prompt `Text` and is extended
# by the language model.
Text = "The vaccine"
Number_of_words = 15     # limit the words in the sentence
Number_of_sentences = 2  # how many sentences/samples
preds = [learn.predict(Text, Number_of_words, temperature=0.75) # temperature is an element of randomness so we don't get the same predictions
         for _ in range(Number_of_sentences)]

# One generated sample per line
print("\n".join(preds))

# Building the Classifier

In [None]:
# Classifier DataLoaders built from the labelled tweets only (df_clas): text in, sentiment category out.
# The text block reuses the language model's vocab and sequence length
# (presumably so the fine-tuned encoder loaded later sees the same token ids — confirm).
# NOTE(review): RandomSplitter() is given no seed, so the validation split presumably varies between runs.
dls_clas = DataBlock(
    blocks = (TextBlock.from_df('text', seq_len=dls_lm.seq_len, vocab=dls_lm.vocab), CategoryBlock),
    get_x=ColReader('text'),
    get_y=ColReader('sentiment'),
    splitter=RandomSplitter()
).dataloaders(df_clas, bs=128, seq_len=72)

In [None]:
# Display up to three (text, sentiment) samples from a classifier batch
dls_clas.show_batch(max_n=3)

In [None]:
# Classifier learner on the same AWD_LSTM architecture.
# Note: this rebinds `learn`, replacing the language-model learner used above.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, 
                                metrics=accuracy).to_fp16()

In [None]:
# Load the encoder saved after language-model fine-tuning ('model_finetuned') into the classifier
learn = learn.load_encoder('model_finetuned')

# Fine Tuning the Classifier

The last step is to train with discriminative learning rates and *gradual unfreezing*. In computer vision we often unfreeze the model all at once, but for NLP classifiers, we find that unfreezing a few layers at a time makes a real difference:

In [None]:
# First pass: one epoch at 2e-2 before any layers are explicitly unfrozen
learn.fit_one_cycle(1, 2e-2)

We can pass -2 to 'freeze_to' to freeze all except the last two parameter groups:

In [None]:
# Train the last two parameter groups; slice(lo, hi) supplies the learning-rate range
# used for discriminative learning rates (lo = hi / 2.6**4).
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

And unfreeze again and run some more

In [None]:
# Train the last three parameter groups, with a lower learning-rate range than the previous step
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

And now we'll unfreeze the entire model

In [None]:
# Whole model trainable: three epochs at the lowest learning-rate range of the schedule
learn.unfreeze()
learn.fit_one_cycle(3, slice(1e-3/(2.6**4),1e-3))

In [None]:
# Checkpoint the trained classifier
learn.save('final_classifier')

Our model has an **accuracy of nearly 77%** - pretty good!

What does this mean? 

It means that we can **correctly predict tweet sentiment in 77% of cases**

Let's **apply this to our COVID tweets** now...

In [None]:
# Build an unlabelled test DataLoader from the cleaned vaccine tweet text, derived from dls_clas
predictions_dl = dls_clas.test_dl(vaccine_tweets['text'])
# Run the classifier over it; element [0] of the result is used below as the per-class scores
predictions = learn.get_preds(dl=predictions_dl)

In [None]:
# Raw output of get_preds
predictions

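We convert these per-class scores into a single predicted class for each tweet by taking the index of the largest value: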

In [None]:
# Index of the highest-scoring class for every tweet
predictions[0].argmax(dim=-1)

We now fill the Sentiments column that we made earlier...

In [None]:
# Store the predicted class index (argmax over the last dimension) for every vaccine tweet
vaccine_tweets['sentiment'] = predictions[0].argmax(dim=-1)

# Map results to text 
# NOTE(review): the index -> label mapping is hard-coded; presumably it mirrors the class
# order in dls_clas.vocab — confirm before trusting the labels.

vaccine_tweets['sentiment'] = vaccine_tweets['sentiment'].map({0:'Negative', 1:'Neutral', 2:'Positive'})

# Convert dates
# (adds a separate 'Date' column of date objects; the 'date' column itself is left as loaded)
vaccine_tweets['Date'] = pd.to_datetime(vaccine_tweets['date'], errors='coerce').dt.date

# Save to csv
vaccine_tweets.to_csv('vaccine_tweets.csv')

# Exploration of our results

First, let's view how many of each category there are

In [None]:
# Bar chart: number of vaccine tweets per predicted sentiment.
# `background_color` is reused by every figure further down.
background_color = '#f5f8fa'

fig = plt.figure(figsize=(4, 4), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0, hspace=0)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

# Helper column of ones; later cells also aggregate on 'Count'
vaccine_tweets['Count'] = 1
temp = vaccine_tweets.groupby('sentiment')['Count'].sum().sort_values(ascending=False)

ax0.bar(temp.index, temp, width=0.5, edgecolor='black',linewidth=0.6, color='#1da1f2')

# Write each count (with thousands separators) in the middle of its bar
for i in temp.index:
    ax0.annotate(f"{format(round(temp[i]), ',')}", xy=(i, temp[i]/2),color='white', va='center', ha='center', fontweight='light')

ax0.grid(axis='y', linestyle='-', alpha=0.4)   
ax0.set_yticks([])
ax0.tick_params(axis=u'both', which=u'both',length=0)

# Keep only the bottom spine
for s in ['top', 'left', 'right']:
    ax0.spines[s].set_visible(False)
    

## For picture 

from matplotlib.offsetbox import AnnotationBbox, OffsetImage
def offset_png(x, y, path, ax, zoom, offset):
    """Place the image stored at `path` on `ax`, anchored at data point (x, y).

    `zoom` scales the image and `offset` shifts it horizontally, in points,
    away from the anchor.
    Source: https://stackoverflow.com/questions/61971090/how-can-i-add-images-to-bars-in-axes-matplotlib
    """
    icon = OffsetImage(plt.imread(path), zoom=zoom)
    icon.image.axes = ax
    box = AnnotationBbox(
        icon,
        (x, y),
        xybox=(offset, 0),
        frameon=False,
        xycoords='data',
        boxcoords="offset points",
        pad=0,
    )
    ax.add_artist(box)
    
# Picture
# The icon location gets its own name: the notebook-level `path` is the input-directory
# Path used by the data-loading cell, and overwriting it with a string would break
# re-running that cell.
icon_path = '../input/twitter-icon/twitter-icon-83.png'
offset_png(x=2.1, y=33000, path=icon_path, ax=ax0, zoom=0.2, offset=0)


# title (positions are in data coordinates)
ax0.text(0.65,32300,'Tweet Sentiment',fontweight='bold', fontsize=16, zorder=20)
ax0.text(0.66,30000,'COVID-19 Vaccinnations',fontweight='light', fontsize=8, zorder=20)




plt.show()

In [None]:
# format date correctly & get month & year
vaccine_tweets['date'] = pd.to_datetime(vaccine_tweets["date"].dt.strftime('%Y-%m-%d'))
vaccine_tweets['Year'], vaccine_tweets['Month'], = vaccine_tweets['date'].dt.year, vaccine_tweets['date'].dt.month

In [None]:
# Long format: number of tweets for every (date, sentiment) pair
temp1 = vaccine_tweets.groupby(['date', 'sentiment'])['Count'].count().reset_index().dropna()
# Wide format: one row per date, one column per sentiment, 0 where a pair has no tweets
temp2 = pd.pivot_table(temp1, index='date',columns='sentiment',values='Count',aggfunc=np.sum, fill_value=0)

So there appear to be many more neutral tweets than positive or negative.

We can now view how the sentiment changed over time

In [None]:
# Line chart: daily tweet count per sentiment, read from `temp2` (one column per sentiment).
fig = plt.figure(figsize=(8, 3), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0, hspace=0)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

# Colours are paired by position with the loop order below: Negative, Positive, Neutral
colors = ['#9b1b30', '#009473', '#f0c05a']

color_num = 0
for i in ["Negative", "Positive", "Neutral"]:
    sns.lineplot(data=temp2[i], x=temp2.index, y=temp2[i], color=colors[color_num], ax=ax0)
    color_num += 1
    
ax0.grid(axis='y', linestyle='-', alpha=0.01)   
#ax0.set_yticks([])
ax0.tick_params(axis=u'both', which=u'both',length=0)

for s in ['top', 'left', 'right']:
    ax0.spines[s].set_visible(False)
    
# Axis limits after plotting; Xstart anchors the title at the left edge of the data
Xstart, Xend = ax0.get_xlim()
Ystart, Yend = ax0.get_ylim()

ax0.set_ylabel("Tweet Count",fontsize=8,loc='top', fontfamily='monospace')
ax0.set_xlabel(" ",fontsize=8,loc='left', fontfamily='arial')
ax0.tick_params(axis = "both", which = "both", left=False, bottom=False)
# y tick labels as integers with thousands separators
ax0.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

# Between 4 and 7 date ticks, labelled in matplotlib's concise date format
locator = mdates.AutoDateLocator(minticks=4, maxticks=7)
formatter = mdates.ConciseDateFormatter(locator)
ax0.xaxis.set_major_locator(locator)
ax0.xaxis.set_major_formatter(formatter)

# NOTE(review): the title height (2400) is a hand-picked data coordinate; it will not follow
# the data if the daily counts change.
ax0.text(Xstart,2400,'Tweet Sentiment over time',fontweight='bold', fontsize=16, zorder=20)


plt.show()

# Oxford/AstraZeneca

Let's shift focus to the Oxford/AstraZeneca Jab now, as this has been in the press a lot lately.

What can we learn?

In [None]:
# Tweets whose original (uncleaned) text mentions 'astra', and their daily counts per sentiment.
astra = (vaccine_tweets[vaccine_tweets['orig_text'].str.lower().str.contains('astra')])
temp1 = astra.groupby(['date', 'sentiment'])['Count'].count().reset_index().dropna()
temp2 = pd.pivot_table(temp1, index='date',columns='sentiment',values='Count',aggfunc=np.sum, fill_value=0)

fig = plt.figure(figsize=(8, 3), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0, hspace=0)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

# Colours are paired by position with the loop order below: Negative, Positive, Neutral
colors = ['#9b1b30', '#009473', '#f0c05a']


color_num = 0
for i in ["Negative", "Positive", "Neutral"]:
    sns.lineplot(data=temp2[i], x=temp2.index, y=temp2[i], color=colors[color_num], ax=ax0)
    color_num += 1
    
ax0.grid(axis='y', linestyle='-', alpha=0.01)   
#ax0.set_yticks([])
ax0.tick_params(axis=u'both', which=u'both',length=0)

for s in ['top', 'left', 'right']:
    ax0.spines[s].set_visible(False)
    
Xstart, Xend = ax0.get_xlim()
Ystart, Yend = ax0.get_ylim()

ax0.set_ylabel("Tweet Count",fontsize=8,loc='top', fontfamily='monospace')
ax0.set_xlabel(" ",fontsize=8,loc='left', fontfamily='arial')
ax0.tick_params(axis = "both", which = "both", left=False, bottom=False)
ax0.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

locator = mdates.AutoDateLocator(minticks=4, maxticks=7)
formatter = mdates.ConciseDateFormatter(locator)
ax0.xaxis.set_major_locator(locator)
ax0.xaxis.set_major_formatter(formatter)

ax0.text(Xstart,185,'Tweet Sentiment over time: Oxford/AstraZeneca',fontweight='bold', fontsize=16, zorder=20)

# Highlighted periods and the annotation are written as calendar dates and converted with
# date2num, instead of raw day numbers (18700, 18708, ...) that are only right when
# matplotlib's date epoch is 1970-01-01.
first_span = (mdates.date2num(np.datetime64('2021-03-14')), mdates.date2num(np.datetime64('2021-03-22')))
second_span = (mdates.date2num(np.datetime64('2021-04-05')), mdates.date2num(np.datetime64('2021-04-10')))
label_x = mdates.date2num(np.datetime64('2021-02-22'))

ax0.axvspan(first_span[0], first_span[1], facecolor='lightgray',alpha=0.5)

ax0.axvspan(second_span[0], second_span[1], facecolor='lightgray',alpha=0.5)


# Arrow from the label to the start of the first highlighted period
plt.annotate('Negative Press Begins', xy=(first_span[0], 95), xytext=(label_x, 120),
             arrowprops=dict(facecolor='steelblue',arrowstyle="->",connectionstyle="arc3,rad=.2",color='black'), fontsize=7,fontfamily='monospace',ha='right', color='black')
    

plt.show()

Above we see **how the sentiment towards the Oxford/AstraZeneca jab changed over time.**

Initially, it was consistently neutral, up until Feb 2021 when all sentiments started to increase. 

Neutral remained the dominant sentiment, but it could be argued that **Positive & Negative sentiments were fairly equal.**

Then, in mid to late March, came the **negative press** associated with rare side effects. 

It is interesting that **not only did Negative sentiment increase as would be expected, but so too did Neutral and Positive.**

This again was the case in early April.

In [None]:
# Share of each sentiment in the day's total tweet count (temp2 holds the AstraZeneca counts here).
daily_total = temp2['Neutral'] + temp2['Negative'] + temp2['Positive']
for share_col, sentiment in (('Neut_%', 'Neutral'), ('Neg_%', 'Negative'), ('Pos_%', 'Positive')):
    temp2[share_col] = temp2[sentiment] / daily_total

# Viewed Differently

We can also view how the proportions of neutral / negative / positive tweets changed over time.

For this, **I'll zoom in to February through April**, so we can see how the negative press affected the sentiment explicitly

Plot inspired by:

https://www.kaggle.com/subinium/all-you-need-is-time-series-visualization-20

In [None]:
# Stacked daily share of Positive / Negative / Neutral tweets about AstraZeneca.
fig = plt.figure(figsize=(12, 3), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0, hspace=0)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

Neg_col = '#9b1b30'
Pos_col = '#009473'
Neut_col = '#f0c05a'

# Colour is looked up by sentiment name, never by position in a frequency ranking.
sentiment_colors = {'Neutral': Neut_col, 'Negative': Neg_col, 'Positive': Pos_col}
# Bottom-to-top order of the stack; the cumulative sum below follows this order.
stack_order = ['Positive', 'Negative', 'Neutral']

astra_temp = astra.groupby('sentiment')['date'].value_counts().unstack().fillna(0).loc[stack_order].T
astra_all = astra_temp.sum(axis=1)
# Cumulative share per day: Positive = p, Negative = p + n, Neutral = 1
astra_temp = (astra_temp.T / astra_all).cumsum().T

# The bars are overlaid, not stacked by matplotlib, so the tallest (last in the cumulative
# order) must be drawn first and the shortest last. Tying the draw order to stack_order,
# rather than to value_counts(), keeps the chart right whichever sentiment is most frequent.
for sents in reversed(stack_order):
    sentims = astra_temp[sents]
    ax0.bar(sentims.index, sentims, color=sentiment_colors[sents], label=sents)
    
for s in ['top', 'right', 'left']:
    ax0.spines[s].set_visible(False)

ax0.set_yticks([])

Xstart, Xend = ax0.get_xlim()
Ystart, Yend = ax0.get_ylim()

ax0.set_ylabel(" ",fontsize=8,loc='top', fontfamily='monospace')
ax0.set_xlabel(" ",fontsize=8,loc='left', fontfamily='arial')
ax0.tick_params(axis = "both", which = "both", left=False, bottom=False)
ax0.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

locator = mdates.AutoDateLocator(minticks=4, maxticks=7)
formatter = mdates.ConciseDateFormatter(locator)
ax0.xaxis.set_major_locator(locator)
ax0.xaxis.set_major_formatter(formatter)

# Titles sit at the left edge of the zoomed window set below
ax0.text(Xstart+63.5,1.26,'Tweet Sentiment over time: Oxford/AstraZeneca',fontweight='bold', fontsize=16, zorder=20)
ax0.text(Xstart+63.5,1.125,'February & April 2021',fontweight='light', fontsize=14, zorder=20)

# Zoom: hand-picked offsets (in days) from the automatic x limits
ax0.set_xlim(Xstart+63.5,Xend-9.5)

plt.show()

This is a valuable plot, because it shows that, regardless of the press coverage, the **predominant sentiment is always neutral**, with only minor changes in positive or negative sentiment.


# Let's remove neutrals

To understand the underlying currents in sentiment, let's remove neutral tweets.

This way, we'll be able to explicitly view how positive and negative sentiments have changed over time, without the distraction of the majority class of neutral

In [None]:
# Stacked daily share of Positive vs Negative tweets about AstraZeneca (Neutral excluded).
fig = plt.figure(figsize=(12, 3), dpi=150,facecolor=background_color)
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0, hspace=0)
ax0 = fig.add_subplot(gs[0, 0])
ax0.set_facecolor(background_color)

Neg_col = '#9b1b30'
Pos_col = '#009473'

# Colour is looked up by sentiment name, never by position in a frequency ranking.
sentiment_colors = {'Negative': Neg_col, 'Positive': Pos_col}
# Bottom-to-top order of the stack; the cumulative sum below follows this order.
stack_order = ['Positive', 'Negative']

neg_pos = astra[(astra['sentiment'] == 'Positive')  | (astra['sentiment'] == 'Negative')]
astra_temp = neg_pos.groupby('sentiment')['date'].value_counts().unstack().fillna(0).loc[stack_order].T
astra_all = astra_temp.sum(axis=1)
# Cumulative share per day: Positive = p, Negative = 1
astra_temp = (astra_temp.T / astra_all).cumsum().T

# The bars are overlaid, so the full-height Negative bar must be drawn first and Positive
# on top. Driving this from value_counts() would paint the whole chart in one (wrong)
# colour whenever Positive tweets outnumber Negative ones.
for sents in reversed(stack_order):
    sentims = astra_temp[sents]
    ax0.bar(sentims.index, sentims, color=sentiment_colors[sents], label=sents)
    
for s in ['top', 'right', 'left']:
    ax0.spines[s].set_visible(False)

ax0.set_yticks([])

Xstart, Xend = ax0.get_xlim()
Ystart, Yend = ax0.get_ylim()

ax0.set_ylabel(" ",fontsize=8,loc='top', fontfamily='monospace')
ax0.set_xlabel(" ",fontsize=8,loc='left', fontfamily='arial')
ax0.tick_params(axis = "both", which = "both", left=False, bottom=False)
ax0.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

locator = mdates.AutoDateLocator(minticks=4, maxticks=7)
formatter = mdates.ConciseDateFormatter(locator)
ax0.xaxis.set_major_locator(locator)
ax0.xaxis.set_major_formatter(formatter)

# Titles sit at the left edge of the zoomed window set below
ax0.text(Xstart+63.5,1.26,'Tweet Sentiment over time: Oxford/AstraZeneca',fontweight='bold', fontsize=16, zorder=20)
ax0.text(Xstart+63.5,1.125,'February & April 2021: Positive & Negative Only',fontweight='light', fontsize=14, zorder=20)

# Zoom: hand-picked offsets (in days) from the automatic x limits
ax0.set_xlim(Xstart+63.5,Xend-9.5)

plt.show()

This again is a powerful plot. 

We see more clearly now how positive & negative sentiment changed over time


# Work in progress...