<h1 style="color:aqua;text-align:center">COVID-19 Tweet EDA + Fast.ai Classification</h1>


<strong style="color:red">If you like my notebook, please leave an upvote!</strong>
<hr>

In [None]:
! pip install --quiet chart-studio

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from tqdm.notebook import tqdm

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import torch
import fastai
from fastai import *
from fastai.text import *

import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import iplot
from wordcloud import WordCloud
from plotly.offline import iplot

# Notebook-wide display configuration: raise pandas' row / column / width
# limits for printed frames, then select the matplotlib style sheet.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

plt.style.use('fivethirtyeight')

# NOTE(review): this only builds a torch.device for index 0 and, being the
# last expression of the cell, displays it. The object is not stored, so it
# presumably does not select the device used for training -- confirm.
torch.device(0)

In [None]:
# Locations of the two competition CSVs (relative to the notebook).
train_csv = "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv"
test_csv = "../input/covid-19-nlp-text-classification/Corona_NLP_test.csv"

# Both files are decoded as latin-1.
train_data, test_data = (
    pd.read_csv(csv_path, encoding='latin-1') for csv_path in (train_csv, test_csv)
)

In [None]:
# Preview the first rows of the training CSV as loaded.
train_data.head()

In [None]:
# Preview the first rows of the test CSV as loaded.
test_data.head()

We'll join both datasets, shuffle them, and then divide them.

In [None]:
# Stack the train and test frames into one. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the rows coming from test_data keep
# their own labels, which duplicate labels already used by train_data.
data = pd.concat([train_data, test_data], ignore_index=True)

# Show (rows, columns) of the combined frame.
data.shape

We want this classification to be 3-way, so we map `Extremely Positive` to `Positive` and `Extremely Negative` to `Negative`.
For the moment, we only need the tweet text and its sentiment.

In [None]:
# Collapse the five sentiment classes into three. Every class is listed
# explicitly because Series.map turns any value missing from the dict into NaN.
sentiment_3way = {
    'Extremely Positive': 'Positive',
    'Extremely Negative': 'Negative',
    'Negative': 'Negative',
    'Positive': 'Positive',
    'Neutral': 'Neutral',
}
data['Sentiment'] = data['Sentiment'].map(sentiment_3way)

# Keep only the tweet text and its sentiment. .copy() makes train_data an
# independent frame, so the later column assignments on it neither touch
# `data` nor raise SettingWithCopyWarning.
train_data = data[['OriginalTweet', 'Sentiment']].copy()

<h2 style="color:blue;text-align:center">Exploratory Data Analysis</h2>
<hr>

In [None]:
# Summary statistics of the two remaining columns (tweet text and sentiment).
train_data.describe()

<h3 style="color:green;text-align:center">Target Value Distribution</h3>

In [None]:
# Row count per sentiment class, in the order given by idx.
# NOTE(review): vals / idx are not passed to the figure below; px.pie is
# given the frame and the 'Sentiment' column name directly.
idx = ['Negative', 'Positive', 'Neutral']
vals = [
    len(train_data[train_data['Sentiment'] == sentiment]['Sentiment'])
    for sentiment in idx
]

# Pie chart of the share of each sentiment class.
pie_options = dict(
    names='Sentiment',
    title='Target Value Distribution Chart',
    color_discrete_sequence=px.colors.sequential.Agsunset,
)
fig = px.pie(train_data, **pie_options)
iplot(fig)

<h3 style="color:green;text-align:center">Character Count per Tweet</h3>

In [None]:
# Length of every tweet in characters, grouped by sentiment class.
tweet_text = train_data['OriginalTweet']
sentiment_col = train_data['Sentiment']
neg = tweet_text[sentiment_col == 'Negative'].str.len()
pos = tweet_text[sentiment_col == 'Positive'].str.len()
neu = tweet_text[sentiment_col == 'Neutral'].str.len()

# One histogram per class, laid out side by side in a single row.
fig = make_subplots(rows=1, cols=3)
panels = (
    (neg, 'Negative Tweets'),
    (pos, 'Positive Tweets'),
    (neu, 'Neutral Tweets'),
)
for panel_col, (char_counts, trace_name) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(x=list(char_counts), name=trace_name),
        row=1,
        col=panel_col,
    )

fig.update_layout(title_text="Character Count")
iplot(fig)

<h3 style="color:green;text-align:center">Word Count Distribution</h3>

In [None]:
# Number of whitespace-separated tokens in every tweet, grouped by sentiment.
tweet_text = train_data['OriginalTweet']
sentiment_col = train_data['Sentiment']
neg = tweet_text[sentiment_col == 'Negative'].str.split().map(len)
pos = tweet_text[sentiment_col == 'Positive'].str.split().map(len)
neu = tweet_text[sentiment_col == 'Neutral'].str.split().map(len)

# One histogram per class, laid out side by side in a single row.
fig = make_subplots(rows=1, cols=3)
panels = (
    (neg, 'Negative Tweets'),
    (pos, 'Positive Tweets'),
    (neu, 'Neutral Tweets'),
)
for panel_col, (word_counts, trace_name) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(x=list(word_counts), name=trace_name),
        row=1,
        col=panel_col,
    )

fig.update_layout(title_text="Word Count")
iplot(fig)

<h3 style="color:green;text-align:center">Unique Word Count</h3>

In [None]:
# Number of distinct whitespace-separated tokens per tweet, as plain lists
# (one list per sentiment class) for the distribution plot.
tweet_text = train_data['OriginalTweet']
sentiment_col = train_data['Sentiment']
neg = [len(set(str(tweet).split())) for tweet in tweet_text[sentiment_col == 'Negative']]
pos = [len(set(str(tweet).split())) for tweet in tweet_text[sentiment_col == 'Positive']]
neu = [len(set(str(tweet).split())) for tweet in tweet_text[sentiment_col == 'Neutral']]

# Overlay the three distributions in one figure.
group_labels = ['Negative', 'Positive', 'Neutral']
fig = ff.create_distplot([neg, pos, neu], group_labels)
fig.update_layout(title_text="Unique Word Count Distribution")
iplot(fig)

<h3 style="color:green;text-align:center">URL Count</h3>

In [None]:
# Number of URLs per tweet, grouped by sentiment class.
#
# A URL is recognised by its scheme prefix (http://, https:// or ftp://,
# case-insensitive). The earlier version split each tweet, turned the
# resulting list back into its string repr, split that again and then looked
# for the bare substrings 'http' / 'ftp', which also counted ordinary words
# that merely contain "ftp" (e.g. "softpedia").
url_scheme = r'(?:https?|ftp)://'
url_counts = (
    train_data['OriginalTweet']
    .str.count(url_scheme, flags=re.IGNORECASE)
    .fillna(0)      # a missing tweet contains no URL
    .astype(int)
)

sentiment_col = train_data['Sentiment']
neg = url_counts[sentiment_col == 'Negative']
pos = url_counts[sentiment_col == 'Positive']
neu = url_counts[sentiment_col == 'Neutral']

# One histogram per class, laid out side by side in a single row.
fig = make_subplots(rows=1, cols=3)
panels = (
    (neg, 'Negative Tweets'),
    (pos, 'Positive Tweets'),
    (neu, 'Neutral Tweets'),
)
for panel_col, (counts, trace_name) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(x=list(counts), name=trace_name),
        row=1,
        col=panel_col,
    )

fig.update_layout(title_text="URL Count")
iplot(fig)

<h3 style="color:green;text-align:center">Word Cloud</h3>

In [None]:
# Concatenate all tweets of each class into one long string per class.
tweet_text = train_data['OriginalTweet']
sentiment_col = train_data['Sentiment']
negative = " ".join(tweet_text[sentiment_col == 'Negative'].to_list())
positive = " ".join(tweet_text[sentiment_col == 'Positive'].to_list())
neutral = " ".join(tweet_text[sentiment_col == 'Neutral'].to_list())

fig, ax = plt.subplots(1, 3, figsize=(15,15))

# Same word-cloud settings for all three classes.
cloud_options = dict(width=256, height=256, collocations=False)
ng_wlc = WordCloud(**cloud_options).generate(negative)
ps_wlc = WordCloud(**cloud_options).generate(positive)
ne_wlc = WordCloud(**cloud_options).generate(neutral)
wcs = [ng_wlc, ps_wlc, ne_wlc]
titls = ["Negative Tweets", "Positive Tweets", "Neutral Tweets"]

# Draw each cloud on its own axis, without ticks, under its class title.
for panel, cloud, heading in zip(ax, wcs, titls):
    panel.imshow(cloud)
    panel.axis('off')
    panel.set_title(heading)

<h2 style="color:blue;text-align:center">Text Cleaning</h2>
<hr>

In [None]:
# Remove everything except basic text characters, then lowercase.
# regex=True is required: the pattern is a character class, and on pandas
# versions where str.replace defaults to literal matching it would otherwise
# look for the literal text "[^a-zA-Z]" and leave the tweets unchanged.
# .assign builds a new frame instead of writing into train_data in place,
# which avoids SettingWithCopyWarning when train_data is a slice of `data`.
train_data = train_data.assign(
    OriginalTweet=train_data['OriginalTweet']
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .str.lower()
)

# Spot-check five random cleaned rows.
train_data.sample(5)

In [None]:
# Change the column name and encode the labels
label_codes = {'Negative': 0, 'Positive': 1, 'Neutral': 2}

# Only encode while the original 'Sentiment' column is still present, so
# re-running this cell is a no-op. Without the guard, a second run would feed
# the already-encoded integers through the mapping and turn every label into 2.
if 'Sentiment' in train_data.columns:
    train_data = (
        train_data
        .rename(columns={'Sentiment': 'label'})
        # Anything that is not 'Negative' / 'Positive' falls back to 2, as before.
        .assign(label=lambda frame: frame['label'].map(lambda s: label_codes.get(s, 2)))
    )

In [None]:
# Let us now split the dataset into training and validation sets
split_pcent = 0.20  # Fraction of the rows that goes into the validation set
split_seed = 42     # Fixed seed so the shuffle (and thus the split) is reproducible
split = int(split_pcent * len(train_data))

# Shuffle all rows, then cut once: the first `split` rows become the
# validation set and the remainder the training set.
shuffled_set = train_data.sample(frac=1, random_state=split_seed).reset_index(drop=True)
valid_set = shuffled_set.iloc[:split]
train_set = shuffled_set.iloc[split:]

# Put the label first and the text second in both frames.
train_set = train_set[['label', 'OriginalTweet']]
valid_set = valid_set[['label', 'OriginalTweet']]

# Every row must land in exactly one of the two sets.
assert len(train_set) + len(valid_set) == len(train_data)

<h2 style="color:blue;text-align:center">Modelling</h2>
<hr>

In [None]:
# Arguments shared by both data bunches: same working path, same train and
# validation frames.
bunch_frames = dict(path="", train_df=train_set, valid_df=valid_set)

# Language-model data bunch built from the train / validation frames.
data_bunch = TextLMDataBunch.from_df(**bunch_frames)

# Classifier data bunch over the same frames; it reuses the language model's
# vocabulary and uses a batch size of 16.
data_clf = TextClasDataBunch.from_df(
    vocab=data_bunch.train_ds.vocab,
    bs=16,
    **bunch_frames,
)

First, train the language-model learner as-is for one epoch.

In [None]:
# Define the language learner model and fit for one epoch
# The learner is built on the language-model data bunch with the AWD_LSTM
# architecture and a dropout multiplier of 0.5.
learner = language_model_learner(data_bunch, arch=AWD_LSTM, drop_mult=0.5)

# One epoch; the second positional argument (1e-2) is presumably the maximum
# learning rate of the one-cycle schedule -- confirm against the fastai API.
learner.fit_one_cycle(1, 1e-2)

Now unfreeze the hidden layers and train the learner.

In [None]:
# Try unfreezing last 3 layers first
# For i = 1, 2, 3 the learner is re-frozen with freeze_to(-i) and trained for
# one more epoch each time, using the same 1e-2 setting as the initial fit.
# NOTE(review): presumably freeze_to(-i) leaves the last i layer groups
# trainable -- confirm against the fastai API.
layers_to_unfreeze = [1, 2, 3]
for i in layers_to_unfreeze:
    learner.freeze_to(-i)
    learner.fit_one_cycle(1, 1e-2)

Let's now unfreeze all layers and then train them.

In [None]:
# Unfreeze the language model and train it for one more epoch with the same
# 1e-2 setting used in the previous fits.
learner.unfreeze()
learner.fit_one_cycle(1, 1e-2)

In [None]:
# Save the encoder
# It is stored under the name 'learn_encoder'; the classifier loads it by that
# same name via load_encoder in a later cell.
learner.save_encoder('learn_encoder')

Now we train the classifier using the encoder above.

In [None]:
# Build the classifier learner on the classification data bunch, with the same
# architecture (AWD_LSTM) and dropout multiplier (0.5) as the language model.
clf = text_classifier_learner(data_clf, arch=AWD_LSTM, drop_mult=0.5)
# Load the encoder saved from the language model under 'learn_encoder'.
clf.load_encoder('learn_encoder')

In [None]:
# Train the classifier for 5 epochs; 1e-2 is presumably the maximum learning
# rate of the one-cycle schedule -- confirm against the fastai API.
clf.fit_one_cycle(5, 1e-2)

In [None]:
# Let's unfreeze all its layers and train it.
clf.unfreeze()
# Five more epochs. No learning rate is passed here, so whatever default
# fit_one_cycle uses applies (unlike the explicit 1e-2 in the earlier fits).
clf.fit_one_cycle(5)

<h2 style="color:blue;text-align:center">Testing and Classification</h2>
<hr>

In [None]:
# Run the trained classifier on a hand-written example sentence; the cell
# displays whatever predict returns.
# NOTE(review): the training text was lowercased and stripped of non-letters
# in the cleaning cell, while this sentence is passed in raw -- confirm
# whether the same cleaning should be applied here.
clf.predict("The COVID is harming our lives and destroying job opportunities")