In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from pathlib import Path
from fastai.text import *
from fastai.widgets import ClassConfusion
from sklearn.model_selection import train_test_split

def _list_dir(self):
    """Return every direct child of this directory as a list of Path objects."""
    return list(self.glob("*"))


# Expose the helper as `Path.ls()` for quick directory inspection.
Path.ls = _list_dir

In [None]:
# Root directory of the Twitter airline sentiment dataset, followed by a listing
# of its direct children (same result as the notebook's `Path.ls()` helper).
path = Path('../input/twitter-airline-sentiment')
list(path.glob("*"))

## EDA

In [None]:
# Read the tweets CSV into a DataFrame and show its first rows.
tweets_csv = path / "Tweets.csv"
df = pd.read_csv(tweets_csv)
df.head()

### Tweet length distribution

In [None]:
# Histogram of tweet lengths, measured in characters.
# `.str.len()` is the vectorised pandas equivalent of `.apply(len)`; unlike the
# lambda it yields NaN for a missing text value instead of raising a TypeError.
ax = df['text'].str.len().hist(figsize=(10, 5))
# Label the figure so it can be read on its own; the trailing `;` hides the repr.
ax.set(title='Tweet length distribution', xlabel='Characters per tweet', ylabel='Number of tweets');

### Sentiment distributions

In [None]:
# Bar chart of the number of tweets in each sentiment class.
sentiment_counts = df.groupby("airline_sentiment")['airline_sentiment'].count()
sentiment_counts.plot.bar(figsize=(10, 6))

In [None]:
# Number of tweets for every (airline, sentiment) pair ...
arline_by_sentiment = df.groupby(['airline', 'airline_sentiment'])['airline'].count()
# ... pivoted so that each sentiment becomes its own column, i.e. its own bar series.
sentiment_per_airline = arline_by_sentiment.unstack()
sentiment_per_airline.plot.bar(figsize=(10, 6))

In [None]:
# Number of tweets for every (tweet_location, sentiment) pair, displayed as a Series.
location_sentiment_groups = df.groupby(['tweet_location', 'airline_sentiment'])
location_by_sentiment = location_sentiment_groups['airline'].count()
location_by_sentiment

In [None]:
# Log-scaled number of tweets for every (retweet_count, sentiment) pair.
# log1p, i.e. log(1 + n), is used rather than log: log(1) == 0 would draw every
# group that contains exactly one tweet as a zero-height bar, which cannot be
# told apart from a group that has no tweets at all.
retweet_by_sentiment = df.groupby(['retweet_count', 'airline_sentiment'])['airline'].agg('count')
log_retweet_by_sentiment = np.log1p(retweet_by_sentiment)
ax = log_retweet_by_sentiment.unstack().plot(kind='bar', figsize=(20, 10))
# Spell out the transformed scale on the axis; the trailing `;` hides the repr.
ax.set(xlabel='Retweet count', ylabel='log(1 + number of tweets)');

## fast.ai

In [None]:
# Hold out 20% of the tweets for validation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore comparable metrics); stratify keeps the sentiment class
# proportions the same in the training and validation subsets.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['airline_sentiment'],
)

In [None]:
# Data bunch for the language model: built from the 'text' column of the
# training and validation frames, with '.' as the working path.
data_lm = TextLMDataBunch.from_df(
    '.',
    train_df,
    valid_df,
    text_cols='text',
)

In [None]:
# Show a batch of the language-model data as a visual sanity check.
data_lm.show_batch()

### Fine-tuning a language model
Let's fine-tune a language-model learner on the tweet dataset, i.e. a model that can generate text resembling the tweets in the dataset.

In [None]:
# Create a language-model learner on the tweet data using the AWD_LSTM
# architecture with drop_mult=0.5, then train it for one cycle with a maximum
# learning rate of 1e-2.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

Now let's unfreeze all layers to further train the Language Model

In [None]:
# Unfreeze the whole model, then train one more cycle at a lower maximum
# learning rate (1e-3 instead of 1e-2).
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

Let's test the fine-tuned language model and see how realistic the generated tweets look.

In [None]:
# Ask the language model to continue the prompt with 10 generated words.
learn.predict("This is really a bad thing", n_words=10)

In [None]:
# Same prompt, longer continuation: 50 generated words.
learn.predict("This is really a bad thing", n_words=50)

In [None]:
# Save the fine-tuned encoder under the name 'ft_enc'; the sentiment classifier
# loads it back by that name.
learn.save_encoder('ft_enc')

## Sentiment classifier
Let's build a sentiment classifier based on the LM that we've already built.

Note: we should reuse the vocab from the previous LM, otherwise the learnt embeddings will be useless.

In [None]:
# Data bunch for the classifier: same train/validation frames as the language
# model, labelled by 'airline_sentiment', reusing the language model's vocabulary.
lm_vocab = data_lm.train_ds.vocab
data_clas = TextClasDataBunch.from_df(
    '.',
    train_df,
    valid_df,
    text_cols='text',
    label_cols='airline_sentiment',
    vocab=lm_vocab,
    bs=32,
)

In [None]:
# Show a batch of the classification data as a visual sanity check.
data_clas.show_batch()

In [None]:
# Create the classifier learner (AWD_LSTM architecture, drop_mult=0.5) and load
# the encoder that was saved earlier as 'ft_enc'.
# NOTE: this rebinds `learn`, which until now referred to the language-model
# learner; re-running earlier cells after this one will act on the classifier.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('ft_enc')

### Train the classifier

In [None]:
# First classifier training pass: one cycle at a maximum learning rate of 1e-2.
# Presumably only the final layer group is trainable at this point (fastai
# default for a freshly created learner) - TODO confirm.
learn.fit_one_cycle(1, 1e-2)

Unfreeze the last two layer groups and train further

In [None]:
# Freeze the model up to layer group -2, leaving the last two groups trainable,
# then train one cycle. The slice passes a learning-rate range of
# 2.5e-3 .. 5e-3 instead of a single value.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

Unfreeze all layers and train

In [None]:
# Unfreeze the whole classifier and train one cycle with a learning-rate range
# of 2e-5 .. 2e-3 (the lower bound is the upper bound divided by 100).
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

In [None]:
# Longer run on the fully unfrozen classifier: 10 epochs in one cycle, same
# learning-rate range (2e-5 .. 2e-3) as the previous cell.
learn.fit_one_cycle(10, slice(2e-3/100, 2e-3))

Let's plot the confusion matrix and see which classes the model has a hard time telling apart.

In [None]:
# Build an interpretation object from the trained classifier learner; it is used
# below to plot the confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)

In [None]:
# Plot the confusion matrix for the classifier's predictions.
interp.plot_confusion_matrix()