In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import fastai
from fastai import *
from fastai.text import * 

from functools import partial

import os

In [None]:
! pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
! pip install fastai

In [None]:
import os

# List every file under the Kaggle input directory: for each file, print its
# bare name on one line and its full path on the next.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(name, os.path.join(folder, name), sep="\n")

In [None]:
# Load the competition files from the input directory.
# The train and test files are tab-separated (.tsv).
df = pd.read_csv("../input/train.tsv", sep="\t")
test = pd.read_csv("../input/test.tsv", sep="\t")
# The sample submission is a .csv file, so it is parsed with the default comma
# separator; with sep="\t" a comma-separated file is read as one fused column.
# NOTE(review): assumes the file really is comma-separated, as its extension
# indicates — confirm against the data.
sample = pd.read_csv("../input/sampleSubmission.csv")

In [None]:
# Frame for the deep-learning approach: keep only the sentiment (as 'label')
# and the phrase (as 'text').
df = pd.DataFrame({
    'label': df.Sentiment,
    'text': df.Phrase,
})

In [None]:
# Print the frame's dimensions, then its first rows, one after the other.
print(df.shape, df.head(), sep="\n")

In [None]:
# Horizontal count plot of the sentiment labels. The bars follow the order
# returned by value_counts(), restricted to its first five entries.
label_order = df.label.value_counts().iloc[:5].index

plt.figure(figsize=(9, 6))
sns.countplot(y=df.label, order=label_order)
plt.title('The distribution of the Sentiment')
plt.show()

# Fastai NLP approach

*Text can't be fed directly into a model, so the first thing we need to do is preprocess our data: we change the raw texts into lists of words, or tokens (a step called tokenization), and then transform these tokens into numbers (a step called numericalization). These numbers are then passed to embedding layers, which convert them into arrays of floats before passing them through a model.*

*You can find plenty of word embeddings on the web to directly convert your tokens into floats. Those word embeddings have generally been trained on a large corpus such as Wikipedia. Following the work of ULMFiT, the fastai library is more focused on using pre-trained language models and fine-tuning them.*

*The library is structured around three steps:*

* Get your data preprocessed and ready to use in a minimum amount of code,
* Create a language model with pretrained weights that you can fine-tune to your dataset,
* Create other models such as classifiers on top of the encoder of the language model.

Let’s clean our text by retaining only alphabetic characters and replacing everything else with a space.
I will also remove the stopwords from our text data.

In [None]:
# Keep letters only: every character outside a-z / A-Z becomes a space.
# regex=True is passed explicitly because Series.str.replace treats the
# pattern as a literal string by default in pandas >= 2.0, which would leave
# the text unchanged without raising any error.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)
stop_words = set(stopwords.words('english'))

# tokenization: split each phrase on whitespace
tokenized_doc = df['text'].str.split()

# remove stop-words (exact, case-sensitive membership test against stop_words)
tokenized_doc = tokenized_doc.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])

# de-tokenization: join the remaining tokens back into one string per phrase.
# The result is a Series aligned on df's index, so the assignment below does
# not depend on the index being exactly 0..n-1 (the previous loop did).
detokenized_doc = tokenized_doc.str.join(' ')

df['text'] = detokenized_doc

*First let’s split the dataset we prepared earlier into training and validation sets in a 70:30 ratio.*

In [None]:
# Stratified hold-out split on the sentiment label: 30% of the rows go to the
# validation frame, the rest to the training frame. random_state fixes the split.
df_trn, df_val = train_test_split(
    df[['label', 'text']],
    test_size=0.3,
    stratify=df['label'],
    random_state=12,
)

# Language Model

*To get a DataBunch quickly, there are several factory methods, depending on how our data is structured. They are all detailed in text.data; here we'll use the from_df method of TextLMDataBunch (to get the data ready for a language model).*

In [None]:
# Build the language-model DataBunch from the training and validation frames.
data_lm = TextLMDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
)

# Fine-tuning a language model

*We can use the data_lm object we created earlier to fine-tune a pretrained language model. fast.ai has an English model with an AWD-LSTM architecture available that we can download. We can create a learner object that will directly create a model, download the pretrained weights and be ready for fine-tuning.*

In [None]:
# Language-model learner on data_lm with the AWD_LSTM architecture and
# pretrained weights. drop_mult = 0.5; reducing drop_mult helps avoid underfitting.
# The architecture is passed via `arch` rather than the WT103_FWD constant, see
# https://stackoverflow.com/questions/57240057/not-able-to-use-fastais-pretrained-model-urls-wt103
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    pretrained=True,
    drop_mult=0.5,
)

*First let's find and visualize the range of learning rates with the corresponding losses.*

In [None]:
# Run the learning-rate finder on the language model, then plot the recorded
# losses against the learning rates (skip_end = 15 is passed to the plot).
learn.lr_find()
learn.recorder.plot(skip_end = 15)

*To fine-tune the last layers we are going to use a learning rate of 1e-02, because that's where the loss curve is going down.*

In [None]:
# First language-model training stage: one epoch of one-cycle training at a
# learning rate of 1e-02, with moms = (0.8, 0.7).
learn.fit_one_cycle(1, 1e-02, moms = (0.8, 0.7))

In [None]:
# Checkpoint the learner under the name 'fit_head', then load that same
# checkpoint back into the learner.
learn.save('fit_head')
learn.load('fit_head')


*To complete the fine-tuning we are going to unfreeze the model and launch a new training run, with four epochs this time:*

In [None]:
# Second language-model training stage: unfreeze the learner and train for
# 4 epochs with the one-cycle schedule. The learning rate used here (1e-03) is
# lower than the 1e-02 of the first stage; moms are the same as before.
learn.unfreeze()
learn.fit_one_cycle(4, 1e-03, moms = (0.8, 0.7)) # 4 epochs, lr 1e-03 (lower than the first stage's 1e-02), same moms = (0.8, 0.7)

In [None]:
# Checkpoint the fine-tuned language model under the name 'fine_tuned'.
learn.save('fine_tuned')

*To evaluate your language model, you can run the **Learner.predict** method and specify the number of words you want it to guess.*

In [None]:
# Ask the fine-tuned language model to continue the prompt with 10 more words.
learn.predict(
    "I like this movie because",
    n_words=10,
    temperature=1.1,
    min_p=0.001,
)

*It is very important after all to save this encoder to use it for classification later.*

In [None]:
# Save the encoder under the name 'fine_tuned_encoder'; the classifier loads
# it by that name later in the notebook.
learn.save_encoder('fine_tuned_encoder')

# Classifier.

*To get the DataBunch for our classifier we'll use the from_df method of the TextClasDataBunch class (to get the data ready for a text classifier).*

*This does all the necessary preprocessing behind the scenes. For the classifier, we also pass the vocabulary (mapping from ids to words) that we want to use: this ensures that data_clas will use the same dictionary as data_lm.*

*In general, batch size of 32 is a good starting point, and you should also try with 64, 128, and 256. Other values (lower or higher) may be fine for some data sets, but the given range is generally the best to start experimenting with.*

In [None]:
# Build the classifier DataBunch from the same train/validation frames,
# reusing the language model's vocabulary so both share one dictionary.
lm_vocab = data_lm.train_ds.vocab
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=lm_vocab,
    bs=64,
)


*Let's inspect the data_clas we've just created:*

In [None]:
# Display a sample batch from the classifier data.
data_clas.show_batch()

https://docs.fast.ai/text.transform.html#BaseTokenizer

The tokenization rules are listed at the link above; here is the meaning of the special tokens:

**UNK** (xxunk) is for an unknown word (one that isn't present in the current vocabulary)

**PAD** (xxpad) is the token used for padding, if we need to regroup several texts of different lengths in a batch

**BOS** (xxbos) represents the beginning of a text in your dataset

**FLD** (xxfld) is used if you set mark_fields=True in your TokenizeProcessor to separate the different fields of texts (if your texts are loaded from several columns in a dataframe)

**TK_MAJ** (xxmaj) is used to indicate the next word begins with a capital in the original text

**TK_UP** (xxup) is used to indicate the next word is written in all caps in the original text

**TK_REP** (xxrep) is used to indicate the next character is repeated n times in the original text (usage xxrep n {char})

**TK_WREP** (xxwrep) is used to indicate the next word is repeated n times in the original text (usage xxwrep n {word})

In [None]:
# Show the first 20 entries of the vocabulary's id-to-token list (itos).
data_clas.vocab.itos[:20]

*We now use the data_clas object we created earlier to build a classifier with our fine-tuned encoder. The learner object can be created in a single line.*

In [None]:
# Classifier learner on data_clas with the AWD_LSTM architecture.
# This rebinds `learn`, which previously held the language-model learner.
learn = text_classifier_learner(
    data_clas,
    arch=AWD_LSTM,
    drop_mult=0.5,
)

In [None]:
# Load the encoder that was saved from the fine-tuned language model under the
# name 'fine_tuned_encoder' into the classifier learner.
learn.load_encoder('fine_tuned_encoder')

In [None]:

# Freeze the classifier learner before the first training stage.
learn.freeze()

In [None]:
# Run the learning-rate finder on the classifier and plot the recorded losses
# against the learning rates.
learn.lr_find()
learn.recorder.plot()

In [None]:
# First classifier training stage (learner frozen): one epoch of one-cycle
# training at a learning rate of 1e-02, with moms = (0.8, 0.7).
learn.fit_one_cycle(1, 1e-02, moms= (0.8, 0.7))

In [None]:
# Checkpoint the classifier under the name 'first', then load that same
# checkpoint back into the learner.
learn.save('first')
learn.load('first')

Now we are going to unfreeze and train only the last two layers (freeze_to(-2)):

In [None]:
# Second classifier stage: freeze_to(-2), then one epoch of one-cycle training
# with a slice of learning rates.
# NOTE(review): the lower bound is derived from 1e-2 while the upper bound is
# 1e-3; the later stages derive both bounds from the same base value — confirm
# this asymmetry is intentional.
learn.freeze_to(-2)
stage2_lrs = slice(1e-2 / (2.6 ** 4), 1e-3)
learn.fit_one_cycle(1, stage2_lrs, moms=(0.8, 0.7))

In [None]:
# Checkpoint the classifier under the name 'second', then load that same
# checkpoint back into the learner.
learn.save('second')
learn.load('second')

Unfreeze and train the next layer: (freeze_to(-3))

In [None]:
# Third classifier stage: freeze_to(-3), then one epoch of one-cycle training
# with learning rates sliced from 5e-3 / 2.6**4 up to 5e-3.
learn.freeze_to(-3)
stage3_lrs = slice(5e-3 / (2.6 ** 4), 5e-3)
learn.fit_one_cycle(1, stage3_lrs, moms=(0.8, 0.7))

In [None]:
# Checkpoint the classifier under the name 'third', then load that same
# checkpoint back into the learner.
learn.save('third')
learn.load('third')

Unfreeze the whole thing and train it:

In [None]:
# Final classifier stage: unfreeze the whole learner and train for two epochs
# of one-cycle training with learning rates sliced from 1e-3 / 2.6**4 up to 1e-3.
learn.unfreeze()
final_lrs = slice(1e-3 / (2.6 ** 4), 1e-3)
learn.fit_one_cycle(2, final_lrs, moms=(0.8, 0.7))

In [None]:
# Checkpoint the classifier under the name 'last', then load that same
# checkpoint back into the learner.
learn.save('last')
learn.load('last')

In [None]:
# Classify a single example phrase with the trained classifier.
learn.predict('intermittently pleasing mostly routine')

# Making Predictions on the test set.

In [None]:
# Frame for the deep-learning approach on the test data: the test phrases in a
# single 'text' column, the same column name the training frame uses.
df_test = pd.DataFrame({
    'text': test.Phrase,
})

In [None]:
# Apply the same cleaning as for the training text.
# Keep letters only: every character outside a-z / A-Z becomes a space.
# regex=True is passed explicitly because Series.str.replace treats the
# pattern as a literal string by default in pandas >= 2.0, which would leave
# the text unchanged without raising any error.
df_test['text'] = df_test['text'].str.replace("[^a-zA-Z]", " ", regex=True)

# tokenization: split each phrase on whitespace
tokenized_doc_test = df_test['text'].str.split()

# remove stop-words (stop_words is the set built in the training-text cleaning cell)
tokenized_doc_test = tokenized_doc_test.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])

# de-tokenization: join the remaining tokens back into one string per phrase.
# The result is a Series aligned on df_test's index, so the assignment below
# does not depend on the index being exactly 0..n-1 (the previous loop did).
detokenized_doc_test = tokenized_doc_test.str.join(' ')

df_test['text'] = detokenized_doc_test

In [None]:
# Preview the first rows of the cleaned test phrases.
df_test.head()

In [None]:
# Count how often each cleaned test phrase occurs and turn the result into a
# frame with the phrase in column 'index' and its frequency in column 'text'.
# The columns are named explicitly so the layout is the same on every pandas
# version: pandas 2.x would otherwise name them 'text' and 'count', and the
# dd['index'] lookups in this notebook would raise a KeyError.
dd = df_test.text.value_counts().rename_axis('index').reset_index(name='text')

# Preview only the 20 most frequent phrases instead of printing every unique
# phrase, which floods the cell output.
for i in dd['index'].head(20):
    print(i)

In [None]:
# Dimensions of dd: one row per unique cleaned test phrase.
dd.shape

In [None]:
# Classify the single-character phrase 'n'.
# NOTE(review): presumably a probe of how the classifier handles a leftover
# fragment produced by the text cleaning — confirm, or remove if it is debugging residue.
learn.predict('n')

In [None]:
# Run the classifier once per unique cleaned phrase (column 'index' of dd)
# and collect the raw predict() results in a Series.
unique_phrases = dd['index']
preds = unique_phrases.apply(lambda phrase: learn.predict(phrase))