In [None]:
from fastai.text import *
import html
import pandas as pd

In [None]:
# Load the labelled training phrases (tab-separated, UTF-8 encoded).
df = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/train.tsv',
    sep='\t',
    encoding='utf-8',
)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Load the unlabelled test phrases (tab-separated, UTF-8 encoded).
df_test = pd.read_csv(
    '../input/movie-review-sentiment-analysis-kernels-only/test.tsv',
    sep='\t',
    encoding='utf-8',
)

In [None]:
# Drop the id columns so only the remaining (text) column is fed to the model;
# `df_test` itself is left intact because the ids are needed for the submission.
df_test2 = df_test.drop(["PhraseId", "SentenceId"], axis=1)

# Preview goes last: a notebook only renders the final expression of a cell,
# so a bare `.head()` placed before the assignment was silently discarded.
df_test.head()

In [None]:
# Preview the first five rows of the test frame after the id columns were dropped.
df_test2.head(5)

In [None]:
# Pull the phrase texts and their sentiment labels out as parallel NumPy arrays.
trn_texts, trn_labels = df['Phrase'].values, df['Sentiment'].values

In [None]:
# Shuffle texts and labels with one shared, seeded permutation so the
# text/label pairs stay aligned.
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
trn_texts, trn_labels = trn_texts[trn_idx], trn_labels[trn_idx]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled training data for validation.
# `random_state` is pinned so this cell gives the same split every time it is
# run; without it the split depends on the global NumPy RNG state, i.e. on
# which cells were executed before this one and how often.
# NOTE: despite the names, X_test / y_test are the validation split taken from
# train.tsv, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    trn_texts, trn_labels, test_size=0.1, random_state=42
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Assemble the train and validation frames with the label column first and
# the text column second.
col_names = ['labels', 'text']
df_trn = pd.DataFrame(
    {'labels': y_train, 'text': X_train},
    columns=col_names,
)
df_val = pd.DataFrame(
    {'labels': y_test, 'text': X_test},
    columns=col_names,
)

In [None]:
# Class distribution of the training split.
df_trn['labels'].value_counts()

In [None]:
# Class distribution of the validation split.
df_val['labels'].value_counts()

In [None]:
# (rows, columns) of the training split.
df_trn.shape

In [None]:
# Language model data
data_lm = TextLMDataBunch.from_df('./', train_df=df_trn, valid_df=df_val)

In [None]:
# Language-model hyperparameters passed to language_model_learner below
# as emb_sz and nl.
em_sz = 400
nl = 3

In [None]:
# Build a language-model learner with the sizes chosen above and light dropout.
learn = language_model_learner(data_lm, emb_sz=em_sz, nl=nl, drop_mult=0.1)
# Re-wrap the freshly built model in a LanguageLearner with bptt=70; only the
# model object is carried over from the learner created on the previous line.
learn = LanguageLearner(data_lm, learn.model, bptt=70)
# Load pretrained weights plus the matching index-to-string vocabulary file
# (presumably WikiText-103, going by the file names — confirm).
learn.load_pretrained('../input/wiki103/lstm_wt103.pth', '../input/wiki103/itos_wt103.pkl')

In [None]:
# Report accuracy during language-model training.
learn.metrics = [accuracy]
# Freeze up to the last layer group (presumably only that group stays
# trainable for the first fine-tuning pass — confirm against fastai docs).
learn.freeze_to(-1)

In [None]:
# Run the learning-rate finder for the language model.
learn.lr_find()

In [None]:
# Plot the recorder's results from the learning-rate finder run.
learn.recorder.plot()

In [None]:
# One one-cycle epoch at lr=1e-2 while the model is still frozen per the freeze_to(-1) call above.
learn.fit_one_cycle(1, 1e-2, moms=(.8,.7))

In [None]:
# Unfreeze the whole language model and fine-tune for one more epoch at a
# lower learning rate (1e-3).
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

In [None]:
# Sanity check: ask the fine-tuned language model for 10 more words after a prompt.
learn.predict("this is a review about", n_words =10)

In [None]:
# Classifier model data
data_clas = TextClasDataBunch.from_df('./', train_df=df_trn, valid_df=df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [None]:
# Save the fine-tuned language-model encoder under the name 'fine_enc';
# the classifier loads it by that name.
learn.save_encoder('fine_enc')

In [None]:
# Classifier
classifier = text_classifier_learner(data_clas, drop_mult=0.5)
classifier.load_encoder('fine_enc')
classifier.crit = F.cross_entropy

In [None]:
# Run the learning-rate finder for the classifier.
classifier.lr_find()

In [None]:
# Plot the recorder's results from the classifier learning-rate finder run.
classifier.recorder.plot()

In [None]:
# Stage 1 of classifier training: one one-cycle epoch at lr=1e-2.
classifier.fit_one_cycle(1, 1e-2, moms=(.8,.7))

In [None]:
# Stage 2: freeze_to(-2) (presumably leaves the last two layer groups
# trainable — confirm), then one epoch at a 10x smaller learning rate (1e-3).
classifier.freeze_to(-2)
classifier.fit_one_cycle(1, 1e-3, moms=(.8,.7))

In [None]:
# Stage 3: freeze_to(-3) (presumably leaves the last three layer groups
# trainable — confirm), then one epoch at lr=1e-4.
classifier.freeze_to(-3)
classifier.fit_one_cycle(1, 1e-4, moms=(.8,.7))

In [None]:
# Final stage: unfreeze everything and train 5 epochs at lr=50e-5 (i.e. 5e-4).
classifier.unfreeze()
classifier.fit_one_cycle(5, 50e-5, moms=(.8,.7))

In [None]:
# Spot-check the trained classifier on a hand-written negative phrase.
classifier.predict("This is not a  good movie")

In [None]:
# Collect the learner's predictions (presumably on the validation set, the
# fastai default — confirm). `preds` is not used by any later cell shown here.
preds = classifier.get_preds()

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# On a CUDA machine this prints a CUDA device.
print(device)

In [None]:
# Predict a sentiment class for every test phrase.
#
# The preprocessing and model setup used to sit inside
# `if torch.cuda.is_available():`. On a CPU-only machine that left
# `test_txt_list` undefined (NameError in the loop below) and never switched
# the model to eval mode. `device` already encodes the CPU/GPU choice, so the
# setup now runs unconditionally.

# Tokenize and numericalize the test phrases with the language-model vocab.
test_txt_list = TextList.from_df(
    df_test2,
    './',
    processor=[TokenizeProcessor(), NumericalizeProcessor(vocab=data_lm.vocab)],
).process()

# Move the model to the selected device and switch to inference mode
# (eval() disables training-only behaviour such as dropout).
classifier.model = classifier.model.to(device)
classifier.model = classifier.model.eval()

predicted = []
with torch.no_grad():  # no gradients are needed for inference
    for i, doc in enumerate(test_txt_list.items):
        if i % 10000 == 0:
            print("Evaluating...", i)

        # One document per forward pass: add a leading batch dimension of 1.
        doc = torch.LongTensor(doc).to(device)
        pred, _, _ = classifier.model(doc.unsqueeze(0))
        # With a single item in the batch, the flat argmax is the class index.
        predicted.append(pred.squeeze().argmax().item())

# Every test row must have exactly one prediction before building a submission.
assert len(predicted) == len(df_test2), "prediction count does not match test rows"
       

In [None]:
# Attach the predicted class ids to the test frame (adds/overwrites the
# 'Predicted' column in place; relies on `predicted` being in test-row order).
df_test['Predicted'] = predicted

In [None]:
# Preview the test frame with the new 'Predicted' column.
df_test.head()

In [None]:
# Build the submission table (PhraseId plus predicted Sentiment) and write it
# to disk without the index column.
my_submission = pd.DataFrame(
    {
        'PhraseId': df_test['PhraseId'],
        'Sentiment': df_test['Predicted'],
    }
)
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
my_submission.head()