# LAB 7: Ben and Saatvik 

## Setup and Installation

### In the following notebook we explore three possible solutions for this competition: fine-tuning `AWD_LSTM`, a bag-of-words approach, and a custom embeddings approach.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from fastai.text.all import * 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from torch import optim
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Here we unzip the datasets for the competition.

In [None]:
!unzip '/kaggle/input/movie-review-sentiment-analysis-kernels-only/test.tsv.zip'
!unzip '/kaggle/input/movie-review-sentiment-analysis-kernels-only/train.tsv.zip'

In [None]:
# Read the tab-separated training phrases.
df = pd.read_csv('train.tsv', sep="\t")
# Read the test phrases and rename their 'Phrase' column to 'text' in one step.
# (presumably 'text' is the column name the fastai test dataloader looks for — confirm)
df_test = pd.read_csv('test.tsv', sep="\t").rename({'Phrase': 'text'}, axis='columns')

In [None]:
# Keep only the first 10,000 rows of the training frame (in file order) for all later cells.
# NOTE(review): this is a head slice, not a random sample — confirm the subsampling is
# intended for the final run.
df = df[:10000]
df.head()

In [None]:
# Preview the first rows of the test frame (its 'Phrase' column was renamed to 'text' on load).
df_test.head()

### Create a dataloader for the training dataset.

In [None]:
# create train dataloader
# Text is read from the 'Phrase' column and the class label from the 'Sentiment' column.
# NOTE(review): no validation split or seed is passed here, so fastai's defaults apply —
# confirm they are acceptable for reproducibility.
dls = TextDataLoaders.from_df(df, text_col='Phrase', label_col='Sentiment')

# Fine Tune `AWD_LSTM` Model

In [None]:
# Build a text classifier learner on the AWD_LSTM architecture, tracking accuracy as the metric.
awd_learner = text_classifier_learner(dls, AWD_LSTM, metrics=accuracy)

In [None]:
# Fine-tune with an epoch argument of 100. SaveModelCallback is passed as a bare class
# (no custom arguments); EarlyStoppingCallback is configured with patience=10 and min_delta=0.01.
awd_learner.fine_tune(100, cbs=[SaveModelCallback, EarlyStoppingCallback(patience=10, min_delta=0.01)])

In [None]:
# save the model for future use
# The exported learner is written under the name 'export_lab7.pkl'.
awd_learner.export('export_lab7.pkl')

# Bag Of Words Approach


Here we create a vector that represents the set of words that are present in the review and use it to predict the sentiment.
Our approach uses a standard feed-forward neural network with two linear layers (one hidden layer). The layer at the beginning of our network converts from the format the dataloader provides, a tensor of word numbers, to a tensor of length equal to the number of words in the dictionary, with a 1 at each location where a word is present.

In [None]:
# Number of entries in the first vocab of the training dataloader. Used below as the width
# of the multi-hot vectors and as the size of the embedding table.
# assumes dls.train.vocab[0] is the token vocabulary — TODO confirm
vocab_size = len(dls.train.vocab[0])

In [None]:
def to_multi_hot(arr, size=None):
  """Return a multi-hot list marking which indices occur in `arr`.

  Args:
    arr: iterable of integer indices (e.g. the token ids of one review).
    size: length of the returned list. Defaults to the module-level
      `vocab_size` when omitted, which is what the original one-argument
      call used.

  Returns:
    A list of `size` integers that is 1 at every index present in `arr`
    and 0 elsewhere. Repeated indices still produce a single 1.
  """
  if size is None:
    # Looked up at call time so the default follows the notebook's vocabulary.
    size = vocab_size
  mh = [0] * size
  for idx in arr:
    mh[idx] = 1
  return mh

def batch_mh(big_arr):
  """Multi-hot encode a batch of token-id rows.

  Args:
    big_arr: 2-D integer tensor with one row of token ids per review.

  Returns:
    A float tensor with one row per review and `vocab_size` columns, holding
    1.0 in every column whose index appears in that review's row and 0.0
    elsewhere, moved to the default device with `to_device`.
  """
  # Start from all zeros on the input's device, so no Python lists are built
  # and nothing is copied back to the host on each forward pass.
  result = torch.zeros(big_arr.shape[0], vocab_size, device=big_arr.device)
  # Along dim 1, write 1.0 at the column given by each token id. Repeated ids
  # just rewrite the same 1.0.
  result.scatter_(1, big_arr.long(), 1.0)
  result = to_device(result)
  return result

In [None]:
# Bag-of-words classifier: multi-hot encode each batch, then two linear layers
# (vocab_size -> 30 -> 5) with a ReLU in between.
model = nn.Sequential(
    # Converts the batch of token ids into multi-hot vectors of width vocab_size.
    Lambda(batch_mh),
    nn.Linear(vocab_size, 30),
    nn.ReLU(),
    nn.Linear(30,5)  # note that output doesn't have a softmax layer.
                      # That gets handled in CrossEntropyLoss function. 
)

In [None]:
# Wrap the bag-of-words model in a Learner that uses SGD, flattened cross-entropy loss
# and accuracy as the metric, then display the learner's summary.
bow_learner = Learner(dls=dls, model=model, 
                opt_func=SGD, 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy)
bow_learner.summary()

In [None]:
# Fit with an epoch argument of 100, using model checkpointing, early stopping and
# learning-rate reduction callbacks.
# NOTE(review): EarlyStoppingCallback and ReduceLROnPlateau are passed as bare classes, so they
# run with fastai's default settings — confirm the defaults let the LR reduction take effect
# before early stopping ends training.
bow_learner.fit(100, cbs=[SaveModelCallback(), EarlyStoppingCallback, ReduceLROnPlateau])

In [None]:
# Export the bag-of-words learner under the name 'bag_of_words.pkl'.
bow_learner.export('bag_of_words.pkl')

# Embeddings 

Here we create an embedding matrix that is applied to each word. Reviews are padded or truncated to a fixed length of 100 tokens, since the network cannot accept variable-length input.

In [None]:
# Defining some constants
review_size = 100  # number of token positions kept per review by FirstHundred
embedding_size = 10  # width of each token's embedding vector
hidden_layer_size = 20  # units in the hidden linear layer of the embedding model

In [None]:
import torch.nn.functional as F
class FirstHundred(Module):
    """Force every batch of token ids to exactly `review_size` columns.

    Batches narrower than `review_size` are padded on the right of the last
    dimension with the value 1 (presumably the padding token id — confirm);
    the result is then sliced to its first `review_size` columns.
    """
    def forward(self, tns):
        # Negative when the batch is already wider than review_size; the slice
        # below enforces the final width either way.
        extra_cols = review_size - tns.shape[1]
        fixed_width = F.pad(tns, pad=(0, extra_cols, 0, 0), value=1)
        return to_device(fixed_width[:, :review_size])

class PrintShape(Module):
    """Debugging layer: print the size of the input, then pass it through unchanged."""
    def forward(self, arr):
        current_size = arr.size()
        print(current_size)
        return arr

In [None]:
# Embedding classifier: fix the review length, embed each token id, flatten, then two
# linear layers with a ReLU in between.
# NOTE: this rebinds `model`, replacing the bag-of-words network defined earlier.
model = nn.Sequential(
    FirstHundred(),
    nn.Embedding(vocab_size, embedding_size),
    nn.Flatten(),
    # After flattening, each review contributes embedding_size * review_size features.
    nn.Linear(embedding_size * review_size, hidden_layer_size),
    nn.ReLU(),
    nn.Linear(hidden_layer_size,5) 
)

In [None]:
# Wrap the embedding model in a Learner that uses SGD, flattened cross-entropy loss
# and accuracy as the metric, then display the learner's summary.
emb_learner = Learner(
    dls=dls,
    model=model,
    opt_func=SGD,
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
)
emb_learner.summary()

In [None]:
# Fit with an epoch argument of 100, checkpointing via SaveModelCallback and stopping
# early with patience=20.
emb_learner.fit(100, cbs=[SaveModelCallback(), EarlyStoppingCallback(patience=20)])

In [None]:
# Export the embeddings learner under the name 'embeddings.pkl'.
emb_learner.export('embeddings.pkl')

## Make predictions

In [None]:
# load the saved models
# awd_learner = load_learner('../input/export/export_lab7.pkl')
# emb_learner = load_learner('../input/embeddings/embeddings.pkl')
bow_learner = load_learner('../input/bagofwords/bag_of_words.pkl')

In [None]:
# create a test dataloader
# Built from the loaded bag-of-words learner's own dataloaders, using the test frame.
test_dl = bow_learner.dls.test_dl(df_test)

In [None]:
# make preds with the given dataloader
# NOTE(review): fastai documents get_preds as returning (predictions, targets); the second
# name here suggests probabilities — confirm what `bow_probs` actually holds.
# awd_preds,awd_probs = awd_learner.get_preds(dl=test_dl)
bow_preds,bow_probs = bow_learner.get_preds(dl=test_dl)
# emb_preds,emb_probs = emb_learner.get_preds(dl=test_dl)

In [None]:
# awd_list_preds = [torch.argmax(row).item() for row in awd_preds]

# Turn each row of class scores into the index of its highest-scoring class.
bow_list_preds = [torch.argmax(scores).item() for scores in bow_preds]

# emb_list_preds = [torch.argmax(row).item() for row in emb_preds]

## Create Submission File

In [None]:
# clean up test file for submission
# drop() returns a new frame, and errors='ignore' tolerates columns that are already
# gone, so re-running this cell no longer raises a KeyError the way `del` did.
df_test = df_test.drop(columns=['SentenceId', 'text'], errors='ignore')

In [None]:
# Write the bag-of-words predictions as a 'Sentiment' column next to the remaining test
# columns. Previously the column insert was commented out, so the CSV had no predictions.
# assign() returns a new frame, so `df_test` is left untouched and this cell can be re-run.
df_test.assign(Sentiment=bow_list_preds).to_csv('submission.csv', index=False)

# To write another model's predictions instead, swap in its prediction list, e.g.:
# df_test.assign(Sentiment=awd_list_preds).to_csv('awd_submission.csv', index=False)
# df_test.assign(Sentiment=emb_list_preds).to_csv('emb_submission.csv', index=False)