<a href="https://colab.research.google.com/github/shivammehta007/-Workshop-Introduction-to-Web-Development/blob/master/Comparison_of_NLP_Models_IMDB_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# We will compare NLP Models with IMDB Classification

## Imports

In [68]:
import os
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext

import numpy as np
import pandas as pd

from collections import namedtuple
from pprint import pprint

!pip install -U tqdm
# Need to restart runtime on google collab for it to work

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from tqdm.notebook import tqdm
from torchtext import datasets
from torchtext import data

from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.Defaults.create_tokenizer(nlp)

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.42.0)


## Seed Set

In [0]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

# Load the Data

## TorchText Data

Initialize two Field and Label 

In [0]:
TEXT = data.Field(tokenize=tokenizer)
LABEL = data.LabelField(dtype = torch.float)

In [4]:
%%time
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

CPU times: user 1min 10s, sys: 2.22 s, total: 1min 12s
Wall time: 1min 13s


In [5]:
print(len(train_data), len(test_data))
print(vars(train_data.examples[0]))

25000 25000
{'text': This is an absurdist dark comedy from Belgium. Shot perfectly in crisp black and white, Benoît Poelvoorde (Man Bites Dog) is on fine form as Roger, the angry, obsessive father of a family in a small, sullen Belgian mining town. Roger is a photographer who, along with his young daughter Luise, visits road accidents to take photos. He is also obsessed with winning a car by entering a competition where the contestant has to break a record - and he decides that his son, Michel, must attempt to break the record of perpetually walking through a door - he even hires an overweight coach to train him. Michel dresses as Elvis and has a spot on a radio show called 'Cinema Lies', where he describes mistakes in films. Luise is friendly with near neighbour Felix, a pigeon fancier. Roger is a callous figure as he pushes Michel right over the limit during the record attempt, which almost results in his death. Interspersed throughout the film are Magritte-like surreal images. It's 

In [0]:
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.seed(SEED))

In [7]:
print(f'Train Data: \t\t {len(train_data):,}')
print(f'Validation Dataset: \t {len(valid_data):,}')
print(f'Test Data: \t\t {len(test_data):,}')

Train Data: 		 20,000
Validation Dataset: 	 5,000
Test Data: 		 25,000


## Numpy Data
We Converted Data to Numpy for implementation of Classical Algorithms

In [8]:
type(train_data.examples[0].text)

spacy.tokens.doc.Doc

In [0]:
def get_dataframe_from_dataset(dataset, labels = {'X': 'text', 'y': 'label'}):
    """Utility Method to convert torchext.data.Dataset to numpy array of text and label"""
    i = 0
    data = {'X' : [], 'y' : []}
    for example in tqdm(dataset):
        data['X'].append(example.text.text)
        data['y'].append(example.label)
    
    assert len(data['X']) == len(data['y'])

    return pd.DataFrame(data).rename(columns=labels)


In [10]:
train_df = get_dataframe_from_dataset(train_data)
val_df = get_dataframe_from_dataset(valid_data)
test_df = get_dataframe_from_dataset(test_data)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [11]:
test_df.head()

Unnamed: 0,text,label
0,Hehehe. This was one of the best funny road mo...,pos
1,"This movie is good for entertainment purposes,...",pos
2,Corniness Warning. As many fellow IMDb users a...,pos
3,This movie starts off somewhat slowly and gets...,pos
4,I got some free tickets via the Times to see t...,pos


# Implementation

## Classical ML algorithms
First Step will be **Preprocessing**
### List of Algorithms Implemented
1.   Naïve Bais Classifier
2.   Random Forest Classifier
3.   Support Vector Machines (SVM) Linear
4.   SGD Classifiers
5.   XGBoost
6.   Lightgbm



## Pre Processing

### Steps:

1.   Decontract
2.   Stopwords Removal
3.   Punctuation Removal
4.   Vectorize
5.   Label Encode



In [0]:
dataframes = [train_df, val_df, test_df]

#### 1. Decontract

In [0]:
def decontracted(text):
    # specific
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)

    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [0]:
for df in dataframes:
    df['text'] = df['text'].apply(decontracted)

#### 2. Stopwords Removal

In [15]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stopwords = set(stopwords.words("english"))

In [0]:
def remove_stopwords(text):
    """Removes Stopwords from the text uses scapy tokenizer little heavy but good"""
    text = ' '.join([word.text for word in tokenizer(text) if word.text not in stopwords])
    return text

In [18]:
for df in tqdm(dataframes):
    df['text'] = df['text'].apply(remove_stopwords)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




#### 3. Punctuation Removal

In [0]:
from string import punctuation

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', punctuation))
    return text

In [20]:
for df in tqdm(dataframes):
    df['text'] = df['text'].apply(remove_punctuation)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




#### 4. Vectorize
from Sklearn.feature_extraction.text import:
1.   CountVectorizer
2.   TfIdfVectorizer



In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

We will use them later ! Let's Try Pipleline !!

#### 5. Encode Label

In [0]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [23]:
for df in tqdm(dataframes):
    df['label'] = le.fit_transform(df['label'])

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [24]:
train_df.head()

Unnamed: 0,text,label
0,honestly I begin This low budget HORRIBLY a...,0
1,This one unusual cases movie novel based great...,1
2,Words seriously enough convey emotional power ...,1
3,In August 1980 disappearance baby Azaria Chamb...,1
4,Oh dear I disappointed movie rip Japan Ringu...,0


### Converting to Numpy arrays 
For faster computation and generic variable names

In [0]:
split_df = lambda df: (np.array(df['text']), np.array(df['label']))

In [0]:
X_train, y_train = split_df(train_df)
X_valid, y_valid = split_df(val_df)
X_test, y_test = split_df(test_df)

## Pipeline of Algorithms

In [0]:
from sklearn.pipeline import Pipeline

### 1. Naïve Bais Classifier

In [0]:
from sklearn.naive_bayes import MultinomialNB

#### Generating Pipelines
Generate two pipelines with differenet Vectorizers, whichever will show better performance over valid set, we will tune Hyperparameters over it

In [0]:
naive_bais1 = Pipeline(
    [('cv', CountVectorizer()),
      ('nb', MultinomialNB())])

naive_bais2 = Pipeline(
    [('tfidf', TfidfVectorizer()),
      ('nb', MultinomialNB())])

In [0]:
Model = namedtuple('Model', ['pipeline', 'predictions', 'accuracy'])

In [86]:
best_accuracy = 0.0
best_model = Model(None, None, None)
for pipeline in tqdm([naive_bais1, naive_bais2]):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = Model(pipeline, y_pred, accuracy)

print('Best Pipeline found: {} with accuracy: {}'.format([step[0] for step in best_model.pipeline.steps], best_model.accuracy))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Best Pipeline found: ['tfidf', 'nb'] with accuracy: 0.8692


#### Tune Hyperparameters
We will use GridSearchCV