# Install

In [1]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194

In [2]:
! pip install xgboost



In [3]:
! pip install gensim



# Imports

In [4]:
import os
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

from datasets import load_dataset

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [5]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Mount google drive

In [7]:
drive.mount('/content/drive')

Mounted at /content/drive


# Config

In [8]:
# Locations
DRIVE_PATH = '/content/drive/My Drive/'
DATA_DIR = 'Study/Projects/Problem statements/movie_review_classification/data/'
RES_DIR = 'Study/Projects/Problem statements/movie_review_classification/results/'

# Debug
debug = True

# Dataset
DATASET_NAME = "imdb"

# Limit the data size (for testing purpose)
LIMIT_DATA = False
LIMIT_DATA_SIZE = 100

# Pre-processing flags
PR_REMOVE_PUNCTUATIONS = True
PR_REMOVE_SPECIAL_CHAR = True
PR_REMOVE_HTML_TAGS = True
PR_REMOVE_STOPWORDS = True
PR_REMOVE_URL = True
PR_REMOVE_EXTRA_SPACE = True
PR_LEMMATIZATION = True

# Unknown token
UNK_TOKEN = 'UNK'
UNK_TOKEN_IDX = 1

# Padding
PAD = 'PAD'
PAD_IDX = 0

# Number of features for BOW, TF-IDF based models
NUM_FEATURES = 5000

# Random state for classifiers
RANDOM_STATE = 42

# Target labels
POS_LABEL = 1
NEG_LABEL = 0

# Word2Vec
EMB_SIZE = 100

# Utils

### Baseline classification models

In [9]:
def base_classification_model(train_features, train_labels, test_features, test_labels):

    # Check: Logistic regression code

    # Logistic Regression classifier
    print(f'LOGISTIC REGRESSION CLASSIFIER')
    lr_classifier = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    lr_classifier.fit(train_features, train_labels)
    y_pred = lr_classifier.predict(test_features)
    model_evaluate(list(test_labels), y_pred, True)
    print('===================================================')

    # Random forest classifer
    print(f'RANDOM FOREST CLASSIFIER:')
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    rf_classifier.fit(train_features, train_labels)
    y_pred = rf_classifier.predict(test_features)
    model_evaluate(list(test_labels), y_pred, True)
    print('===================================================')

    # XGBoost classifer
    print(f'XGBoost CLASSIFIER')
    xgb_classifier = XGBClassifier(random_state=RANDOM_STATE)
    xgb_classifier.fit(train_features, train_labels)
    y_pred_1 = xgb_classifier.predict(test_features)
    model_evaluate(list(test_labels), y_pred_1, True)
    print('===================================================')

    # LightGBM classifer
    print(f'LightGBM CLASSIFIER')
    lgbm_classifier = LGBMClassifier(random_state=RANDOM_STATE)
    lgbm_classifier.fit(train_features, train_labels)
    # lgbm_classifier.fit(pd.DataFrame(train_features.values), train_labels)
    y_pred_2 = lgbm_classifier.predict(test_features)
    # y_pred_2 = lgbm_classifier.predict(pd.DataFrame(test_features.values))
    model_evaluate(list(test_labels), y_pred_2, True)
    print('===================================================')

### Classification metrics

In [10]:
def model_evaluate(true_val, pred_val, print_res):

    '''
    Calculate classification metrics (Accuracy, Precision, Recall, F1-Score)
    '''

    true_val = list(true_val)
    pred_val = list(pred_val)

    tp = 0 # Number of true positives
    tn = 0 # Number of true negatives
    fp = 0 # Number of false positives
    fn = 0 # Number of false negatives

    for i in range(len(true_val)): # For each sample

        if true_val[i] == pred_val[i]:
            if true_val[i] == POS_LABEL:
                tp += 1
            elif true_val[i] == NEG_LABEL:
                tn += 1

        if (true_val[i] == NEG_LABEL) and (pred_val[i] == POS_LABEL):
            fp += 1

        if (true_val[i] == POS_LABEL) and (pred_val[i] == NEG_LABEL):
            fn += 1

    accuracy = round(((tp + tn) / (tp + tn + fp + fn)) * 100, 2) # Overall accuracy

    precision_pos = round(((tp) / (tp + fp)) * 100, 2) # Precision of positive class
    recall_pos = round(((tp) / (tp + fn)) * 100, 2) # Recall of positive class (Sensitivity)
    f1_pos = round(((2 * precision_pos * recall_pos) / (precision_pos + recall_pos)), 2) # F1 Score of positive class

    precision_neg = round(((tn) / (tn + fn)) * 100, 2)
    recall_neg = round(((tn) / (tn + fp)) * 100, 2) # Specificity
    f1_neg = round(((2 * precision_neg * recall_neg) / (precision_neg + recall_neg)), 2)

    # Print metrics
    if print_res:
        print(f'Accuracy: {accuracy}')
        print(f'TP: {tp}')
        print(f'TN: {tn}')
        print(f'FP: {fp}')
        print(f'FN: {fn}')
        print(f'precision_pos: {precision_pos}')
        print(f'precision_neg: {precision_neg}')
        print(f'recall_pos: {recall_pos}')
        print(f'recall_neg: {recall_neg}')
        print(f'f1_pos: {f1_pos}')
        print(f'f1_neg: {f1_neg}')

    else:
        return [accuracy, tp, tn, fp, fn, precision_pos, precision_neg, recall_pos, recall_neg, f1_pos, f1_neg]

### Write pandas df as csv in drive

In [11]:
def write_to_drive(df, location, filename):
    print(f'Saving {filename} to {DRIVE_PATH + location}')
    df.to_csv(DRIVE_PATH + location + filename, index=False)

### Write dictionary

In [12]:
def write_dict(dict_data, location, filename):
    with open(DRIVE_PATH + location + filename + '.json', 'w') as json_file:
        json.dump(dict_data, json_file)

### Read dictionary

In [13]:
def read_dict(location, filename):
    with open(DRIVE_PATH + location + filename + '.json', 'r') as json_file:
        return(json.load(json_file))

# Data

### Load

In [13]:
imdb_dataset = load_dataset(DATASET_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

### Train/Test Extract

In [14]:
train_dataset = imdb_dataset['train']
test_dataset = imdb_dataset['test']

### Information

In [15]:
print(f'Type(x_train): {type(train_dataset)}')
print(f'Type(y_train): {type(test_dataset)}')

Type(x_train): <class 'datasets.arrow_dataset.Dataset'>
Type(y_train): <class 'datasets.arrow_dataset.Dataset'>


In [16]:
print(f'Training dataset length: {len(train_dataset)}')
print(f'Testing dataset length: {len(test_dataset)}')

Training dataset length: 25000
Testing dataset length: 25000


In [17]:
print(f'type(train_dataset[0]): {type(train_dataset[0])}')

type(train_dataset[0]): <class 'dict'>


In [18]:
print(f'train_dataset[0] keys: {list(train_dataset[0].keys())}')

train_dataset[0] keys: ['text', 'label']


### Sample

In [19]:
print(train_dataset[0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [20]:
print(test_dataset[0])

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

### Convert to pandas dataframe

In [21]:
train_df = pd.DataFrame(train_dataset)
test_df = pd.DataFrame(test_dataset)

In [22]:
train_df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [23]:
test_df.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [24]:
train_df.shape

(25000, 2)

In [25]:
test_df.shape

(25000, 2)

### Limit data size

In [26]:
if LIMIT_DATA:
    print('Limiting Train/Test Data')

    train_df = train_df.iloc[0:LIMIT_DATA_SIZE, :]
    test_df = test_df.iloc[0:LIMIT_DATA_SIZE, :]

    print(f'Train Size: {train_df.shape}')
    print(f'Test Size: {test_df.shape}')

In [27]:
TRAIN_DATA_SIZE = train_df.shape[0]
TEST_DATA_SIZE = test_df.shape[0]

print(f'TRAIN_DATA_SIZE: {TRAIN_DATA_SIZE}')
print(f'TEST_DATA_SIZE: {TEST_DATA_SIZE}')

TRAIN_DATA_SIZE: 25000
TEST_DATA_SIZE: 25000


### Write/Read data

In [28]:
DRIVE_PATH + DATA_DIR + 'train_df.csv'

'/content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/train_df.csv'

In [29]:
write_to_drive(train_df, DATA_DIR, 'train_df.csv')

Saving train_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [14]:
train_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'train_df.csv')

In [31]:
write_to_drive(test_df, DATA_DIR, 'test_df.csv')

Saving test_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [15]:
test_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'test_df.csv')

In [16]:
train_df.shape, test_df.shape

((25000, 2), (25000, 2))

# Data cleaning
1: Lowercase conversion
<br>2: Remove punctuations
<br>3: Remove special characters
<br>4: Remove HTML tags
<br>5: Remove URLs
<br>6: Remove stop words
<br>7: Remove extra space

### EDA

In [34]:
train_df['text'].head(10)

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
3    This film was probably inspired by Godard's Ma...
4    Oh, brother...after hearing about this ridicul...
5    I would put this at the top of my list of film...
6    Whoever wrote the screenplay for this movie ob...
7    When I first saw a glimpse of this movie, I qu...
8    Who are these "They"- the actors? the filmmake...
9    This is said to be a personal film for Peter B...
Name: text, dtype: object

In [35]:
train_df['text'][0]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [36]:
print(train_df['text'][0])

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

### Cleaning

In [37]:
if PR_REMOVE_PUNCTUATIONS:
    punctuations = string.punctuation
    print(f'punctuations: {punctuations}')

punctuations: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [38]:
if PR_REMOVE_STOPWORDS:
    stopwords_en = stopwords.words('english')
    print(f'Number of stopwords: {len(stopwords_en)}')

Number of stopwords: 179


In [39]:
def clean_data(x):

    if debug: print(f'Input: {x}')
    if debug: print(f'Input data length: {len(x.split(" "))}')

    # Convert to lower case
    x1 = x.lower()

    # Replace punctuations with space
    # TODO: Will join words like ABCD-XYZ. Handle this.
    # TODO: Seems repetitive (since punctuations will also be removed in special characters removal step)
    if PR_REMOVE_PUNCTUATIONS:
        x1 = " ".join(["".join([char for char in list(word) if char not in punctuations]) for word in x1.split(" ")])
        if debug: print(f'x1 (After punctuations removal): {x1}')
        if debug:print(f'Length: {len(x1.split(" "))}')

    # Replace special characters (anything other than a-z / A-Z / 0-9) with space
    if PR_REMOVE_SPECIAL_CHAR:
        x2 = re.sub("[^a-zA-Z0-9]", " ", x1)
        if debug: print(f'x2 (After special characters removal): {x2}')
        if debug:print(f'Length: {len(x2.split(" "))}')

    # Replace HTML tags with space
    if PR_REMOVE_HTML_TAGS:
        x3 = re.sub("<.*?>", " ", x2)
        if debug: print(f'x3 (After HTML tags removal): {x3}')
        if debug:print(f'Length: {len(x3.split(" "))}')

    # Replace URL with space
    if PR_REMOVE_URL:
        x4 = re.sub("(http|https|www)\S+", "", x3)
        if debug: print(f'x4 (After URL removal): {x4}')
        if debug:print(f'Length: {len(x4.split(" "))}')

    # Remove stopwords
    if PR_REMOVE_STOPWORDS:
        x5 = " ".join([word for word in x4.split(" ") if word not in stopwords_en])
        if debug: print(f'x5 (After stopwords removal): {x5}')
        if debug:print(f'Length: {len(x5.split(" "))}')

    # TODO: Strip spaces from start and end of the sentence.
    # Remove extra space
    if PR_REMOVE_EXTRA_SPACE:
        x6 = re.sub(" +", " ", x5)
        if debug: print(f'x5 (After stopwords removal): {x5}')
        if debug:print(f'Length: {len(x6.split(" "))}')

    return x6

In [40]:
# Testing
if debug:
    clean_data(train_df['text'][0])

Input: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwe

In [41]:
debug = False
train_df['text_preprocessed'] = train_df['text'].apply(clean_data)

In [42]:
train_df.head()

Unnamed: 0,text,label,text_preprocessed
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,rented curiousyellow video store controversy s...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,curious yellow risible pretentious steaming pi...
2,If only to avoid making this type of film in t...,0,avoid making type film future film interesting...
3,This film was probably inspired by Godard's Ma...,0,film probably inspired godards masculin f mini...
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing ridiculous film umptee...


In [43]:
test_df['text_preprocessed'] = test_df['text'].apply(clean_data)

In [44]:
test_df.head()

Unnamed: 0,text,label,text_preprocessed
0,I love sci-fi and am willing to put up with a ...,0,love scifi willing put lot scifi moviestv usua...
1,"Worth the entertainment value of a rental, esp...",0,worth entertainment value rental especially li...
2,its a totally average film with a few semi-alr...,0,totally average film semialright action sequen...
3,STAR RATING: ***** Saturday Night **** Friday ...,0,star rating saturday night friday night friday...
4,"First off let me say, If you haven't enjoyed a...",0,first let say havent enjoyed van damme movie s...


### Lemmatization

In [45]:
if PR_LEMMATIZATION:
    lemma = WordNetLemmatizer()

In [46]:
def lemmatize_text(review):
    return " ".join([lemma.lemmatize(word, 'v') for word in review.split(" ")])

In [47]:
train_df['text_preprocessed_1'] = train_df['text_preprocessed'].apply(lemmatize_text)

In [48]:
test_df['text_preprocessed_1'] = test_df['text_preprocessed'].apply(lemmatize_text)

In [49]:
train_df.head()

Unnamed: 0,text,label,text_preprocessed,text_preprocessed_1
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,rented curiousyellow video store controversy s...,rent curiousyellow video store controversy sur...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,curious yellow risible pretentious steaming pi...,curious yellow risible pretentious steam pile ...
2,If only to avoid making this type of film in t...,0,avoid making type film future film interesting...,avoid make type film future film interest expe...
3,This film was probably inspired by Godard's Ma...,0,film probably inspired godards masculin f mini...,film probably inspire godards masculin f minin...
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing ridiculous film umptee...,oh brotherafter hear ridiculous film umpteen y...


In [50]:
test_df.head()

Unnamed: 0,text,label,text_preprocessed,text_preprocessed_1
0,I love sci-fi and am willing to put up with a ...,0,love scifi willing put lot scifi moviestv usua...,love scifi will put lot scifi moviestv usually...
1,"Worth the entertainment value of a rental, esp...",0,worth entertainment value rental especially li...,worth entertainment value rental especially li...
2,its a totally average film with a few semi-alr...,0,totally average film semialright action sequen...,totally average film semialright action sequen...
3,STAR RATING: ***** Saturday Night **** Friday ...,0,star rating saturday night friday night friday...,star rat saturday night friday night friday mo...
4,"First off let me say, If you haven't enjoyed a...",0,first let say havent enjoyed van damme movie s...,first let say havent enjoy van damme movie sin...


### Write/Read data

In [51]:
write_to_drive(train_df, DATA_DIR, 'train_df_1.csv')

Saving train_df_1.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [52]:
write_to_drive(test_df, DATA_DIR, 'test_df_1.csv')

Saving test_df_1.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [20]:
train_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'train_df_1.csv')

In [21]:
test_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'test_df_1.csv')

In [22]:
train_df.shape, test_df.shape

((25000, 4), (25000, 4))

In [56]:
train_df.head()

Unnamed: 0,text,label,text_preprocessed,text_preprocessed_1
0,I rented I AM CURIOUS-YELLOW from my video sto...,0,rented curiousyellow video store controversy s...,rent curiousyellow video store controversy sur...
1,"""I Am Curious: Yellow"" is a risible and preten...",0,curious yellow risible pretentious steaming pi...,curious yellow risible pretentious steam pile ...
2,If only to avoid making this type of film in t...,0,avoid making type film future film interesting...,avoid make type film future film interest expe...
3,This film was probably inspired by Godard's Ma...,0,film probably inspired godards masculin f mini...,film probably inspire godards masculin f minin...
4,"Oh, brother...after hearing about this ridicul...",0,oh brotherafter hearing ridiculous film umptee...,oh brotherafter hear ridiculous film umpteen y...


In [57]:
test_df.head()

Unnamed: 0,text,label,text_preprocessed,text_preprocessed_1
0,I love sci-fi and am willing to put up with a ...,0,love scifi willing put lot scifi moviestv usua...,love scifi will put lot scifi moviestv usually...
1,"Worth the entertainment value of a rental, esp...",0,worth entertainment value rental especially li...,worth entertainment value rental especially li...
2,its a totally average film with a few semi-alr...,0,totally average film semialright action sequen...,totally average film semialright action sequen...
3,STAR RATING: ***** Saturday Night **** Friday ...,0,star rating saturday night friday night friday...,star rat saturday night friday night friday mo...
4,"First off let me say, If you haven't enjoyed a...",0,first let say havent enjoyed van damme movie s...,first let say havent enjoy van damme movie sin...


# Make vocabulary
Ways to make vocabulary:
<br>1: Use a pre-defined vocabulary
<br>2: Make your own vocabulary
<br>2-1: RULE-1: Select all words. This will be computationally expensive.
<br>2-2: RULE-2: Select words on the basis of some metric (count)

Findings:
<br>Total vocabulary size: 109016

In [58]:
vocab = {} # {word : count}
word2idx = {}
idx2word = {}

# Padding: index-0 ; Unknown token: index=1
word2idx[UNK_TOKEN] = UNK_TOKEN_IDX
idx2word[UNK_TOKEN_IDX] = UNK_TOKEN

In [59]:
len(word2idx), UNK_TOKEN_IDX

(1, 1)

In [60]:
def make_vocab(x):
    for i in tqdm(range(x.shape[0])):
        review = x['text_preprocessed_1'][i]
        for word in review.split(' '):
            if word in vocab.keys():
                vocab[word] += 1
            else:
                vocab[word] = 1
                word2idx[word] = len(word2idx) + 1
                idx2word[len(word2idx)] = word

In [61]:
make_vocab(train_df)

100%|██████████| 25000/25000 [00:02<00:00, 8563.88it/s]


In [62]:
len(vocab), len(word2idx), len(idx2word)

(109016, 109017, 109017)

In [63]:
test_word = 'awesome'
vocab[test_word], word2idx[test_word], idx2word[word2idx[test_word]]

(465, 3154, 'awesome')

In [64]:
sorted_vocab = dict(sorted(vocab.items(), key=lambda item: item[1], reverse=True))

In [65]:
# Deciding top n occuring words as features
# num_selected_words = 0
# for word in vocab.keys():
#     if vocab[word] > 15:
#         num_selected_words += 1
# print(num_selected_words)

### Write/Read data

In [66]:
write_dict(vocab, DATA_DIR, 'vocab')

In [23]:
vocab = read_dict(DATA_DIR, 'vocab')

In [68]:
write_dict(word2idx, DATA_DIR, 'word2idx')

In [24]:
word2idx = read_dict(DATA_DIR, 'word2idx')

In [70]:
write_dict(idx2word, DATA_DIR, 'idx2word')

In [25]:
idx2word = read_dict(DATA_DIR, 'idx2word')

In [72]:
write_dict(sorted_vocab, DATA_DIR, 'sorted_vocab')

In [26]:
sorted_vocab = read_dict(DATA_DIR, 'sorted_vocab')

# Bag of Words Model

### Making Features

##### Feature list
Selecting top n max occuring features

In [74]:
NUM_FEATURES

5000

In [27]:
feature_list = list(sorted_vocab.keys())[:NUM_FEATURES+10]
len(feature_list), feature_list[0:10]
#  ['br', 'film', 'movie', 'one', 'make', 'like', 'see', 'get', 'time', 'good'])

(5010,
 ['br', 'film', 'movie', 'one', 'make', 'like', 'see', 'get', 'time', 'good'])

In [28]:
# Editing feature list (TODO: removing some noisy features which should have been handled after pre-processing)

# Removing 'br
feature_list.remove('br')
print(len(feature_list), feature_list[0:10])

# Removing ''
feature_list.remove('')
print(len(feature_list), feature_list[0:10])

feature_list = feature_list[:NUM_FEATURES]
print(len(feature_list), feature_list[0:10])

5009 ['film', 'movie', 'one', 'make', 'like', 'see', 'get', 'time', 'good', 'character']
5008 ['film', 'movie', 'one', 'make', 'like', 'see', 'get', 'time', 'good', 'character']
5000 ['film', 'movie', 'one', 'make', 'like', 'see', 'get', 'time', 'good', 'character']


##### Train features

In [83]:
bow_train = []

for i in tqdm(range(train_df.shape[0])):
    review = train_df['text_preprocessed_1'][i]
    bow_train.append([review.split(' ').count(word) for word in feature_list])

100%|██████████| 25000/25000 [19:16<00:00, 21.62it/s]


##### Test features

In [84]:
bow_test = []

for i in tqdm(range(test_df.shape[0])):
    review = test_df['text_preprocessed_1'][i]
    bow_test.append([review.split(' ').count(word) for word in feature_list])

100%|██████████| 25000/25000 [17:02<00:00, 24.44it/s]


##### Train: Conversion to pandas, Write, Read

In [85]:
bow_train_df = pd.DataFrame(bow_train)
bow_train_df.columns = feature_list

In [86]:
bow_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,iq,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect
0,4,0,0,3,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,4,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,1,5,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
write_to_drive(bow_train_df, DATA_DIR, 'bow_train_df.csv')

Saving bow_train_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [17]:
bow_train_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'bow_train_df.csv')

In [18]:
bow_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,iq,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect
0,4,0,0,3,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,4,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,1,5,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
bow_train_df.shape

(25000, 5000)

##### Test: Conversion to pandas, Write, Read

In [91]:
bow_test_df = pd.DataFrame(bow_test)
bow_test_df.columns = feature_list

In [92]:
bow_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,iq,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect
0,0,0,0,0,2,0,0,0,2,4,...,0,0,0,0,0,0,0,0,0,0
1,0,2,1,1,2,1,1,0,2,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,3,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,8,0,2,3,0,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,5,0,0,2,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
write_to_drive(bow_test_df, DATA_DIR, 'bow_test_df.csv')

Saving bow_test_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [20]:
bow_test_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'bow_test_df.csv')

In [21]:
bow_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,iq,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect
0,0,0,0,0,2,0,0,0,2,4,...,0,0,0,0,0,0,0,0,0,0
1,0,2,1,1,2,1,1,0,2,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,3,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,8,0,2,3,0,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,5,0,0,2,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
bow_test_df.shape

(25000, 5000)

### Baseline models

In [98]:
base_classification_model(bow_train_df.iloc[:, :NUM_FEATURES],
                          train_df['label'],
                          bow_test_df.iloc[:, :NUM_FEATURES],
                          test_df['label'])

LOGISTIC REGRESSION CLASSIFIER
Accuracy: 85.09
TP: 10504
TN: 10768
FP: 1732
FN: 1996
precision_pos: 85.85
precision_neg: 84.36
recall_pos: 84.03
recall_neg: 86.14
f1_pos: 84.93
f1_neg: 85.24
RANDOM FOREST CLASSIFIER:
Accuracy: 84.72
TP: 10477
TN: 10704
FP: 1796
FN: 2023
precision_pos: 85.37
precision_neg: 84.1
recall_pos: 83.82
recall_neg: 85.63
f1_pos: 84.59
f1_neg: 84.86
XGBoost CLASSIFIER
Accuracy: 86.15
TP: 10977
TN: 10561
FP: 1939
FN: 1523
precision_pos: 84.99
precision_neg: 87.4
recall_pos: 87.82
recall_neg: 84.49
f1_pos: 86.38
f1_neg: 85.92
LightGBM CLASSIFIER
[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.504559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23086
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 4997

# TF-IDF Model

### Making Features

##### Document frequency
Number of documents in which a particular word/feature occur

In [99]:
# Document frequency of all the vocab words (on the basis of train data)
vocab_df = {}

def calc_doc_freq(x):
    for feature in tqdm(feature_list):
        for i in range(x.shape[0]):
            review = x['text_preprocessed_1'][i].split(' ')
            if feature in review:
                # if feature in vocab_df.keys():
                #     vocab_df[feature] += 1
                # else:
                #     vocab_df[feature] = 1
                vocab_df[feature] = vocab_df.get(feature, 0) + 1

In [100]:
calc_doc_freq(train_df)

100%|██████████| 5000/5000 [42:55<00:00,  1.94it/s]


In [102]:
vocab_df['well'] # 6933

6933

In [103]:
write_dict(vocab_df, DATA_DIR, 'vocab_df')

In [29]:
vocab_df = read_dict(DATA_DIR, 'vocab_df')

In [30]:
vocab_df['well']

6933

##### Document length (for TF)
Number of words in each document

In [39]:
def calc_num_words(x):
    return len(x.split(' '))

In [40]:
train_df['text_preprocessed_1_num_words'] = train_df['text_preprocessed_1'].apply(calc_num_words)

In [41]:
test_df['text_preprocessed_1_num_words'] = test_df['text_preprocessed_1'].apply(calc_num_words)

In [42]:
tfidf_train_df = bow_train_df.copy()

In [43]:
tfidf_test_df = bow_test_df.copy()

In [44]:
tfidf_train_df['text_preprocessed_1_num_words'] = list(train_df['text_preprocessed_1_num_words'])

In [45]:
tfidf_test_df['text_preprocessed_1_num_words'] = list(test_df['text_preprocessed_1_num_words'])

In [47]:
tfidf_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,4,0,0,3,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,152
1,4,0,1,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,120
2,2,0,4,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,53
3,4,1,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,62
4,9,0,0,0,1,5,3,1,0,0,...,0,0,0,0,0,0,0,0,0,173


In [48]:
tfidf_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0,0,0,0,2,0,0,0,2,4,...,0,0,0,0,0,0,0,0,0,132
1,0,2,1,1,2,1,1,0,2,1,...,0,0,0,0,0,0,0,0,0,127
2,3,0,0,3,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,72
3,8,0,2,3,0,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,212
4,0,5,0,0,2,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,64


##### TF

In [49]:
for i in tqdm(tfidf_train_df.columns[:-1]):
    tfidf_train_df[i] = tfidf_train_df[i] / tfidf_train_df['text_preprocessed_1_num_words']
    tfidf_test_df[i] = tfidf_test_df[i] / tfidf_test_df['text_preprocessed_1_num_words']

100%|██████████| 5000/5000 [00:09<00:00, 512.91it/s]


In [50]:
tfidf_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.026316,0.0,0.0,0.019737,0.006579,0.006579,0.0,0.0,0.013158,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,152
1,0.033333,0.0,0.008333,0.0,0.008333,0.016667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120
2,0.037736,0.0,0.075472,0.018868,0.0,0.0,0.0,0.018868,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53
3,0.064516,0.016129,0.0,0.0,0.0,0.016129,0.0,0.016129,0.016129,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62
4,0.052023,0.0,0.0,0.0,0.00578,0.028902,0.017341,0.00578,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173


In [51]:
tfidf_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.015152,0.030303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132
1,0.0,0.015748,0.007874,0.007874,0.015748,0.007874,0.007874,0.0,0.015748,0.007874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127
2,0.041667,0.0,0.0,0.041667,0.0,0.0,0.0,0.013889,0.0,0.013889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72
3,0.037736,0.0,0.009434,0.014151,0.0,0.0,0.009434,0.0,0.004717,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212
4,0.0,0.078125,0.0,0.0,0.03125,0.0,0.0,0.0,0.046875,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64


##### IDF

In [52]:
import math

In [54]:
num_doc = tfidf_train_df.shape[0]
num_doc

25000

In [55]:
idf = [math.log(num_doc/(vocab_df[feature] + 1)) for feature in feature_list]
len(idf)

5000

In [56]:
j = 0
for i in tqdm(tfidf_train_df.columns[:-1]):
    tfidf_train_df[i] = tfidf_train_df[i] * idf[j]
    tfidf_test_df[i] = tfidf_test_df[i] * idf[j]
    j = j + 1

100%|██████████| 5000/5000 [00:22<00:00, 221.90it/s]


In [57]:
tfidf_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.014026,0.0,0.0,0.013308,0.004724,0.004767,0.0,0.0,0.012858,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,152
1,0.017767,0.0,0.004957,0.0,0.005984,0.012075,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120
2,0.020113,0.0,0.04489,0.012722,0.0,0.0,0.0,0.017564,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53
3,0.034387,0.008243,0.0,0.0,0.0,0.011686,0.0,0.015014,0.015761,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62
4,0.027728,0.0,0.0,0.0,0.004151,0.02094,0.014814,0.005381,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173


In [58]:
tfidf_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.0,0.0,0.0,0.0,0.01088,0.0,0.0,0.0,0.014806,0.033973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132
1,0.0,0.008049,0.004683,0.005309,0.011309,0.005705,0.006726,0.0,0.015389,0.008828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127
2,0.022208,0.0,0.0,0.028095,0.0,0.0,0.0,0.012929,0.0,0.015571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72
3,0.020113,0.0,0.005611,0.009542,0.0,0.0,0.008059,0.0,0.004609,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212
4,0.0,0.039929,0.0,0.0,0.02244,0.0,0.0,0.0,0.045807,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64


### Write/Read data

In [59]:
write_to_drive(tfidf_train_df, DATA_DIR, 'tfidf_train_df.csv')

Saving tfidf_train_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [14]:
tfidf_train_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'tfidf_train_df.csv')

In [61]:
write_to_drive(tfidf_test_df, DATA_DIR, 'tfidf_test_df.csv')

Saving tfidf_test_df.csv to /content/drive/My Drive/Study/Projects/Problem statements/movie_review_classification/data/


In [15]:
tfidf_test_df = pd.read_csv(DRIVE_PATH + DATA_DIR + 'tfidf_test_df.csv')

In [16]:
tfidf_train_df.shape, tfidf_test_df.shape

((25000, 5001), (25000, 5001))

In [17]:
tfidf_train_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.014026,0.0,0.0,0.013308,0.004724,0.004767,0.0,0.0,0.012858,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,152
1,0.017767,0.0,0.004957,0.0,0.005984,0.012075,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120
2,0.020113,0.0,0.04489,0.012722,0.0,0.0,0.0,0.017564,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53
3,0.034387,0.008243,0.0,0.0,0.0,0.011686,0.0,0.015014,0.015761,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62
4,0.027728,0.0,0.0,0.0,0.004151,0.02094,0.014814,0.005381,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173


In [18]:
tfidf_test_df.head()

Unnamed: 0,film,movie,one,make,like,see,get,time,good,character,...,dukes,quinn,meg,inhabitants,jade,gina,email,rukh,resurrect,text_preprocessed_1_num_words
0,0.0,0.0,0.0,0.0,0.01088,0.0,0.0,0.0,0.014806,0.033973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132
1,0.0,0.008049,0.004683,0.005309,0.011309,0.005705,0.006726,0.0,0.015389,0.008828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127
2,0.022208,0.0,0.0,0.028095,0.0,0.0,0.0,0.012929,0.0,0.015571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72
3,0.020113,0.0,0.005611,0.009542,0.0,0.0,0.008059,0.0,0.004609,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212
4,0.0,0.039929,0.0,0.0,0.02244,0.0,0.0,0.0,0.045807,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64


### Baseline models

In [23]:
base_classification_model(tfidf_train_df.iloc[:, :NUM_FEATURES],
                          train_df['label'],
                          tfidf_test_df.iloc[:, :NUM_FEATURES],
                          test_df['label'])

LOGISTIC REGRESSION CLASSIFIER
Accuracy: 87.12
TP: 11004
TN: 10777
FP: 1723
FN: 1496
precision_pos: 86.46
precision_neg: 87.81
recall_pos: 88.03
recall_neg: 86.22
f1_pos: 87.24
f1_neg: 87.01
RANDOM FOREST CLASSIFIER:
Accuracy: 84.72
TP: 10444
TN: 10736
FP: 1764
FN: 2056
precision_pos: 85.55
precision_neg: 83.93
recall_pos: 83.55
recall_neg: 85.89
f1_pos: 84.54
f1_neg: 84.9
XGBoost CLASSIFIER
Accuracy: 85.82
TP: 10943
TN: 10512
FP: 1988
FN: 1557
precision_pos: 84.63
precision_neg: 87.1
recall_pos: 87.54
recall_neg: 84.1
f1_pos: 86.06
f1_neg: 85.57
LightGBM CLASSIFIER
[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.314038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 402527
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 4997

# Word2Vec (CBOW) Embeddings

### Tokenization

In [24]:
tokenized_sentences = [review.split(' ') for review in list(train_df['text_preprocessed_1'])]
tokenized_sentences[0:1]

[['rent',
  'curiousyellow',
  'video',
  'store',
  'controversy',
  'surround',
  'first',
  'release',
  '1967',
  'also',
  'hear',
  'first',
  'seize',
  'us',
  'customs',
  'ever',
  'try',
  'enter',
  'country',
  'therefore',
  'fan',
  'film',
  'consider',
  'controversial',
  'really',
  'see',
  'myselfbr',
  'br',
  'plot',
  'center',
  'around',
  'young',
  'swedish',
  'drama',
  'student',
  'name',
  'lena',
  'want',
  'learn',
  'everything',
  'life',
  'particular',
  'want',
  'focus',
  'attentions',
  'make',
  'sort',
  'documentary',
  'average',
  'swede',
  'think',
  'certain',
  'political',
  'issue',
  'vietnam',
  'war',
  'race',
  'issue',
  'unite',
  'state',
  'ask',
  'politicians',
  'ordinary',
  'denizens',
  'stockholm',
  'opinions',
  'politics',
  'sex',
  'drama',
  'teacher',
  'classmates',
  'marry',
  'menbr',
  'br',
  'kill',
  'curiousyellow',
  '40',
  'years',
  'ago',
  'consider',
  'pornographic',
  'really',
  'sex',
  'n

### Model train

In [25]:
# vector_size = Embedding dimension
# window = n ie. Total window = n words before + n words after the current word
# min_count = 1 ie consider all the words (no elimination on the basis of word count)
# workers = number of CPU cores to use
model_word2vec_cbow = Word2Vec(sentences=tokenized_sentences,
                               vector_size=EMB_SIZE,
                               window=5,
                               min_count=1,
                               workers=4)

### Model save/load

In [26]:
# Save model
model_word2vec_cbow.save(DRIVE_PATH + DATA_DIR + "word2vec_CBOW.model")

In [27]:
# Load model
model_word2vec_cbow = Word2Vec.load(DRIVE_PATH + DATA_DIR + "word2vec_CBOW.model")

### Manual testing
Similar words closer embeddings in vector space

In [28]:
# Test words
words = ['good', 'movie', 'bad', 'actor', 'action',
         'year', 'immediately', 'husband', 'sister',
         'hand', 'lose', 'director', 'minutes', 'funny',
         'vote', 'brazil', 'finish', 'oldest', 'noise',
         'project', 'poor', 'brazilian', 'singer', 'king',
         'queen', 'man', 'woman', 'suck', 'edit']

i = 0
for word in words:
    # Word embeddings
    # vector = model_word2vec_cbow.wv[word]

    # Most similar words
    similar_words = model_word2vec_cbow.wv.most_similar(word)
    # print(f"Most similar words {similar_words}")

    # Most similar word
    if i == 0:
        print('Most similar word for:')
        i = 1
    print(f'{word} --> {max(similar_words, key=lambda x: x[1])[0]}')

Most similar word for:
good --> decent
movie --> film
bad --> awful
actor --> actress
action --> thrill
year --> yrs
immediately --> soon
husband --> wife
sister --> brother
hand --> fix
lose --> change
director --> filmmaker
minutes --> minute
funny --> hilarious
vote --> rat
brazil --> sevenbr
finish --> till
oldest --> lila
noise --> crunch
project --> filmmaker
poor --> atrocious
brazilian --> hail
singer --> baio
king --> stephen
queen --> princess
man --> woman
woman --> girl
suck --> stink
edit --> choppy


### Model vocabulary

In [29]:
# Model vocabulary
model_word2vec_cbow_vocab = list(model_word2vec_cbow.wv.key_to_index.keys())

# Model vocabulary size
model_word2vec_cbow_vocab_size = len(model_word2vec_cbow.wv.key_to_index)

len(model_word2vec_cbow_vocab), model_word2vec_cbow_vocab_size

(109016, 109016)

# Word2Vec (Skipgram) Embeddings

### Model train

In [31]:
# sg=1 means SKipgram model (Defualt is 0: CBOW)
model_word2vec_sg = Word2Vec(sentences=tokenized_sentences,
                             sg=1,
                             vector_size=EMB_SIZE,
                             window=5,
                             min_count=1,
                             workers=4)

###Model save/load

In [32]:
# Save the model
model_word2vec_sg.save(DRIVE_PATH + DATA_DIR + "word2vec_Skipgram.model")

In [33]:
# Load the model
model_word2vec_sg = Word2Vec.load(DRIVE_PATH + DATA_DIR + "word2vec_Skipgram.model")

### Manual testing
Similar words closer embeddings in vector space

In [35]:
# Test words
words = ['good', 'movie', 'bad', 'actor', 'action',
         'year', 'immediately', 'husband', 'sister',
         'hand', 'lose', 'director', 'minutes', 'funny',
         'vote', 'brazil', 'finish', 'oldest', 'noise',
         'project', 'poor', 'brazilian', 'singer', 'king',
         'queen', 'man', 'woman', 'suck', 'edit']

i = 0
for word in words:
    # Word embeddings
    # vector = model_word2vec_sg.wv[word]

    # Most similar words
    similar_words = model_word2vec_sg.wv.most_similar(word)
    # print(f"Most similar words {similar_words}")

    # Most similar word
    if i == 0:
        print('Most similar word for:')
        i = 1
    print(f'{word} --> {max(similar_words, key=lambda x: x[1])[0]}')

Most similar word for:
good --> decent
movie --> film
bad --> terrible
actor --> versatile
action --> shootouts
year --> years
immediately --> sookie
husband --> wife
sister --> youngest
hand --> manos
lose --> ark
director --> filmmaker
minutes --> min
funny --> hilarious
vote --> voters
brazil --> sevenbr
finish --> finishbr
oldest --> molester
noise --> chant
project --> fiasco
poor --> lousy
brazilian --> milestone
singer --> singers
king --> lion
queen --> pinup
man --> woman
woman --> man
suck --> horrible
edit --> shaky


### Combined testing CBOW_Skipgram

In [34]:
# Test words
words = ['good', 'movie', 'bad', 'actor', 'action',
         'year', 'immediately', 'husband', 'sister',
         'hand', 'lose', 'director', 'minutes', 'funny',
         'vote', 'brazil', 'finish', 'oldest', 'noise',
         'project', 'poor', 'brazilian', 'singer', 'king',
         'queen', 'man', 'woman', 'suck', 'edit']

i = 0
for word in words:
    # Most similar words
    similar_words_cbow = model_word2vec_cbow.wv.most_similar(word) # CBOW
    similar_words_skipgram = model_word2vec_sg.wv.most_similar(word) # Skipgram
    # print(f"Most similar words {similar_words_cbow}")

    # Most similar word
    if i == 0:
        print('Most similar word for:')
        i = 1
    print(f'{word} --> CBOW: {max(similar_words_cbow, key=lambda x: x[1])[0]} --> Skipgram: {max(similar_words_skipgram, key=lambda x: x[1])[0]}')

Most similar word for:
good --> CBOW: decent --> Skipgram: decent
movie --> CBOW: film --> Skipgram: film
bad --> CBOW: awful --> Skipgram: terrible
actor --> CBOW: actress --> Skipgram: versatile
action --> CBOW: thrill --> Skipgram: shootouts
year --> CBOW: yrs --> Skipgram: years
immediately --> CBOW: soon --> Skipgram: sookie
husband --> CBOW: wife --> Skipgram: wife
sister --> CBOW: brother --> Skipgram: youngest
hand --> CBOW: fix --> Skipgram: manos
lose --> CBOW: change --> Skipgram: ark
director --> CBOW: filmmaker --> Skipgram: filmmaker
minutes --> CBOW: minute --> Skipgram: min
funny --> CBOW: hilarious --> Skipgram: hilarious
vote --> CBOW: rat --> Skipgram: voters
brazil --> CBOW: sevenbr --> Skipgram: sevenbr
finish --> CBOW: till --> Skipgram: finishbr
oldest --> CBOW: lila --> Skipgram: molester
noise --> CBOW: crunch --> Skipgram: chant
project --> CBOW: filmmaker --> Skipgram: fiasco
poor --> CBOW: atrocious --> Skipgram: lousy
brazilian --> CBOW: hail --> Skipgram: 

# Word2Vec CBOW Embeddings based classification
Using average of embeddings for all words as a feature

### Making Features

In [37]:
def calc_avg_emb_feature(review, model):
    emb_feature = np.zeros(EMB_SIZE)
    num_words = len(review.split(' '))

    for word in review.split(' '):
        try:
            emb_feature = emb_feature + model.wv[word]
        except KeyError:
            emb_feature = emb_feature + np.zeros(EMB_SIZE)

    emb_feature = emb_feature / num_words

    return list(emb_feature)

In [38]:
word2vec_model_1_train_dict = {}
word2vec_model_1_test_dict = {}

model = Word2Vec.load(DRIVE_PATH + DATA_DIR + "word2vec_CBOW.model")

for i in tqdm(range(train_df.shape[0])):
    word2vec_model_1_train_dict[str(i)] = calc_avg_emb_feature(train_df['text_preprocessed_1'][i], model)
    word2vec_model_1_test_dict[str(i)] = calc_avg_emb_feature(test_df['text_preprocessed_1'][i], model)

100%|██████████| 25000/25000 [00:26<00:00, 929.99it/s] 


In [39]:
word2vec_model_1_train_df = pd.DataFrame(word2vec_model_1_train_dict)
word2vec_model_1_test_df = pd.DataFrame(word2vec_model_1_test_dict)

In [40]:
word2vec_model_1_train_df.shape, word2vec_model_1_test_df.shape

((100, 25000), (100, 25000))

In [41]:
word2vec_model_1_train_df = word2vec_model_1_train_df.T
word2vec_model_1_test_df = word2vec_model_1_test_df.T
word2vec_model_1_train_df.shape, word2vec_model_1_test_df.shape

((25000, 100), (25000, 100))

In [42]:
train_df.shape, test_df.shape

((25000, 4), (25000, 4))

### Baseline models

In [43]:
base_classification_model(word2vec_model_1_train_df.iloc[:, :EMB_SIZE],
                          train_df['label'],
                          word2vec_model_1_test_df.iloc[:, :EMB_SIZE],
                          test_df['label'])

LOGISTIC REGRESSION CLASSIFIER
Accuracy: 80.78
TP: 10050
TN: 10146
FP: 2354
FN: 2450
precision_pos: 81.02
precision_neg: 80.55
recall_pos: 80.4
recall_neg: 81.17
f1_pos: 80.71
f1_neg: 80.86
RANDOM FOREST CLASSIFIER:
Accuracy: 77.46
TP: 9664
TN: 9700
FP: 2800
FN: 2836
precision_pos: 77.54
precision_neg: 77.38
recall_pos: 77.31
recall_neg: 77.6
f1_pos: 77.42
f1_neg: 77.49
XGBoost CLASSIFIER
Accuracy: 79.25
TP: 9902
TN: 9910
FP: 2590
FN: 2598
precision_pos: 79.27
precision_neg: 79.23
recall_pos: 79.22
recall_neg: 79.28
f1_pos: 79.24
f1_neg: 79.25
LightGBM CLASSIFIER
[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscor

# Word2Vec Skipgram Embeddings based classification
Using average of embeddings for all words as a feature

### Making Features

In [44]:
def calc_avg_emb_feature(review, model):
    emb_feature = np.zeros(EMB_SIZE)
    num_words = len(review.split(' '))

    for word in review.split(' '):
        try:
            emb_feature = emb_feature + model.wv[word]
        except KeyError:
            emb_feature = emb_feature + np.zeros(EMB_SIZE)

    emb_feature = emb_feature / num_words

    return list(emb_feature)

In [45]:
word2vec_model_2_train_dict = {}
word2vec_model_2_test_dict = {}

model = Word2Vec.load(DRIVE_PATH + DATA_DIR + "word2vec_Skipgram.model")

for i in tqdm(range(train_df.shape[0])):
    word2vec_model_2_train_dict[str(i)] = calc_avg_emb_feature(train_df['text_preprocessed_1'][i], model)
    word2vec_model_2_test_dict[str(i)] = calc_avg_emb_feature(test_df['text_preprocessed_1'][i], model)

100%|██████████| 25000/25000 [00:25<00:00, 995.99it/s] 


In [46]:
word2vec_model_2_train_df = pd.DataFrame(word2vec_model_2_train_dict)
word2vec_model_2_test_df = pd.DataFrame(word2vec_model_2_test_dict)

In [47]:
word2vec_model_2_train_df.shape, word2vec_model_2_test_df.shape

((100, 25000), (100, 25000))

In [48]:
word2vec_model_2_train_df = word2vec_model_2_train_df.T
word2vec_model_2_test_df = word2vec_model_2_test_df.T
word2vec_model_2_train_df.shape, word2vec_model_2_test_df.shape

((25000, 100), (25000, 100))

In [49]:
train_df.shape, test_df.shape

((25000, 4), (25000, 4))

### Baseline models

In [50]:
base_classification_model(word2vec_model_2_train_df.iloc[:, :EMB_SIZE],
                          train_df['label'],
                          word2vec_model_2_test_df.iloc[:, :EMB_SIZE],
                          test_df['label'])

LOGISTIC REGRESSION CLASSIFIER
Accuracy: 83.29
TP: 10460
TN: 10362
FP: 2138
FN: 2040
precision_pos: 83.03
precision_neg: 83.55
recall_pos: 83.68
recall_neg: 82.9
f1_pos: 83.35
f1_neg: 83.22
RANDOM FOREST CLASSIFIER:
Accuracy: 79.77
TP: 9937
TN: 10006
FP: 2494
FN: 2563
precision_pos: 79.94
precision_neg: 79.61
recall_pos: 79.5
recall_neg: 80.05
f1_pos: 79.72
f1_neg: 79.83
XGBoost CLASSIFIER
Accuracy: 82.17
TP: 10341
TN: 10201
FP: 2299
FN: 2159
precision_pos: 81.81
precision_neg: 82.53
recall_pos: 82.73
recall_neg: 81.61
f1_pos: 82.27
f1_neg: 82.07
LightGBM CLASSIFIER
[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030328 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits