<center>
    <h1>Movie Review Classification</h1>
</center>

## Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd
# Data visualisation
import matplotlib.pyplot as plt
# Text pre-processing
from string import punctuation
from nltk.corpus import stopwords
import spacy
# Splitting data into train and test sets
from sklearn.model_selection import train_test_split
# Classifiers (Naïve Bayes, SVM, Logistic Regression, Random Forest)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Evaluation
from sklearn.metrics import classification_report
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading data

In [None]:
# Read the IMDB review dataset from the Kaggle input directory.
reviews = pd.read_csv(
    '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)

# Row count first, then a preview of the first ten rows.
print('Number of reviews :', reviews.shape[0])
reviews.head(10)

In [None]:
# Share of each sentiment class, as a percentage of all reviews.
s = reviews['sentiment'].value_counts(normalize = True) * 100

# Derive tick labels and bar colours from the class names themselves.
# value_counts() orders classes by frequency, so hard-coding the order
# could attach the wrong label/colour to a bar.
# NOTE(review): assumes the positive class is spelled 'positive' — confirm.
tick_labels = [str(label).capitalize() for label in s.index]
bar_colors = ['green' if str(label).lower() == 'positive' else 'red' for label in s.index]

plt.figure()
bars = plt.bar(s.index, s.values, color = bar_colors, alpha = .6)
plt.xticks(s.index, tick_labels, fontsize = 15)
# Hide ticks, y-axis labels and the frame: the percentages are written on the bars.
plt.tick_params(bottom = False, top = False, left = False, right = False, labelleft = False)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
for bar in bars:
    # Format the height as a whole-number percentage instead of slicing its
    # string form, which breaks for values below 10 or equal to 100.
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() - 5, s = '{:.0f}%'.format(bar.get_height()), ha = 'center', fontsize = 15)
plt.title('Reviews polarity', fontsize = 17)
plt.show()

## Text pre-processing

In [None]:
# Load the English spaCy pipeline by its package name: the 'en' shortcut link
# was removed in spaCy 3. Fall back to the shortcut for older installations
# where only the link is available.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')
# NLTK English stopwords, stored as a set for fast membership tests.
stopwords_ = set(stopwords.words('english'))
# Running count of reviews passed to lemmatize(); drives its progress display.
keep_track = 1

def remove_stopwords(text, stopwords):
    """Return `text` without the tokens that appear in `stopwords`.

    The text is split on single spaces only, so consecutive spaces produce
    empty tokens; these are kept unless the empty string is itself listed in
    `stopwords`. Matching is exact (case- and punctuation-sensitive).

    Parameters
    ----------
    text : str
        The text to filter.
    stopwords : container of str
        Tokens to drop; anything supporting the `in` operator.

    Returns
    -------
    str
        The remaining tokens joined back with single spaces.
    """
    return ' '.join(word for word in text.split(' ') if word not in stopwords)

def lemmatize(text):
    """Lemmatize `text` with the module-level spaCy pipeline `nlp`.

    Each token is replaced by its lemma. When the lemma starts with '-'
    (presumably spaCy placeholder lemmas such as '-PRON-' — confirm against
    the installed spaCy version), the lower-cased token text is used instead.
    Any resulting item that still starts with '-' is dropped.

    Side effects: increments the global `keep_track` on every call and, on
    every 100th call, prints a progress percentage computed against a fixed
    total of 50000 reviews.

    Returns the kept items joined with single spaces.
    """
    global keep_track

    kept = []
    for tok in nlp(text):
        lemma = tok.lemma_
        word = str(tok).lower() if lemma.startswith('-') else lemma
        if not word.startswith('-'):
            kept.append(word)

    # Progress display: overwrite the same output line every 100 reviews.
    if keep_track % 100 == 0:
        progress = np.round((keep_track/50000)*100, 2)
        print(str(progress) + '%', end = '\r')
    keep_track += 1
    return ' '.join(kept)

In [None]:
print('Pre-processing ...')
# Removing line breakers ... (a literal HTML tag, so no regex is needed)
reviews['review'] = reviews['review'].str.replace('<br />', ' ', regex = False)
# Removing digits ...
# regex = True is explicit: pandas 2 treats the pattern as a literal string by
# default, which would silently leave every digit in place.
reviews['review'] = reviews['review'].str.replace(r'\d+', ' ', regex = True)
# Lower casing ...
reviews['review'] = reviews['review'].str.lower()
print('Done! ')

print('Removing stopwords ...')
reviews['review'] = reviews['review'].apply(lambda review: remove_stopwords(review, stopwords_))
print('Done! ')

print('Lemmatizing ...')
# Reset the progress counter used by lemmatize() so that re-running this cell
# does not report more than 100%.
keep_track = 1
reviews['review'] = reviews['review'].apply(lemmatize)
print('Done! ')

print('Removing punctuation ...')
# Map every punctuation character to a space with a translation table.
# Building a regex character class from `punctuation` is fragile: its
# backslash escapes the following ']' and is therefore never removed itself.
reviews['review'] = reviews['review'].str.translate(str.maketrans(punctuation, ' ' * len(punctuation)))
# Squeezing white spaces ...
reviews['review'] = reviews['review'].str.replace(r'\s+', ' ', regex = True)
print('Done! ')
reviews['review'].head(10)

## Splitting data for training

In [None]:
# Hold out 10% of the reviews for testing. The seed is fixed so that the split,
# the reported scores and the review indices inspected later stay the same
# across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    reviews['review'],
    reviews['sentiment'],
    train_size = .9,
    random_state = 42,
)

print('Training dataset : {} reviews'.format(X_train.shape[0]))
print('Testing dataset : {} reviews'.format(X_test.shape[0]))

## Vectorizing reviews for training

Having a large vocabulary usually makes ML models overfit the training data. Thus, we will only be using terms that appear in at least 1% of our documents (reviews).

Note: I tried several values of `min_df`, and 1% seems to give the best balance between overfitting and underfitting.

In [None]:
# TF-IDF features over unigrams and bigrams, with a minimum document
# frequency of 1%. The vocabulary is learnt from the training split only.
vect = TfidfVectorizer(
    min_df = .01,
    ngram_range = (1, 2),
)
X_train_vect = vect.fit_transform(X_train)

## Training Machine learning model

### Multinomial Naïve Bayes

In [None]:
# Fit Multinomial Naïve Bayes on the TF-IDF features of the training split.
model = MultinomialNB(alpha = 0.001)
model.fit(X_train_vect, y_train)

# Predict on both splits so training and test scores can be compared.
y_pred_train = model.predict(X_train_vect)
y_pred_test = model.predict(vect.transform(X_test))

print('Training data', classification_report(y_train, y_pred_train), sep = '\n')
print('Test data', classification_report(y_test, y_pred_test), sep = '\n')

### Logistic Regression

In [None]:
# Fit Logistic Regression (default settings) on the TF-IDF training features.
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Predict on both splits so training and test scores can be compared.
y_pred_train = model.predict(X_train_vect)
y_pred_test = model.predict(vect.transform(X_test))

print('Training data', classification_report(y_train, y_pred_train), sep = '\n')
print('Test data', classification_report(y_test, y_pred_test), sep = '\n')

### Random Forest

In [None]:
# Random Forest training is randomised; fix the seed so the fitted model and
# the reports below are reproducible across runs.
model = RandomForestClassifier(random_state = 42).fit(X_train_vect, y_train)
y_pred_test = model.predict(vect.transform(X_test))
y_pred_train = model.predict(X_train_vect)

print('Training data')
print(classification_report(y_train, y_pred_train))
print('Test data')
print(classification_report(y_test, y_pred_test))

Random Forest seems to be overfitting.

Logistic Regression gives the best accuracy (89%), which is still low.

## Going through wrong predictions (Logistic Regression)

In [None]:
# Re-fit Logistic Regression: `model` was last assigned the Random Forest.
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Test-set predictions, used in the next cell to find misclassified reviews.
y_pred_test = model.predict(vect.transform(X_test))

In [None]:
# Put each test review next to its predicted and actual label. The frame keeps
# the index of X_test; the predictions (an array) are attached positionally.
df = (
    X_test.to_frame(name = 'reviews')
    .assign(**{'predicted sentiment': y_pred_test, 'actual sentiment': y_test})
)
# Keep only the reviews the model got wrong.
df = df[df['actual sentiment'].ne(df['predicted sentiment'])]

df

Looking at review 33066 which was labeled positive and predicted negative

In [None]:
# Pre-processed text of the review with index label 33066.
reviews.loc[33066, 'review']

One of the most common issues encountered when building ML or DL models is mislabeled data, and the review above is a clear example: it expresses negative feedback, with terms such as "weak script", "slow pace", "exhaustingly long", ...

This doesn't change the fact that some reviews are genuinely misclassified by the model (example below).

In [None]:
# Pre-processed text of the review with index label 7428.
reviews.loc[7428, 'review']