In [4]:
!pip install nltk --q
!pip install kaggle --q

In [5]:
import os
import sys
import json
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
# from wordcloud import WordCloud, STOPWORDS

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords

import re, string, unicodedata
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.pipeline import make_pipeline, Pipeline

from string import punctuation
from prettytable import PrettyTable #used to create and display ASCII tables in a readable format

import warnings
warnings.filterwarnings('ignore')

import scipy as sp


In [6]:

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pragya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pragya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pragya/nltk_data...


True

In [11]:
kaggle_credential = json.load(open('kaggle.json'))

os.environ['KAGGLE_USERNAME'] = kaggle_credential['username']
os.environ['KAGGLE_KEY'] = kaggle_credential['key']


In [12]:
!kaggle datasets download -d crisbam/imdb-dataset-of-65k-movie-reviews-and-translation


Downloading imdb-dataset-of-65k-movie-reviews-and-translation.zip to /Users/pragya/Desktop/LEARN/NLP Projects/SentimentAnalysis
100%|████████████████████████████████████████| 147M/147M [00:26<00:00, 6.14MB/s]
100%|████████████████████████████████████████| 147M/147M [00:26<00:00, 5.88MB/s]


In [13]:
!ls

SA_ModelSelection.ipynb
imdb-dataset-of-65k-movie-reviews-and-translation.zip
kaggle.json


In [14]:
with ZipFile('imdb-dataset-of-65k-movie-reviews-and-translation.zip', 'r') as zipref:
  zipref.extractall()

In [15]:
df = pd.read_csv('IMDB-Dataset.csv')


## Data preprocessing


In [16]:
stop_words = stopwords.words('english')
stop_words.remove("not")

new_stopwords = ["might", "could", "one", "film", "movie", "would", "shall"]
stop_words.extend(new_stopwords)

stop_words = set(stop_words)

In [17]:
contraction_mapping = {
    "won't": "would not",
    "can't": "can not",
    "don't": "do not",
    "shouldn't": "should not",
    "needn't": "need not",
    "hasn't": "has not",
    "haven't": "have not",
    "weren't": "were not",
    "mightn't": "might not",
    "didn't": "did not"
}

def preprocessing_text(text):

  for contraction, expanded_form in contraction_mapping.items():
    text = re.sub(r"\b" + re.escape(contraction) + r"\b", expanded_form, text)

    text = text.lower()
    text = re.sub('<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = re.sub('https\S\s',  ' ', text)

    text = nltk.word_tokenize(text)

    text = [word for word in text if word not in stop_words]

    lmtr = WordNetLemmatizer()

    text = [lmtr.lemmatize(word) for word in text]

    return " ".join(text)


In [18]:
df['Clean_Review'] = df['Reviews'].apply(preprocessing_text)


## Feature Engineering


In [19]:
df['Label'] = df['Ratings'].apply(lambda x: '1' if x >=7 else ('0' if x <=4 else '2'))
df = df[df['Label'] < '2']


In [20]:
X_train, X_test, y_train, y_test = train_test_split(df['Clean_Review'], df['Label'], test_size = 0.2, random_state = 42, stratify = df['Label'], shuffle = True)

In [21]:
tfidfvect = TfidfVectorizer(analyzer = "word", ngram_range=(1,3), min_df = 10, max_features = 10000)

X_train_tfidf = tfidfvect.fit_transform(X_train).toarray()
X_test_tfidf  = tfidfvect.fit_transform(X_test).toarray()



#Model selection

The `average` parameter in Scikit-learn's precision_score function specifies how to average the data to compute the precision score in multiclass or multilabel classification. Understanding these averaging methods is crucial as they influence the calculation of the scores.

1. **Micro Averaging (`average='micro'`)**: Aggregates the contributions of all classes by counting the total true positives, false negatives, and false positives, and then calculates the precision from these aggregated counts. Useful for evaluating overall model performance, treating each individual prediction equally.

2. **Macro Averaging (`average='macro'`)**: Calculates the precision for each class independently and then takes the unweighted average of these precisions. Useful when you want to give equal importance to each class, regardless of their frequency.

3. **Weighted Averaging (`average='weighted'`)**: Calculates the precision for each class independently and then takes the average weighted by the number of true instances for each class. Useful for accounting for class imbalance while still providing a single average precision score.

4. **Samples Averaging (`average='samples'`)**: Calculates the precision for each instance and then averages over all instances. Useful for multilabel classification where performance needs to be evaluated on a per-sample basis.

### Logistic Regression

In [22]:
model_1 = Pipeline(
    steps=[
        ("classifier", LogisticRegression())
    ]
)

In [23]:
model_1.fit(X_train_tfidf, y_train)

In [25]:
# ovo Stands for "One-Versus-One". It means the model will build a binary classifier for every pair of classes.
# For N classes, the ovo will give N(N-1)/2 classifiers and the class with maximum score would be chosen.

f1_score_1 = f1_score(y_test, model_1.predict(X_test_tfidf), average = 'weighted')

print("Training dataset:")
print("Precision Score", precision_score(y_train, model_1.predict(X_train_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_train, model_1.predict(X_train_tfidf), average = 'weighted'))
print(" ")

print("Test dataset:")
print("Precision Score", precision_score(y_test, model_1.predict(X_test_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_test, model_1.predict(X_test_tfidf), average = 'weighted'))
print(" ")


Training dataset:
Precision Score 0.9132291666666666
f1 Score 0.9132286857713058
 
Test dataset:
Precision Score 0.6057083333333333
f1 Score 0.6048585260267842
 


### Decision Tree Classifier

In [26]:
model_2 = Pipeline(
    steps=[
        ('classifier', DecisionTreeClassifier(criterion = 'gini', max_depth = 11, min_samples_split = 2, min_samples_leaf = 1))
    ]
)

In [27]:
model_2.fit(X_train_tfidf, y_train)

In [28]:

f1_score_2 = f1_score(y_test, model_2.predict(X_test_tfidf), average = 'weighted')

print("Training dataset:")
print("Precision Score", precision_score(y_train, model_2.predict(X_train_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_train, model_2.predict(X_train_tfidf), average = 'weighted'))
print(" ")

print("Test dataset:")
print("Precision Score", precision_score(y_test, model_2.predict(X_test_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_test, model_2.predict(X_test_tfidf), average = 'weighted'))
print(" ")


Training dataset:
Precision Score 0.74753125
f1 Score 0.7440291363849421
 
Test dataset:
Precision Score 0.521875
f1 Score 0.4045031690907737
 


### Random Forest Classifier

In [29]:
model_3 = Pipeline(
    steps=[
        ('classifier', RandomForestClassifier())
    ]
)

model_3.fit(X_train_tfidf, y_train)

In [30]:

f1_score_3 = f1_score(y_test, model_3.predict(X_test_tfidf), average = 'weighted')

print("Training dataset:")
print("Precision Score", precision_score(y_train, model_3.predict(X_train_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_train, model_3.predict(X_train_tfidf), average = 'weighted'))
print(" ")

print("Test dataset:")
print("Precision Score", precision_score(y_test, model_3.redict(X_test_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_test, model_3.predict(X_test_tfidf), average = 'weighted'))
print(" ")

Training dataset:
Precision Score 0.9999270833333334
f1 Score 0.9999270833331355
 
Test dataset:
Precision Score 0.5899166666666666
f1 Score 0.5754654028458915
 


### Adaboost classifier

In [34]:
model_4 = Pipeline(
    steps=[
        ('classifier', AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=4),  # Specify base estimator
            n_estimators=100,
            learning_rate=0.8
        ))
    ]
)


model_4.fit(X_train_tfidf, y_train)

In [35]:

f1_score_4 = f1_score(y_test, model_4.predict(X_test_tfidf), average = 'weighted')

print("Training dataset:")
print("Precision Score", precision_score(y_train, model_4.predict(X_train_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_train, model_4.predict(X_train_tfidf), average = 'weighted'))
print(" ")

print("Test dataset:")
print("Precision Score", precision_score(y_test, model_4.predict(X_test_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_test, model_4.predict(X_test_tfidf), average = 'weighted'))
print(" ")

Training dataset:
Precision Score 0.896875
f1 Score 0.8968744183072659
 
Test dataset:
Precision Score 0.57225
f1 Score 0.5649923197796571
 


### Hyperparameter tuning with grid search

In [36]:
from sklearn import ensemble, metrics, model_selection

In [37]:
def hyperparamtune(classifier, param_grid, metric, verbose_value, cv):
    model = model_selection.GridSearchCV(
        estimator = classifier, 
        param_grid = param_grid,
        scoring = metric, 
        verbose = verbose_value,
        cv = cv
    )

    model.fit(X_train_tfidf, y_train)
    print(f'Best Score: {model.best_score_}')
    print('Best hyperparameter set:')

    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print(f"\t{param_name}: {best_parameters[param_name]}")
    return model, best_parameters
        

### Hyperparameter tuning of logistic regression

In [38]:
param_gd={"penalty":["l2","l1"],
         "C":[0.01,0.1,1.0,10],
         "tol":[0.0001,0.001,0.01],
         "max_iter":[100,200]}
model_5, best_param = hyperparamtune(LogisticRegression(), param_gd, "accuracy", 10, 5)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 1/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.858 total time=  30.5s
[CV 2/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 2/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.858 total time=  29.9s
[CV 3/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 3/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.860 total time=  25.3s
[CV 4/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 4/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.861 total time=  25.0s
[CV 5/5; 1/48] START C=0.01, max_iter=100, penalty=l2, tol=0.0001...............
[CV 5/5; 1/48] END C=0.01, max_iter=100, penalty=l2, tol=0.0001;, score=0.859 total time=  24.4s
[CV 1/5; 2/48] START C=0.01, max_iter=100, penal

In [39]:
f1_score_5 = f1_score(y_test, model_5.predict(X_test_tfidf), average = 'weighted')

print("Training dataset:")
print("Precision Score", precision_score(y_train, model_5.predict(X_train_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_train, model_5.predict(X_train_tfidf), average = 'weighted'))
print(" ")

print("Test dataset:")
print("Precision Score", precision_score(y_test, model_5.predict(X_test_tfidf), average = 'micro'))
print("f1 Score", f1_score(y_test, model_5.predict(X_test_tfidf), average = 'weighted'))
print(" ")

Training dataset:
Precision Score 0.9132291666666666
f1 Score 0.9132286857713058
 
Test dataset:
Precision Score 0.6057083333333333
f1 Score 0.6048585260267842
 


Settling down with the logistic regression for better training speed and accuracy