# Project 3 – IMDB MOVIE REVIEW

In [105]:
#Import Libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import re
wnl = nltk.wordnet.WordNetLemmatizer()

In [49]:
df = pd.read_excel("IMDB_dataset.xlsx")
df.head(5)

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
3,"This show was an amazing, fresh & innovative i...",negative
4,Encouraged by the positive comments about this...,negative


In [50]:
df['sentiment'] = df['sentiment'].replace('positive',1)

In [51]:
df['sentiment'] = df['sentiment'].replace('negative',0)

In [52]:
df.head(5)

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,1
1,"Probably my all-time favorite movie, a story o...",1
2,I sure would like to see a resurrection of a u...,1
3,"This show was an amazing, fresh & innovative i...",0
4,Encouraged by the positive comments about this...,0


In [53]:
df.shape

(2999, 2)

In [54]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

# Preprocessing Text Data

In [115]:
stopwords = nltk.corpus.stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer
ps = nltk.PorterStemmer()

# Perform TFIDF Vectorization

In [None]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['review'])

Splitting the data

In [116]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment'], test_size=0.20, random_state = 0)

# Exploring parameter settings using GridSearchCV on Random Forest

In [120]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [121]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [10,50,100],
    'max_depth': [2, 16, 32]
}

cv = GridSearchCV(rf, parameters, cv=2)
cv.fit(X_train, y_train)
print_results(cv)

BEST PARAMS: {'max_depth': 16, 'n_estimators': 100}

0.559 (+/-0.056) for {'max_depth': 2, 'n_estimators': 10}
0.671 (+/-0.005) for {'max_depth': 2, 'n_estimators': 50}
0.667 (+/-0.068) for {'max_depth': 2, 'n_estimators': 100}
0.689 (+/-0.007) for {'max_depth': 16, 'n_estimators': 10}
0.77 (+/-0.034) for {'max_depth': 16, 'n_estimators': 50}
0.795 (+/-0.027) for {'max_depth': 16, 'n_estimators': 100}
0.699 (+/-0.007) for {'max_depth': 32, 'n_estimators': 10}
0.781 (+/-0.011) for {'max_depth': 32, 'n_estimators': 50}
0.793 (+/-0.061) for {'max_depth': 32, 'n_estimators': 100}


In [124]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 100, max_depth=16, criterion='gini')

In [126]:
rfc1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [127]:
pred=rfc1.predict(X_test)

In [128]:
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))

Accuracy for Random Forest on CV data:  0.8316666666666667


In [131]:
import joblib
joblib.dump(cv.best_estimator_, 'E:/AI and ML/Archit - AI for Desicion Making/3/RF_model.pkl')

['E:/AI and ML/Archit - AI for Desicion Making/3/RF_model.pkl']

# Exploring parameter settings using GridSearchCV on Gradient Boosting Classifier

In [132]:
from sklearn.ensemble import GradientBoostingClassifier

In [138]:
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [50,80],
    'max_depth': [7,10],
    'learning_rate': [1,10]
}

cv = GridSearchCV(gb, parameters, cv=2)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'learning_rate': 1, 'max_depth': 10, 'n_estimators': 80}

0.731 (+/-0.001) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 50}
0.744 (+/-0.016) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 80}
0.743 (+/-0.044) for {'learning_rate': 1, 'max_depth': 10, 'n_estimators': 50}
0.752 (+/-0.009) for {'learning_rate': 1, 'max_depth': 10, 'n_estimators': 80}
0.416 (+/-0.077) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 50}
0.442 (+/-0.147) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 80}
0.533 (+/-0.085) for {'learning_rate': 10, 'max_depth': 10, 'n_estimators': 50}
0.559 (+/-0.118) for {'learning_rate': 10, 'max_depth': 10, 'n_estimators': 80}


In [139]:
joblib.dump(cv.best_estimator_, 'E:/AI and ML/Archit - AI for Desicion Making/3/GB_model.pkl')

['E:/AI and ML/Archit - AI for Desicion Making/3/GB_model.pkl']

# Final evaluation of models

In [141]:
models = {}

for mdl in ['RF', 'GB']:
    models[mdl] = joblib.load('E:/AI and ML/Archit - AI for Desicion Making/3/{}_model.pkl'.format(mdl))

In [142]:
models

{'RF': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=16, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'GB': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=1, loss='deviance', max_depth=10,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_e

In [143]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [148]:
from time import time
for name, mdl in models.items():
    evaluate_model(name, mdl, X_test, y_test)

RF -- Accuracy: 0.823 / Precision: 0.832 / Recall: 0.821 / Latency: 79.5ms
GB -- Accuracy: 0.808 / Precision: 0.808 / Recall: 0.821 / Latency: 6.6ms


# Best performing model

Random Forest is the best performing as compare to the GB because:-
1. RF repeats the process multiple times and then make final prediction on each observation
2. Random Forest, we build a lot of fully grown trees with low bias
3. GB is slow learner and takes to much time to learn