In [None]:
import nltk

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the review dataset from a CSV file in the working directory.
# Alternative input file (currently disabled):
# df = pd.read_csv("imdb_dataset.csv", encoding='unicode_escape')

# NOTE(review): 'unicode_escape' decodes the bytes as Latin-1 and additionally
# interprets backslash escape sequences found in the text. Presumably it was
# picked to get past a UTF-8 decoding error -- confirm whether 'latin-1' was
# what was actually intended.
df = pd.read_csv("imdb_master.csv", encoding='unicode_escape')

In [None]:
# First rows of the loaded frame.
df.head()

In [None]:
# Seven randomly drawn rows; no random_state is set, so the rows differ per run.
df.sample(7)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics as computed by pandas' describe().
df.describe()

In [None]:
# The same summary, transposed so each described column becomes a row.
df.describe().T

In [None]:
# Distinct values present in the label column.
df['label'].unique()

In [None]:
# Row count for each label value (class balance).
df['label'].value_counts()

In [None]:
# Bar chart of how many reviews carry each label.
# The column is passed through the x=/data= keywords: handing the Series over
# positionally is deprecated in seaborn 0.11 and is interpreted as `data`
# (not `x`) from seaborn 0.12 on, which no longer yields per-class counts.
sns.countplot(x='label', data=df)

In [None]:
# Histogram of review lengths (number of characters) over the whole dataset.
df['review'].str.len().hist()

In [None]:
# Side-by-side histograms of review length (in characters), one panel per
# sentiment label. The label column still holds the raw 'pos'/'neg' strings
# at this point in the notebook.
review_chars = df['review'].str.len()
is_positive = df['label'] == 'pos'
is_negative = df['label'] == 'neg'

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
ax1.hist(review_chars[is_positive])
ax2.hist(review_chars[is_negative])
ax1.set_title('Positive Reviews')
ax2.set_title('Negative Reviews')

In [None]:
# Replace the string labels with integer codes, in place.
# `label` keeps the fitted encoder so the mapping can be inspected through
# label.classes_ afterwards.
# NOTE(review): codes follow the sorted class names; confirm via
# label.classes_ that only 'neg' and 'pos' are present, because later cells
# assume 0 = negative and 1 = positive.
label = LabelEncoder()
label.fit(df['label'])
df['label'] = label.transform(df['label'])

In [None]:
df.head()

In [None]:
x = df['review']
y = df['label']

In [None]:
# Build the cleaned training corpus: one normalised string per review.
ps = PorterStemmer()

# The stop-word list is loop-invariant, so it is turned into a set once here
# instead of once per token. A fresh environment may not have the NLTK
# stopwords corpus yet; fetch it on demand so the notebook runs from a clean
# kernel.
try:
    english_stopwords = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    english_stopwords = set(stopwords.words("english"))

# Matches every character that is NOT an ASCII letter. The brackets matter:
# without them the pattern only matches the literal text "a-zA-Z" at the
# start of a string, so nothing was ever cleaned.
non_letters = re.compile("[^a-zA-Z]")

corpus = []

# Iterate over the values directly rather than x[i], which depends on the
# Series having a default 0..n-1 index.
for raw_review in x:
    review = non_letters.sub(" ", raw_review)  # punctuation/digits -> spaces
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = " ".join(review)
    corpus.append(review)

In [None]:
corpus

In [None]:
df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 590 vocabulary terms. The vectorizer is kept
# in `cv` because it is pickled later and reused for new sentences.
cv = TfidfVectorizer(max_features=590)
X = cv.fit_transform(corpus)
# Densify the sparse result into a NumPy array for the cells that follow.
X = X.toarray()

In [None]:
# (number of reviews, number of TF-IDF features).
X.shape

In [None]:
# The dense feature matrix itself (NumPy abbreviates large arrays).
X

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2 , random_state=101)

In [None]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

Complement NB 

In [None]:
from sklearn.naive_bayes import ComplementNB
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(X_train, Y_train)

from sklearn import metrics
predictedCNB = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predictedCNB, Y_test)


In [None]:

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(Y_test, predictedCNB)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(Y_test, predictedCNB))

In [None]:
# Plot the Complement NB confusion matrix.
# The result gets its own name: assigning it to `confusion_matrix` would
# replace the sklearn function with an array, and running this cell a second
# time would then fail with "'numpy.ndarray' object is not callable".
cm_cnb = confusion_matrix(Y_test, predictedCNB)
# NOTE(review): the [False, True] tick labels assume exactly two classes,
# with False standing for label 0 and True for label 1 -- confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_cnb, display_labels=[False, True])
cm_display.plot()

 Bernoulli NB

In [None]:
from sklearn.naive_bayes import BernoulliNB

import pandas as pd
# Importing confusion_matrix here also restores the sklearn function if an
# earlier cell rebound that name to an array.
from sklearn.metrics import classification_report, confusion_matrix

# Train a Bernoulli Naive Bayes model (fit returns the estimator itself),
# predict the held-out split and record the accuracy.
BNB = BernoulliNB().fit(X_train, Y_train)
predictedBNB = BNB.predict(X_test)
accuracy_score_bnb = metrics.accuracy_score(predictedBNB, Y_test)

In [None]:
# Text report for Bernoulli NB: accuracy in percent, confusion matrix and
# the per-class classification report.
rule = '------------------------------------------------'
bnb_percent = '{:4.2f}'.format(accuracy_score_bnb*100)

print('BernoulliNB model accuracy = ' + bnb_percent + '%')
print(rule)
print('Confusion Matrix:')
bnb_matrix = pd.DataFrame(confusion_matrix(Y_test, predictedBNB))
print(bnb_matrix)
print(rule)
print('Classification Report:')
bnb_report = classification_report(Y_test, predictedBNB)
print(bnb_report)

In [None]:
# Plot the Bernoulli NB confusion matrix.
# The result gets its own name: assigning it to `confusion_matrix` would
# replace the sklearn function with an array, and running this cell a second
# time would then fail with "'numpy.ndarray' object is not callable".
cm_bnb = confusion_matrix(Y_test, predictedBNB)
# NOTE(review): the [False, True] tick labels assume exactly two classes,
# with False standing for label 0 and True for label 1 -- confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_bnb, display_labels=[False, True])
cm_display.plot()

Multinomial NB


In [None]:
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

predictedMNB = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predictedMNB, Y_test)

In [None]:

print('MultinominalNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(Y_test, predictedMNB)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(Y_test, predictedMNB))

In [None]:
# Plot the Multinomial NB confusion matrix.
# The result gets its own name: assigning it to `confusion_matrix` would
# replace the sklearn function with an array, and running this cell a second
# time would then fail with "'numpy.ndarray' object is not callable".
cm_mnb = confusion_matrix(Y_test, predictedMNB)
# NOTE(review): the [False, True] tick labels assume exactly two classes,
# with False standing for label 0 and True for label 1 -- confirm.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_mnb, display_labels=[False, True])
cm_display.plot()

Testing


In [None]:
pd.DataFrame(np.c_[Y_test, predictedCNB] , columns=["Actual" , "Predicted"])    

In [None]:
pd.DataFrame(np.c_[Y_test, predictedBNB] , columns=["Actual" , "Predicted"])    

In [None]:
pd.DataFrame(np.c_[Y_test, predictedMNB] , columns=["Actual" , "Predicted"])    

In [None]:
# Persist the fitted vectorizer and the Multinomial NB model.
# Context managers make sure each file is flushed and closed before it is
# read back; the bare open(...) calls never closed their handles.
# Note: `cv` is the TF-IDF vectorizer, despite the "count-Vectorizer" file name.
with open("count-Vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)
with open("Movies_review_Classification.pkl", "wb") as model_file:
    pickle.dump(MNB, model_file)  # 1: pos, 0:Neg

In [None]:
# Reload the pickled vectorizer and classifier from disk.
# Only unpickle files this notebook produced itself: pickle.load can execute
# arbitrary code from an untrusted file.
with open('count-Vectorizer.pkl', 'rb') as vectorizer_file:
    save_cv = pickle.load(vectorizer_file)
# The name must match the file written when the model was pickled
# ("Movies_review_Classification.pkl", lower-case "r"); the former
# 'Movies_Review_...' spelling is not found on case-sensitive filesystems.
with open('Movies_review_Classification.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
def test_model(sentence):
    """Classify one raw review string as positive or negative.

    The text gets the lower-casing, English stop-word removal and Porter
    stemming that the training corpus received, after non-letter characters
    are replaced with spaces. It is then vectorised with ``save_cv`` and
    scored by ``model``. Feeding the raw sentence straight to the vectorizer
    would leave unstemmed words that do not line up with the stemmed
    vocabulary it learned.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        'Positive review' when the model predicts class 1, otherwise
        'Negative review'.
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    letters_only = re.sub("[^a-zA-Z]", " ", sentence).lower()
    cleaned = " ".join(
        stemmer.stem(word) for word in letters_only.split() if word not in stop_words
    )

    features = save_cv.transform([cleaned]).toarray()
    prediction = model.predict(features)[0]
    # NOTE(review): every class other than 1 is reported as negative; confirm
    # the model was trained on exactly two classes (0 = neg, 1 = pos).
    return 'Positive review' if prediction == 1 else 'Negative review'

In [None]:
# Spot check with a clearly negative sentence; prints the label returned by test_model.
sen = 'This is the worst movie, I have ever seen in my life'
res = test_model (sen)
print (res)

In [None]:
# Spot check with a clearly positive sentence.
sen = "The movie was simply great"
res = test_model (sen)
print (res)