In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data.shape

(50000, 2)

In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
# Transforming the values of Sentiment column into 0's and 1's
label_encoder = LabelEncoder()

data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

In [8]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [40]:
def text_preprocessor(text):   
    text = re.sub(r'<.*?>', '', text)
    
    return text

data['review'] = data['review'].apply(text_preprocessor)

In [10]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
vectorizer = TfidfVectorizer(max_features=2500)

X = vectorizer.fit_transform(data['review']).toarray()
y = data['sentiment']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = GaussianNB()
model.fit(x_train, y_train)

In [14]:
predictions = model.predict(x_test)

In [15]:
print('Model Accuracy: ', accuracy_score(y_test, predictions))
print('Confusion Matrix: ', confusion_matrix(y_test, predictions))
print('Classification Report:\n', classification_report(y_test, predictions))

Model Accuracy:  0.8209
Confusion Matrix:  [[4089  872]
 [ 919 4120]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82      4961
           1       0.83      0.82      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



In [16]:
model1 = RandomForestClassifier(random_state=42, class_weight='balanced')
model1.fit(x_train, y_train)

In [17]:
predictions1 = model1.predict(x_test)

In [18]:
print('Model Accuracy: ', accuracy_score(y_test, predictions1))
print('Confusion Matrix: ', confusion_matrix(y_test, predictions1))
print('Classification Report:\n', classification_report(y_test, predictions1))

Model Accuracy:  0.8483
Confusion Matrix:  [[4224  737]
 [ 780 4259]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [19]:
model2 = SVC(kernel='linear')
model2.fit(x_train, y_train)

In [20]:
predictions2 = model2.predict(x_test)

In [21]:
print('Model Accuracy: ', accuracy_score(y_test, predictions2))
print('Confusion Matrix: ', confusion_matrix(y_test, predictions2))
print('Classification Report:\n', classification_report(y_test, predictions2))

Model Accuracy:  0.888
Confusion Matrix:  [[4359  602]
 [ 518 4521]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



Thus, we can conclude that SVM Model gives the best score among the 3 classification algorithms.