In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
import tensorflow as tf

## **Uploaded Dataset**
positive=501
negative=499


In [3]:
dataset = pd.read_csv('IMDB Dataset.csv')

In [None]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
print(f'Rows: {dataset.shape[1]}\nColumns: {dataset.shape[0]}')

Rows: 2
Columns: 1000


In [None]:
dataset.shape

(1000, 2)

Rows: 1000
Columns: 2


In [None]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


## **Text Operation**

In [None]:
nlp = spacy.load('en_core_web_sm')
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [None]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    if len(mytokens) == 0:
        print("Warning: Empty document after tokenization:", sentence)
    return mytokens

# Check if the tokenizer is returning any tokens
print(tokenizer("This is a sample sentence."))

['sample', 'sentence']


## **Transformation and Vectorization**

In [None]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [None]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1))
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

## **Split the Dataset**

In [None]:
X = dataset['review']
y = dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [None]:
import tensorflow as tf

# Check for GPU availability
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
    device = '/device:GPU:0'
else:
    print("GPU not found. Using CPU instead.")
    device = '/device:CPU:0'

# Print TensorFlow version
print("TensorFlow version:", tf.__version__)

Default GPU Device: /device:GPU:0
TensorFlow version: 2.15.0


## **Logistic Regression**

In [None]:


classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)
LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,LRpred)}')
print(f'Accuracy: {accuracy_score(y_test,LRpred)*100}%')
print('Logistic Regression trained Model Saved')



Confusion Matrix:
[[84 23]
 [11 82]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.79      0.83       107
    positive       0.78      0.88      0.83        93

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200

Accuracy: 83.0%
Logistic Regression trained Model Saved


In [None]:
# Another random review
pre = LRmodel.predict(["oduction has an iPrncredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."])
print(f'Prediction: {pre[0]}')

Prediction: positive


In [None]:
# Another random review
pre = LRmodel.predict(["this movie was amazing but the chracters were acting bad"])
print(f'Prediction: {pre[0]}')

Prediction: negative


## **Random Forest**

In [None]:

RFclassifier = RandomForestClassifier(n_estimators = 200)
RFmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', RFclassifier)])

# Train the Model
RFmodel.fit(X_train,y_train)
RFpred = RFmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,RFpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,RFpred)}')
print(f'Accuracy: {accuracy_score(y_test,RFpred)*100}%')
print('RandomForest trained Model Saved')



Confusion Matrix:
[[81 26]
 [ 9 84]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.76      0.82       107
    positive       0.76      0.90      0.83        93

    accuracy                           0.82       200
   macro avg       0.83      0.83      0.82       200
weighted avg       0.84      0.82      0.82       200

Accuracy: 82.5%
RandomForest trained Model Saved


In [None]:
# Another random review
pre = RFmodel.predict(["I think this is my first review. This series is so bad I had to write one. I don't understand the good score. I have tried on 2 separate occasions to watch this show. Haven't even gotten past the 2nd episode because it is SO BORING."])
print(f'Prediction: {pre[0]}')

Prediction: negative


## **LinearSVC**

In [None]:

SVCclassifier = LinearSVC()
SVCmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', SVCclassifier)])

# Train the Model
SVCmodel.fit(X_train,y_train)
SVCpred = SVCmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,SVCpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,SVCpred)}')
print(f'Accuracy: {accuracy_score(y_test,SVCpred)*100}%')
print('LinearSVC trained Model Saved')



Confusion Matrix:
[[86 21]
 [12 81]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.80      0.84       107
    positive       0.79      0.87      0.83        93

    accuracy                           0.83       200
   macro avg       0.84      0.84      0.83       200
weighted avg       0.84      0.83      0.84       200

Accuracy: 83.5%
LinearSVC trained Model Saved


In [None]:
pre = SVCmodel.predict(["Henry cavill nailed the role perfectly. The fight scenes, the music, the cinematography, the whole atmosphere is beyond amazing. Netflix did it again"])
print(f'Prediction: {pre[0]}')

Prediction: positive


**Naive** Bayes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB  # or GaussianNB for different types of data
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Assuming you've defined predictors() and vectorizer appropriately
NaiveBayesClassifier = MultinomialNB()  # or GaussianNB()

NaiveBayesModel = Pipeline([
    ("cleaner", predictors()),
    ('vectorizer', vectorizer),
    ('classifier', NaiveBayesClassifier)
])

# Train the Model
NaiveBayesModel.fit(X_train, y_train)
NaiveBayesPred = NaiveBayesModel.predict(X_test)

print(f'Confusion Matrix:\n{confusion_matrix(y_test, NaiveBayesPred)}')
print(f'\nClassification Report:\n{classification_report(y_test, NaiveBayesPred)}')
print(f'Accuracy: {accuracy_score(y_test, NaiveBayesPred) * 100}%')
print('Naive Bayes trained Model Saved')




Confusion Matrix:
[[89 18]
 [19 74]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.83      0.83       107
    positive       0.80      0.80      0.80        93

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200

Accuracy: 81.5%
Naive Bayes trained Model Saved


In [None]:
pre = NaiveBayesModel.predict(["Henry cavill nailed the role perfectly. The fight scenes, the music, the cinematography, the whole atmosphere is beyond amazing. Netflix did it again"])
print(f'Prediction: {pre[0]}')

Prediction: positive
