In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
import pickle
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [8]:
dataset = pd.read_csv('/content/IMDB Dataset.csv', on_bad_lines='skip')

In [9]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
print(f'Rows: {dataset.shape[1]}\nColumns: {dataset.shape[0]}')

Rows: 2
Columns: 50000


In [11]:
print(f'Columns Names: {list(dataset.columns)}')

Columns Names: ['review', 'sentiment']


In [12]:
nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [13]:
def tokenizer(sentence):
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

In [14]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    return text.strip().lower()

In [15]:
vectorizer = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,1))
tfvectorizer = TfidfVectorizer(tokenizer = tokenizer)

In [16]:
X = dataset['review']
y = dataset['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
import re

# Define the text cleaner
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.clean_text(doc) for doc in X]

    @staticmethod
    def clean_text(text):
        text = re.sub(r'\W+', ' ', text)
        return text.lower().strip()

vectorizer = TfidfVectorizer(
    token_pattern=r'\b\w+\b',
    stop_words='english',
    lowercase=True
)
classifier = LogisticRegression()
LRmodel = Pipeline([
    ("cleaner", TextCleaner()),
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

LRmodel.fit(X_train, y_train)

LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test, LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test, LRpred)}')
print(f'Accuracy: {accuracy_score(y_test, LRpred) * 100:.2f}%')


Confusion Matrix:
[[4479  572]
 [ 424 4525]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      5051
    positive       0.89      0.91      0.90      4949

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 90.04%


In [32]:
pre = LRmodel.predict(["This movie was a complete disappointment. The plot was poorly developed, the characters were one-dimensional, and the pacing was painfully slow."])
print(f'Prediction: {pre[0]}')

Prediction: negative
