In [1]:
import numpy as np
import pandas as pd

import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import pickle


In [2]:
from sklearn.pipeline import Pipeline

In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeevan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv("IMDB Dataset.csv")
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [6]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                      # lowercase
    text = re.sub(r'[^a-z\s]', '', text)     # remove punctuation & numbers
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)


In [7]:
df['clean_review'] = df['review'].apply(clean_text)
df.head()


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter matteis love time money visually stunni...


In [8]:
X = df['clean_review']
y = df['sentiment']


In [9]:
# X, y

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [11]:
# vectorizer = TfidfVectorizer()
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)


In [12]:
# print(hasattr(vectorizer, "idf_"))
# 

In [13]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000))
])

In [14]:
# model = LogisticRegression()
pipeline.fit(X_train, y_train)


In [15]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8964
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [16]:
# Save ONE object
with open("sentiment_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

In [17]:
with open("sentiment_pipeline.pkl", "rb") as f:
    pipe = pickle.load(f)

pipe.predict(["I absolutely loved this movie"])


array([1], dtype=int64)