In [2]:
# Import necessary libraries 
!pip install nltk
import pandas as pd
import string
import re
import nltk
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv('IMDB Dataset.csv')  

# Map sentiment to binary
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text) # Replace all punctuations with empty strings
    tokens = text.split()                              # Cleaned text into individual words
    stop_words = set(stopwords.words('english'))       # common stopwords using nltk
    lemmatizer = WordNetLemmatizer()                   # Initialize lemmatizer
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # reduces words to their dictionary/base form ignores stopwords
    return " ".join(cleaned)   # Joins the cleaned tokens 

# Apply preprocessing and add a new column named cleaned_review
df['cleaned_review'] = df['review'].apply(preprocess)

# TF-IDF Vectorization (Term Frequency - Inverse Document Frequency) text -> numerical features
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_review']).toarray()
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

# Model training
model = LogisticRegression()  # why?- binary classification,fast and simple,performs well with TF-IDF
model.fit(X_train, y_train)   # learning 

# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Save model and vectorizer
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAMANAA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RAMANAA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.8871
Precision: 0.8777047913446677
Recall: 0.901567771383211
F1 Score: 0.8894762604013705
