In [40]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [41]:
def clean_text(text):                             
    text = re.sub(r'<.*?>', '', text)                
    text = re.sub(r'[^a-z\s]', '', text)              
    text = re.sub(r'\s+', ' ', text).strip()          
    return text

In [42]:
df = pd.read_csv("Dataset/IMDB Dataset.csv")
df["sentiment"] = df["sentiment"].map({"positive":1,"negative":0})
X = df["review"].apply(clean_text)
y = df["sentiment"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y, random_state=42) 
pipeline = Pipeline([('tfidf', TfidfVectorizer(max_features= 5000, stop_words='english')),('model', LogisticRegression(max_iter = 1000))])

In [46]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [48]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8851
Precision: 0.8764418377321603
Recall: 0.8966
F1 Score: 0.886406327236777

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

