In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score
import nltk
from nltk.corpus import stopwords
import string


download stopwords

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Qasim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing

In [3]:
import re

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    
    return " ".join(words)


dataset

In [22]:
# Load IMDb dataset
data = pd.read_csv("IMDBDataset.csv")

# Apply preprocessing
data['review'] = data['review'].apply(preprocess_text)

# Encode labels: Positive = 1, Negative = 0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

print(data.head())


                                              review  sentiment
0  one reviewers mentioned watching 1 oz episode ...          1
1  wonderful little production filming technique ...          1
2  thought wonderful way spend time hot summer we...          1
3  basically theres family little boy jake thinks...          0
4  petter matteis love time money visually stunni...          1


trainig and testing and spliting

In [23]:
# Split into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42
)

# Convert text into TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)  # limit to 5000 features
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


training models

In [24]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_vec, y_train)
y_pred_lr = log_reg.predict(X_test_vec)

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)

# Support Vector Machine
svm = LinearSVC()
svm.fit(X_train_vec, y_train)
y_pred_svm = svm.predict(X_test_vec)


models evaluation

In [25]:
def evaluate_model(name, y_true, y_pred):
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("-" * 30)

evaluate_model("Logistic Regression", y_test, y_pred_lr)
evaluate_model("Naive Bayes", y_test, y_pred_nb)
evaluate_model("SVM", y_test, y_pred_svm)


Model: Logistic Regression
Accuracy: 0.8917
F1 Score: 0.8939171319424037
------------------------------
Model: Naive Bayes
Accuracy: 0.8536
F1 Score: 0.855164226355362
------------------------------
Model: SVM
Accuracy: 0.883
F1 Score: 0.8851815505397449
------------------------------


extract samples from dataset

In [26]:
import pandas as pd

df = pd.read_csv("IMDBDataset.CSV")  
print(df.sample(n=5))


                                                  review sentiment
47478  Director Douglas Sirk once said `there's a ver...  positive
36879  I had the misfortune of wasting 10 quid buying...  negative
25615  The parallels between this film and "Captain W...  negative
46300  This movie is one of the most unintentionally ...  negative
103    No, this hilariously horrible 70's made-for-TV...  positive


input

In [27]:
def predict_sentiment(review, model):
    # Preprocess review
    review_clean = preprocess_text(review)
    review_vec = vectorizer.transform([review_clean])
    prediction = model.predict(review_vec)[0]
    return "Positive" if prediction == 1 else "Negative"


# Interactive loop
print(" Sentiment Analysis System (type 'quit' to exit)\n")
while True:
    user_review = input("Enter a movie review: ")
    if user_review.lower() == "quit":
        print("Exit...")
        break
    sentiment = predict_sentiment(user_review, log_reg)  # You can replace log_reg with svm/nb
    print("Predicted Sentiment:", sentiment, "\n")


 Sentiment Analysis System (type 'quit' to exit)

Predicted Sentiment: Negative 

Exit...
