In [2]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Download necessary NLTK resources 
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)  # For lemmatization

True

In [4]:
# 1. Load the IMDb dataset
try:
    df = pd.read_csv("./assets/IMDB Dataset.csv")
except FileNotFoundError:
    print("Error: IMDB Dataset.csv not found. Please place the file in the same directory or provide the correct path.")
    exit()  # Exit the script if the file is not found

In [5]:
# 2. Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE) # Remove punctuation
    text = text.lower() # Lowercasing
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")] # Lemmatization
    text = [lemmatizer.lemmatize(token, "v") for token in text] # Lemmatization verbs
    text = [word for word in text if not word in stop_words] # Remove stopwords
    text = " ".join(text)
    return text

df['review'] = df['review'].apply(clean_text)

df.head()

Unnamed: 0,review,sentiment
0,one reviewer ha mention watch 1 oz episode you...,positive
1,wonderful little production br br film techniq...,positive
2,think wa wonderful way spend time hot summer w...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stun f...,positive


In [6]:
# 3. Feature Extraction (TF-IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# Convert sentiment to numerical values (positive=1, negative=0)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
y = df['sentiment']

In [7]:
# 4. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% train, 20% test

In [8]:
# 5. Train a Naive Bayes classifier (or any other suitable classifier)
model = MultinomialNB()
model.fit(X_train, y_train)

In [9]:
# 6. Make predictions on the test set
y_pred = model.predict(X_test)

In [10]:

# 7. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

def predict_sentiment(new_review):
    cleaned_review = clean_text(new_review)
    new_review_vectorized = vectorizer.transform([cleaned_review]) # Important: Use the *same* vectorizer
    prediction = model.predict(new_review_vectorized)[0]
    return "positive" if prediction == 1 else "negative"

new_reviews = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "I found this film to be quite boring and predictable.",
    "The acting was superb, and the plot was captivating.",
    "A waste of time and money. I would not recommend it.",
]

for review in new_reviews:
    sentiment = predict_sentiment(review)
    print(f"Review: {review}\nSentiment: {sentiment}\n")

Accuracy: 0.8633
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      4961
           1       0.88      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
 [[4378  583]
 [ 784 4255]]
Review: This movie was absolutely fantastic! I loved every minute of it.
Sentiment: positive

Review: I found this film to be quite boring and predictable.
Sentiment: negative

Review: The acting was superb, and the plot was captivating.
Sentiment: positive

Review: A waste of time and money. I would not recommend it.
Sentiment: negative

