## Import Dependencies

In [28]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib 

## Import Dataset 

In [51]:
# Load the IMDB review dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('dataset/IMDB Dataset.csv')

In [52]:
# Display the loaded frame (the notebook renders the cell's bare last expression).
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [53]:
# Show six randomly chosen rows; no random_state is passed, so the rows differ on each run.
df.sample(6)

Unnamed: 0,review,sentiment
27579,This Wrestlemania just didn't do it for me. Wh...,positive
39081,I especially liked the ending of this movie--I...,positive
44850,I knew as soon as I saw the first trailer for ...,positive
35760,From the opening scenes of FIERCE PEOPLE (an i...,positive
42014,"When I first watched this, we borrowed it from...",positive
13854,I've been wanting to see this movie for a very...,positive


In [54]:
# Summarise column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [55]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# Guarded so that re-running this cell is a no-op: calling .map() again on the
# already-encoded column would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# .map() yields NaN for any label missing from the mapping; fail loudly instead
# of silently passing NaN targets on to the model.
assert df['sentiment'].notna().all(), "unexpected or missing sentiment label(s)"

print(df['sentiment'].value_counts())

sentiment
1    25000
0    25000
Name: count, dtype: int64


##### The dataset is balanced: 25,000 positive and 25,000 negative reviews.

## Processing the text of the reviews

In [56]:
# Build the English stop-word set used by process().
# The NLTK stop-word corpus is not bundled with the package; fetch it on demand
# so the notebook survives Restart & Run All in a fresh environment.
try:
    swords = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    swords = set(stopwords.words('english'))

In [57]:
def process(review):
    """Clean one raw review and return it as a space-separated token string.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split on whitespace, and drop tokens found in the module-level
    ``swords`` stop-word set.

    Args:
        review: Raw review text; may contain HTML such as ``<br />``.

    Returns:
        The remaining lower-case tokens joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and emits GuessedAtParserWarning), so the
    # extracted text could vary between environments.
    text = BeautifulSoup(review, "html.parser").get_text()
    # Anything other than an ASCII letter (digits, punctuation, accented
    # characters) becomes a token separator.
    text = re.sub("[^a-zA-Z]", ' ', text)
    tokens = text.lower().split()
    return " ".join(w for w in tokens if w not in swords)

Lower-casing merges case variants into one token: "This" and "this" ==> "this" (which is then dropped as a stop word).

In [58]:
# Clean every review element-wise and keep the result next to the raw text.
df['processed_review'] = df['review'].map(process)

In [59]:
# Raw text of the review labelled 100, for comparison with its cleaned form.
df.loc[100, 'review']

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

In [60]:
# Cleaned text of the same review (label 100).
df.loc[100, 'processed_review']


'short film inspired soon full length feature spatula madness hilarious piece contends similar cartoons yielding multiple writers short film stars edward spatula fired job joins fight evil spoons premise allows funny content near beginning barely present remainder feature film minute running time absorbed odd ball comedy small musical number unfortunately much else lies plot set really time show surely follows plot better many high budget hollywood films film worth watching least times take expect deep story'

## Feature Engineering: TF-IDF Vectorization

In [61]:
# Model input: the cleaned review text.
X=df['processed_review']

In [62]:
# Target: the sentiment column.
y=df['sentiment']

In [63]:
# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [64]:
# Report the size of each split.
# NOTE(review): the second label says "Testing", but the count is for the
# hold-out split that the code names X_valid.
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_valid)} samples")

Training set size: 40000 samples
Testing set size: 10000 samples


In [65]:
# TF-IDF vectorizer; max_features=5000 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [66]:
# Learn the vocabulary and IDF weights from the training split only, then
# reuse the fitted vectorizer on the hold-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Each shape is (num_samples, num_features).
print(
    "transformation complete",
    f"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}",
    f"Shape of TF-IDF matrix (Test): {X_valid_tfidf.shape}",
    sep="\n",
)

transformation complete
Shape of TF-IDF matrix (Train): (40000, 5000)
Shape of TF-IDF matrix (Test): (10000, 5000)


Transforming test data...
TF-IDF transformation complete.
Shape of TF-IDF matrix (Train): (40000, 5000)
Shape of TF-IDF matrix (Test): (10000, 5000)


## Training the Model

In [67]:
# Logistic regression classifier using the liblinear solver, with a fixed
# random_state so repeated runs are reproducible.
lreg = LogisticRegression(C=1.0,max_iter=1000,random_state=42,solver='liblinear') 

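##### liblinear is a good choice for binary classification on small-to-medium datasets; for very large datasets scikit-learn recommends the 'sag' or 'saga' solvers instead.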

In [68]:
# Fit the classifier on the training TF-IDF features and labels only.
lreg.fit(X_train_tfidf,y_train)


In [69]:
# Optional: persist the fitted classifier and the fitted vectorizer to the
# current working directory. Both are needed to score new text later; the
# process() cleaning function is not serialized here.
joblib.dump(lreg, 'logistic_regression_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
print("Model and Vectorizer saved.")

Model and Vectorizer saved.


## Evaluate the Model

In [70]:
print("\nEvaluating model on the test set...")
# Predict labels for the hold-out split.
y_pred = lreg.predict(X_valid_tfidf)

accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f"\nAccuracy:{accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
print('Confusion Matrix:', confusion_matrix(y_true=y_valid, y_pred=y_pred), sep='\n')



Evaluating model on the test set...

Accuracy:0.8939
Confusion Matrix:
[[4434  566]
 [ 495 4505]]


In [71]:
# Per-class precision / recall / F1 on the hold-out split.
print(
    "\nClassification Report:",
    classification_report(
        y_true=y_valid,
        y_pred=y_pred,
        target_names=['Negative (0)', 'Positive (1)'],
    ),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.90      0.89      0.89      5000
Positive (1)       0.89      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



## Testing on examples

In [72]:
# Hand-written reviews (clearly positive, clearly negative, mixed) used to
# sanity-check the full pipeline end to end.
custom_reviews_3 = [
    "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged throughout.",
    "What a waste of time. The plot was predictable and the characters were incredibly boring. I would not recommend this film.",
    "It was an okay movie, not great but not terrible either. Some good moments but overall quite average."
]

print("\n--- Testing on New Reviews ---")

# Apply the same cleaning that was applied to the training data
cleaned_new_reviews = [process(review) for review in custom_reviews_3]
print("Cleaned Reviews:", cleaned_new_reviews)

# Transform (never re-fit) with the already-fitted TF-IDF vectorizer
new_reviews_tfidf = tfidf_vectorizer.transform(cleaned_new_reviews)
print("Shape of TF-IDF for new reviews:", new_reviews_tfidf.shape)

# Predict sentiment
new_predictions = lreg.predict(new_reviews_tfidf)
sentiment_labels = {1: 'Positive', 0: 'Negative'}

# Print results
for review, prediction in zip(custom_reviews_3, new_predictions):
    # Only mark the preview as truncated when text was actually cut off.
    preview = review if len(review) <= 100 else review[:100] + "..."
    print(f"\nReview: \"{preview}\"")
    print(f"Predicted Sentiment: {sentiment_labels[prediction]} ({prediction})")


--- Testing on New Reviews ---
Cleaned Reviews: ['movie absolutely fantastic acting superb storyline kept engaged throughout', 'waste time plot predictable characters incredibly boring would recommend film', 'okay movie great terrible either good moments overall quite average']
Shape of TF-IDF for new reviews: (3, 5000)

Review: "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged through..."
Predicted Sentiment: Positive (1)

Review: "What a waste of time. The plot was predictable and the characters were incredibly boring. I would no..."
Predicted Sentiment: Negative (0)

Review: "It was an okay movie, not great but not terrible either. Some good moments but overall quite average..."
Predicted Sentiment: Negative (0)
