In [1]:
# 1.1 Import Libraries
import pandas as pd
import numpy as np
import re # Regular expressions for text cleaning
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # Optional: for stemming
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib # To save/load models and vectorizer (optional)

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
df=pd.read_csv("imdb.csv")
# Display basic info and first few rows
print("\nDataset Info:")
df.info()

print("\nFirst 5 rows:")
print(df.head())

# Map sentiment labels to numerical values
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print("\nSentiment value counts (1: positive, 0: negative):")
print(df['sentiment'].value_counts())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

First 5 rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Sentiment value counts (1: positive, 0: negative):
sentiment
1    25000
0    25000
Name: count, dtype: int64


In [3]:
# 2. Text Preprocessing Function
stop_words = set(stopwords.words('english'))
# ps = PorterStemmer() # Initialize stemmer if using

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers, keep only letters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Convert to lowercase
    text = text.lower()
    # Tokenize (split into words) and remove stop words
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Optional: Stemming
    # words = [ps.stem(word) for word in words]
    # Join words back into a string
    text = ' '.join(words)
    return text

# Apply the preprocessing function to the 'review' column
print("\nPreprocessing text data... (This may take a few minutes)")
# Create a new column for cleaned reviews
df['cleaned_review'] = df['review'].apply(preprocess_text)
print("Text preprocessing complete.")

# Display original vs cleaned review for one example
print("\nExample Preprocessing:")
print("Original:", df['review'][0][:200] + "...") # Show first 200 chars
print("Cleaned:", df['cleaned_review'][0][:200] + "...")


Preprocessing text data... (This may take a few minutes)
Text preprocessing complete.

Example Preprocessing:
Original: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo...
Cleaned: one reviewers mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show ...


In [4]:
# 3.1 Split Data into Training and Testing Sets
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
# stratify=y ensures the proportion of positive/negative reviews is similar in train and test sets

print(f"\nData Split:")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# 3.2 TF-IDF Vectorization
# Initialize TF-IDF Vectorizer
# max_features limits the vocabulary size to the most frequent terms, useful for large datasets
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # You can tune max_features

# Fit the vectorizer on the training data and transform the training data
print("\nFitting TF-IDF Vectorizer and transforming training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the *same* fitted vectorizer
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF transformation complete.")
print(f"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}") # (num_samples, num_features)
print(f"Shape of TF-IDF matrix (Test): {X_test_tfidf.shape}")


Data Split:
Training set size: 37500 samples
Testing set size: 12500 samples

Fitting TF-IDF Vectorizer and transforming training data...
Transforming test data...
TF-IDF transformation complete.
Shape of TF-IDF matrix (Train): (37500, 5000)
Shape of TF-IDF matrix (Test): (12500, 5000)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Step 1: Define the DNN model
dnn_model = Sequential()
dnn_model.add(Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
dnn_model.add(Dropout(0.3))
dnn_model.add(Dense(64, activation='relu'))
dnn_model.add(Dropout(0.3))
dnn_model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Step 2: Compile the model
dnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Step 3: Train the model
print("\nTraining Deep Neural Network model...")
history = dnn_model.fit(
    X_train_tfidf.toarray(), y_train,  # TF-IDF is usually sparse, convert to dense
    epochs=10,
    batch_size=512,
    validation_split=0.2,
    verbose=1
)
print("Model training complete.")

# Step 4: Evaluate on test set
loss, accuracy = dnn_model.evaluate(X_test_tfidf.toarray(), y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Deep Neural Network model...
Epoch 1/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.7096 - loss: 0.6234 - val_accuracy: 0.8781 - val_loss: 0.2976
Epoch 2/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8954 - loss: 0.2675 - val_accuracy: 0.8825 - val_loss: 0.2750
Epoch 3/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9173 - loss: 0.2140 - val_accuracy: 0.8821 - val_loss: 0.2891
Epoch 4/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9267 - loss: 0.1921 - val_accuracy: 0.8779 - val_loss: 0.3018
Epoch 5/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9399 - loss: 0.1682 - val_accuracy: 0.8756 - val_loss: 0.3149
Epoch 6/10
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9528 - loss: 0.1420 - val_accuracy: 0.8745 - val_loss: 0.

In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Predict probabilities on the test set
print("\nEvaluating DNN model on the test set...")
y_pred_prob = dnn_model.predict(X_test_tfidf.toarray())

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_prob > 0.5).astype("int32").flatten()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Format:
# [[TN, FP],
#  [FN, TP]]

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))


Evaluating DNN model on the test set...
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step

Accuracy: 0.8691

Confusion Matrix:
[[5395  855]
 [ 781 5469]]

Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.87      0.86      0.87      6250
Positive (1)       0.86      0.88      0.87      6250

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500



In [7]:
from sklearn.preprocessing import FunctionTransformer
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Sentiment mapping
sentiment_labels = {1: 'Positive', 0: 'Negative'}

# New review examples
new_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged throughout.",
    "What a waste of time. The plot was predictable and the characters were incredibly boring. I would not recommend this film.",
    "It was an okay movie, not great but not terrible either. Some good moments but overall quite average."
]

print("\n--- Testing on New Reviews ---")

# Step 1: Preprocess the new reviews
cleaned_new_reviews = [preprocess_text(review) for review in new_reviews]
print("Cleaned Reviews:", cleaned_new_reviews)

# Step 2: Transform to TF-IDF (use the same vectorizer you used for training)
new_reviews_tfidf = tfidf_vectorizer.transform(cleaned_new_reviews)
print("Shape of TF-IDF for new reviews:", new_reviews_tfidf.shape)

# Step 3: Predict using the DNN model
new_predictions_prob = dnn_model.predict(new_reviews_tfidf.toarray())
new_predictions = (new_predictions_prob > 0.5).astype("int32").flatten()

# Step 4: Print results
for review, pred, prob in zip(new_reviews, new_predictions, new_predictions_prob):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {sentiment_labels[pred]} ({pred}) with confidence: {prob[0]:.4f}")



--- Testing on New Reviews ---
Cleaned Reviews: ['movie absolutely fantastic acting superb storyline kept engaged throughout', 'waste time plot predictable characters incredibly boring would recommend film', 'okay movie great terrible either good moments overall quite average']
Shape of TF-IDF for new reviews: (3, 5000)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

Review: "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged through..."
Predicted Sentiment: Positive (1) with confidence: 1.0000

Review: "What a waste of time. The plot was predictable and the characters were incredibly boring. I would no..."
Predicted Sentiment: Negative (0) with confidence: 0.0000

Review: "It was an okay movie, not great but not terrible either. Some good moments but overall quite average..."
Predicted Sentiment: Negative (0) with confidence: 0.2362


In [9]:
extra_reviews = [
    "Honestly, I fell asleep halfway through. That should tell you enough.",
    "I've never laughed so hard in my life. 10/10 comedy!",
    "This was the best worst movie I've ever seen. So bad it's actually good.",
    "Meh. It exists. That's about the best thing I can say.",
    "Wow. Just wow. I didn't expect much, and yet I was still disappointed.",
    "What a cinematic masterpiece. A pure work of art!",
    "Two hours of my life I can never get back. Thanks a lot.",
    "It was alright, I guess. Not my favorite but watchable.",
    "Acting? What acting? Felt like a school play on a low budget.",
    "Not terrible, not great. Just average in every way."
]
# Clean and vectorize
cleaned_extra_reviews = [preprocess_text(review) for review in extra_reviews]
extra_reviews_tfidf = tfidf_vectorizer.transform(cleaned_extra_reviews)

# Predict using the DNN model
extra_predictions_prob = dnn_model.predict(extra_reviews_tfidf.toarray())
extra_predictions = (extra_predictions_prob > 0.5).astype("int32").flatten()

# Output predictions
print("\n--- Testing on Extra Reviews ---")
for review, pred, prob in zip(extra_reviews, extra_predictions, extra_predictions_prob):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {'😊 Positive' if pred == 1 else '😠 Negative'} ({prob[0]:.4f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

--- Testing on Extra Reviews ---

Review: "Honestly, I fell asleep halfway through. That should tell you enough...."
Predicted Sentiment: 😠 Negative (0.0004)

Review: "I've never laughed so hard in my life. 10/10 comedy!..."
Predicted Sentiment: 😊 Positive (0.9043)

Review: "This was the best worst movie I've ever seen. So bad it's actually good...."
Predicted Sentiment: 😠 Negative (0.0021)

Review: "Meh. It exists. That's about the best thing I can say...."
Predicted Sentiment: 😊 Positive (0.9918)

Review: "Wow. Just wow. I didn't expect much, and yet I was still disappointed...."
Predicted Sentiment: 😠 Negative (0.0071)

Review: "What a cinematic masterpiece. A pure work of art!..."
Predicted Sentiment: 😊 Positive (0.9751)

Review: "Two hours of my life I can never get back. Thanks a lot...."
Predicted Sentiment: 😊 Positive (0.9690)

Review: "It was alright, I guess. Not my favorite but watchable...."
Predicted 