# Step 1--> import libraries

In [28]:
#load librearies
import pandas as pd
#text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
#feature extraction and selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
# classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Step 2--> Load dataset

In [4]:
#load the dataset
df=pd.read_csv("IMDB Dataset.csv")

In [5]:
#sample of dataset
df.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


# Dataset Overview: IMDB Movie Review Dataset
working with a dataset that contains movie reviews along with their sentiment labels. 

Column Name	Description

review	This column contains the actual text of the movie review. It's usually unstructured (raw) text, and may include HTML tags, punctuation, mixed cases, etc.

sentiment	This column indicates the sentiment label for the review — typically either "positive" or "negative".

# Use the review column as input and sentiment column as output:

Text Preprocessing-->review

Feature Extraction (TF-IDF)-->	review

Feature Selection (Chi-Square)-->	review, sentiment

Classification (Naive Bayes)--> review, sentiment

Evaluation (Precision, Recall) --> Model predictions vs sentiment

In [6]:
#columns of dataset
df.columns

Index(['review', 'sentiment'], dtype='object')

In [7]:
#check for null values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
#check the shape of dataset
df.shape

(50000, 2)

In [9]:
#information of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [10]:
#description of dataset
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


# Step 3--> Preprocess the Text

In [19]:
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [14]:
# Define a preprocessing function
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    words = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)

In [15]:
# Apply the preprocessing function to the review column
df['clean_review'] = df['review'].apply(preprocess)

In [17]:
# Display the first few preprocessed reviews
print(df[['review', 'clean_review']].head(4))

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   

                                        clean_review  
0  one review mention watch 1 oz episod youll hoo...  
1  wonder littl product br br film techniqu unass...  
2  thought wonder way spend time hot summer weeke...  
3  basic there famili littl boy jake think there ...  


# Step 4-->Feature extraction using Tf-idf

In [21]:
# Convert sentiment (target) to binary labels: 0 = negative, 1 = positive
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])  # e.g., ['positive', 'negative'] → [1, 0]

In [24]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust max_features if needed
X_tfidf = tfidf.fit_transform(df['clean_review'])

In [25]:
print("Original TF-IDF shape:", X_tfidf.shape)

Original TF-IDF shape: (50000, 5000)


We have 50,000 reviews, and 5,000 unique words/features after TF-IDF.

# Step 5-->Feature section using chi-square test

In [23]:
# Chi-square Feature Selection: select top 1000 features
selector = SelectKBest(score_func=chi2, k=1000)
X_selected = selector.fit_transform(X_tfidf, y)

In [26]:
print("Selected features shape:", X_selected.shape)

Selected features shape: (50000, 1000)


Chi-square kept the top 1,000 most informative words.

# Step 6-->Classification Using Naive Bayes

Split the data into training and testing sets

Train the MultinomialNB model

Predict sentiment labels for the test set

In [29]:
# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [30]:
# Initialize and train the Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

In [31]:
# Predict on test data
y_pred = model.predict(X_test)

In [32]:
# Evaluate the model
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      4961
    positive       0.86      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



# Step 7--> Evaluate Using Precision and Recall

In [33]:
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precision (Positive class):", precision)
print("Recall (Positive class):", recall)

Precision (Positive class): 0.8633418417435696
Recall (Positive class): 0.8725937686048819


# Result Analysis-->

Precision (86.3%): Most predicted positive reviews are truly positive.

Recall (87.3%): The model correctly identifies most actual positive reviews.

Conclusion: The model performs well with balanced precision and recall, making it reliable for sentiment analysis.