In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


#  Movie Review Sentiment Analysis using Bag-of-Words & Multinomial Naive Bayes & Random Forest

## Introduction

This notebook demonstrates **sentiment analysis on movie reviews**.  
We will classify reviews as **positive or negative** using classical machine learning techniques.  

Key highlights:
- **Dataset:** Movie review dataset (IMDB / Kaggle dataset)
- **Techniques:** Text preprocessing, Bag-of-Words with unigrams & bigrams
- **Model:** Multinomial Naive Bayes and Radom Forest Classifier
- **Goal:** Predict sentiment with good accuracy and showcase classical NLP workflow


In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

## Dataset Overview

Let's take a look at the dataset:

- Number of reviews
- Number of positive and negative labels
- Sample reviews


In [3]:
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.value_counts("sentiment")

sentiment
negative    25000
positive    25000
Name: count, dtype: int64

In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(418)

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated().sum()

np.int64(0)

In [10]:
df.shape

(49582, 2)

In [11]:
df.value_counts("sentiment")

sentiment
positive    24884
negative    24698
Name: count, dtype: int64

## Data Preprocessing

-  lowercasing reviews
-  removing unnecessary punctuations and html tags
-  Convert tokenized reviews back to string if necessary

We use Bag-of-Words, which requires text in string format.


In [12]:
import re
def remove_html_tags(text):
    abc = re.compile('<.*?>')
    return abc.sub('',text)

In [13]:
df["review"] = df["review"].apply(remove_html_tags)

In [14]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
df["review"] = df["review"].str.lower()

In [16]:
df.sample()

Unnamed: 0,review,sentiment
47776,"there are bad movies, movies that are horrible...",negative


In [17]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
exclude = '"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~'
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))
        

In [19]:
df["review"] = df["review"].apply(remove_punc)

In [20]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [21]:
import re

def normalize_punctuation(text):
    text = re.sub(r'!{2,}', '!', text)
    text = re.sub(r'\?{2,}', '?', text)
    return text
df["review"] = df["review"].apply(normalize_punctuation)

In [22]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


## Tokenization

**Tokenization** is the process of splitting text into smaller pieces called **tokens** (usually words).  
It allows the model to process and understand text at the word level rather than the entire sentence.

For example:
- Input: "The movie was amazing and fun to watch."
- Tokens: ["The", "movie", "was", "amazing", "and", "fun", "to", "watch"]

Tokenization is a crucial step in NLP pipelines, especially before feature extraction with Bag-of-Words or TF-IDF.


In [23]:
from nltk.tokenize import word_tokenize
df["review"] = df["review"].apply(
    lambda x : word_tokenize(x)
)

In [24]:
df["review"]

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, theres, a, family, where, a, littl...
4        [petter, matteis, love, in, the, time, of, mon...
                               ...                        
49995    [i, thought, this, movie, did, a, down, right,...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, am, a, catholic, taught, in, parochial, el...
49998    [im, going, to, have, to, disagree, with, the,...
49999    [no, one, expects, the, star, trek, movies, to...
Name: review, Length: 49582, dtype: object

In [25]:
df=df.reset_index(drop=True)

## Stopword Removal

**Stopwords** are common words that usually do not carry significant meaning in text analysis, such as:
- "is", "the", "and", "of", "to"

Removing stopwords helps:
- Reduce the size of the vocabulary
- Improve model efficiency
- Focus on meaningful words that contribute to sentiment

In movie reviews, words like "the", "was", or "and" are typically not useful for predicting sentiment.


In [26]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df["review"] = df["review"].apply(
    lambda x : [word for word in x if word not in stop_words]    
)

In [27]:
df["review"]

0        [one, reviewers, mentioned, watching, 1, oz, e...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, theres, family, little, boy, jake,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49577    [thought, movie, right, good, job, wasnt, crea...
49578    [bad, plot, bad, dialogue, bad, acting, idioti...
49579    [catholic, taught, parochial, elementary, scho...
49580    [im, going, disagree, previous, comment, side,...
49581    [one, expects, star, trek, movies, high, art, ...
Name: review, Length: 49582, dtype: object

## Lemmatization

**Lemmatization** is the process of reducing a word to its **base or root form** (lemma).  
It helps reduce variations of the same word, making the model more efficient.

For example:
- "running" → "run"
- "better" → "good"
- "actors" → "actor"

By lemmatizing text, we normalize words so the model treats similar words consistently, improving accuracy and reducing vocabulary size.


In [28]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df["review"] = df["review"].apply(
    lambda x : [lemmatizer.lemmatize(word) for word in x]
)

In [29]:
df["review"]

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there, family, little, boy, jake, ...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49577    [thought, movie, right, good, job, wasnt, crea...
49578    [bad, plot, bad, dialogue, bad, acting, idioti...
49579    [catholic, taught, parochial, elementary, scho...
49580    [im, going, disagree, previous, comment, side,...
49581    [one, expects, star, trek, movie, high, art, f...
Name: review, Length: 49582, dtype: object

In [30]:
df

Unnamed: 0,review,sentiment
0,"[one, reviewer, mentioned, watching, 1, oz, ep...",positive
1,"[wonderful, little, production, filming, techn...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, there, family, little, boy, jake, ...",negative
4,"[petter, matteis, love, time, money, visually,...",positive
...,...,...
49577,"[thought, movie, right, good, job, wasnt, crea...",positive
49578,"[bad, plot, bad, dialogue, bad, acting, idioti...",negative
49579,"[catholic, taught, parochial, elementary, scho...",negative
49580,"[im, going, disagree, previous, comment, side,...",negative


In [31]:
x = df.iloc[:,0:1]
y = df["sentiment"]

In [32]:
x

Unnamed: 0,review
0,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,"[wonderful, little, production, filming, techn..."
2,"[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, there, family, little, boy, jake, ..."
4,"[petter, matteis, love, time, money, visually,..."
...,...
49577,"[thought, movie, right, good, job, wasnt, crea..."
49578,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49579,"[catholic, taught, parochial, elementary, scho..."
49580,"[im, going, disagree, previous, comment, side,..."


## Label Encoding

Machine learning models work with **numerical data**, so categorical labels like `"positive"` and `"negative"` must be converted into numbers.

**Label Encoding** is the process of converting categorical labels into numerical form:
- `"positive"` → 1
- `"negative"` → 0

This allows models like **Multinomial Naive Bayes** or **Random Forest** to process the target variable correctly.

Using label encoding ensures:
- Models can interpret the target variable
- The workflow remains consistent and efficient


In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [34]:
y

array([1, 1, 1, ..., 0, 0, 0])

## Train-Test Split

Before training a machine learning model, it is important to **split the dataset** into:

- **Training set**: used to train the model  
- **Test set**: used to evaluate the model's performance on unseen data  

This ensures that the model **generalizes well** and does not just memorize the training data.

We typically use an **80:20 or 70:30 split**, meaning:
- 80% of data for training
- 20% of data for testing


In [35]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [36]:
x_train.shape

(39665, 1)

## Bag-of-Words (BoW)

Bag-of-Words converts text into numerical features by counting word occurrences.  
- Each review becomes a vector of word counts.  
- Ignores word order, only considers presence/frequency.  

In this project, we use **CountVectorizer** with:
- **Unigrams and Bigrams** (`ngram_range=(1,2)`)  
- Top **5000 features** (`max_features=5000`)  



In [37]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    lowercase=False
)

In [38]:
X_train_bow = cv.fit_transform(x_train["review"])
X_test_bow = cv.transform(x_test["review"])



In [39]:
X_train_bow.shape

(39665, 179987)

## Multinomial Naive Bayes (MNB)

Multinomial Naive Bayes is a **probabilistic classifier** that works well with **text data represented as word counts (Bag-of-Words)**.  

- Assumes features (word counts) are **conditionally independent** given the class.  
- Efficient and fast for large text datasets.  
- Ideal for **sentiment analysis**, spam detection, and review classification.  

In this project, MNB is used to classify movie reviews as **positive (1)** or **negative (0)**.


In [40]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_bow , y_train)

In [41]:
y_pred = mnb.predict(X_test_bow)


In [42]:
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.8594332963597863

In [43]:
confusion_matrix(y_test,y_pred)

array([[4403,  630],
       [ 764, 4120]])

## Random Forest Classifier

Random Forest is an **ensemble learning method** that combines multiple decision trees to make predictions.  

- Each tree votes on the class, and the majority vote is chosen.  
- Can handle complex relationships in data, but **less efficient with sparse text features**.  
- Often used alongside Bag-of-Words or TF-IDF, but usually performs **slightly worse than Naive Bayes** on text data.  

In this project, Random Forest is used as a comparison to **Multinomial Naive Bayes** for movie review sentiment classification.


In [44]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
cvr = CountVectorizer(
    max_features = 3000,
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    lowercase=False
)

In [45]:
X_train_bow_rf = cvr.fit_transform(x_train["review"])
X_test_bow_rf = cvr.transform(x_test["review"])



In [46]:
rf.fit(X_train_bow_rf,y_train)

In [47]:
y_pred_rf = rf.predict(X_test_bow_rf)

In [48]:
accuracy_score(y_test,y_pred_rf)

0.841181809014823

In [49]:
cvb = CountVectorizer(ngram_range=(1,2),max_features=5000,tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    lowercase=False)
X_train_bow_bg = cvb.fit_transform(x_train["review"])
X_test_bow_bg = cvb.transform(x_test["review"])
rf.fit(X_train_bow_bg,y_train)
y_pred_bg = rf.predict(X_test_bow_bg)
accuracy_score(y_test,y_pred_bg)



0.845921145507714

In [50]:
mnb.fit(X_train_bow_bg,y_train)
y_pred_bg_mnb = mnb.predict(X_test_bow_bg)

In [51]:
accuracy_score(y_test,y_pred_bg_mnb)

0.8503579711606333