In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\91808\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


 Problem 1

 Apply all the preprocessing techniques that you think are necessary

In [4]:
#Load the dataset
import pandas as pd

# Path to your dataset file
path = r"C:\Users\91808\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1\IMDB Dataset.csv"

# Load dataset
df = pd.read_csv(path)

# Display first few rows
print(df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
# Basic info and checks

print(df.info())
print(df['sentiment'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [6]:
# Preprocessing steps (Full Pipeline) We’ll clean and prepare text for NLP models.

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91808\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91808\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91808\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Create preprocessing function

# Initialize lemmatizer 
lematizer = WordNetLemmatizer()

In [9]:
# get english stopwords

stop_words = set(stopwords.words('english'))

def preprocess_text(text):

    # Lowercase
    text = text.lower()

    #Remove HTML Tags
    text = re.sub(r'<.*?>','',text)

    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize (split text into words)
    words = text.split()

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Lemmatize words (convert running -> run)
    words = [lematizer.lemmatize(word) for word in words]

    # Join words back into the single strings
    return ' '.join(words)


In [10]:
# Apply preprocessing to dataset
df['cleaned_review'] = df['review'].apply(preprocess_text)


In [11]:
# Verify cleaned data
print(df[['review','cleaned_review']].head(3))

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   

                                      cleaned_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  


In [12]:
# (Optional): Save cleaned dataset
df.to_csv("cleaned_imdb_dataset.csv", index=False)


 Problem 2

 Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

In [13]:
# Load cleaned dataset
 
df = pd.read_csv("cleaned_imdb_dataset.csv")

In [14]:
# Combine all reviews into one big text (the corpus)

corpus = ' '.join(df['cleaned_review'])

In [15]:
# Split into individual words
# Split the corpus into words
words = corpus.split()

In [16]:
# Count total and unique words

# Total number of words into corpus
total_words = len(words)

# Total number of unique words(vocabulary)
unique_words = len(set(words))

print("Total number of words in corpus:", total_words)
print("Total number of unique words (vocabulary):", unique_words)

Total number of words in corpus: 5930080
Total number of unique words (vocabulary): 203439


 Problem 3

 Apply One Hot Encoding

In [17]:
# Import dependencies keras (tensorflow)
from tensorflow.keras.preprocessing.text import one_hot


In [18]:
# prepare the text data

# Example: use first 5 reviews for demonstration
corpus = df['cleaned_review'].tolist()

In [19]:
# Define vocabulary size
#You must specify how many unique words (vocabulary size) you want to consider. For example, let’s use 10,000 (you can use the unique word count from Problem 2).

vocab_size = 10000


In [20]:
# Apply One-Hot Encoding

# Encode each review into a list of integers
onehot_repr = [one_hot(review, vocab_size) for review in corpus]

# Display example
print(onehot_repr[0][:20])  # first 20 encoded values of the first review


[7349, 9392, 5197, 2416, 3725, 3254, 6190, 6392, 1616, 485, 3852, 7240, 5268, 572, 6820, 3725, 5968, 4605, 7894, 1052]


In [21]:
# (Optional): Padding sequences
# Since reviews have different lengths, you often pad them to make them equal length for models.

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 100  # you can adjust this
padded_docs = pad_sequences(onehot_repr, padding='post', maxlen=max_length)

print(padded_docs.shape) 


(50000, 100)


 Problem 4

 Apply bag words and find the vocabulary also find the times each word has occured

In [22]:
# Bag of Words + Vocabulary + Word Counts
# Import CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer


In [23]:
# Create the vectorizer

cv = CountVectorizer()

In [24]:
# Fit and transform your cleaned text
bow_matrix = cv.fit_transform(df['cleaned_review'])


In [25]:
# Get the vocabulary (all unique words)

vocab = cv.get_feature_names_out()
print("Vocabulary size:", len(vocab))
print("vocabulary:",vocab)

Vocabulary size: 203415
vocabulary: ['aa' 'aaa' 'aaaaaaaaaaaahhhhhhhhhhhhhh' ... 'zzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzz' 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz']


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ============================
# MEMORY-SAFE CountVectorizer
# ============================
cv = CountVectorizer(
    min_df=5,          # ignore words appearing in < 5 documents (remove rare words)
    max_df=0.80,       # ignore words appearing in > 80% documents (too common)
    max_features=20000 # limit vocabulary to top 20,000 words
)

# Fit and transform (keeps sparse matrix)
bow_matrix = cv.fit_transform(df['cleaned_review'])

# Print matrix shape
print("Sparse Matrix Shape:", bow_matrix.shape)

# Vocabulary
vocab = cv.get_feature_names_out()
print("Vocabulary Size:", len(vocab))

# ============================
# Memory-safe word frequency
# ============================
word_counts = np.asarray(bow_matrix.sum(axis=0)).ravel()

# Word → count dictionary
word_count_dict = dict(zip(vocab, word_counts))

# Sort by count
sorted_word_counts = sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True)

# Show top 20 most frequent words
print("\nTop 20 most frequent words:")
for word, count in sorted_word_counts[:20]:
    print(f"{word}: {count}")


Sparse Matrix Shape: (50000, 20000)
Vocabulary Size: 20000

Top 20 most frequent words:
movie: 99025
film: 89809
one: 52677
like: 39790
time: 29396
good: 28615
character: 27573
get: 24435
even: 24286
story: 24229
would: 24001
make: 23564
see: 23494
really: 22900
scene: 20706
much: 18897
well: 18629
people: 17979
great: 17803
bad: 17673


 Problem 5

Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
#------- UNIGRAM --------
cv_uni = CountVectorizer()
bow_uni = cv_uni.fit(df['cleaned_review'])
vocab_uni = cv_uni.get_feature_names_out()
print("Unigram Vocabulary Size:",len(vocab_uni))

Unigram Vocabulary Size: 203415


In [29]:
# ---------- BIGRAM ----------
cv_bi = CountVectorizer(ngram_range=(2,2))
bow_bi = cv_bi.fit(df['cleaned_review'])
vocab_bi = cv_bi.get_feature_names_out()
print("Bigram Vocabulary Size:", len(vocab_bi))

Bigram Vocabulary Size: 3071115


In [30]:
# ---------- TRIGRAM ----------
cv_tri = CountVectorizer(ngram_range=(3,3))
bow_tri = cv_tri.fit(df['cleaned_review'])
vocab_tri = cv_tri.get_feature_names_out()
print("Trigram Vocabulary Size:", len(vocab_tri))

Trigram Vocabulary Size: 5371719


Problem 6

Apply tf-idf and find out the idf scores of words, also find out the vocabulary.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Step 1: Create TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Step 2: Fit the model on the cleaned text (DO NOT convert to array → saves memory)
tfidf_matrix = tfidf.fit_transform(df['cleaned_review'])

# Step 3: Vocabulary (all words)
vocab = tfidf.get_feature_names_out()
print("Vocabulary Size:", len(vocab))
print("\nSample Vocabulary:", vocab[:20])  # print first 20 words

# Step 4: IDF scores of all words
idf_scores = tfidf.idf_

# Step 5: Create a table: word → idf
idf_df = pd.DataFrame({
    'word': vocab,
    'idf_score': idf_scores
})

print("\nIDF Table (first 20 rows):")
print(idf_df.head(20))

# Step 6: Sort words by highest IDF (most unique words)
idf_sorted = idf_df.sort_values(by='idf_score', ascending=False)

print("\nTop 20 highest IDF words (most rare/unique):")
print(idf_sorted.head(20))


Vocabulary Size: 203415

Sample Vocabulary: ['aa' 'aaa' 'aaaaaaaaaaaahhhhhhhhhhhhhh' 'aaaaaaaargh' 'aaaaaaah'
 'aaaaaaahhhhhhggg' 'aaaaagh' 'aaaaah' 'aaaaargh'
 'aaaaarrrrrrgggggghhhhhh' 'aaaaatchkah' 'aaaaaw' 'aaaahhhhhh'
 'aaaahhhhhhh' 'aaaand' 'aaaarrgh' 'aaaawwwwww' 'aaaggghhhhhhh' 'aaaghi'
 'aaah']

IDF Table (first 20 rows):
                          word  idf_score
0                           aa   8.986585
1                          aaa   9.254849
2   aaaaaaaaaaaahhhhhhhhhhhhhh  11.126651
3                  aaaaaaaargh  11.126651
4                     aaaaaaah  11.126651
5             aaaaaaahhhhhhggg  11.126651
6                      aaaaagh  11.126651
7                       aaaaah  11.126651
8                     aaaaargh  11.126651
9      aaaaarrrrrrgggggghhhhhh  11.126651
10                 aaaaatchkah  11.126651
11                      aaaaaw  11.126651
12                  aaaahhhhhh  11.126651
13                 aaaahhhhhhh  11.126651
14                      aaaand  10.72