In [1]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shreya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shreya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Load the dataset
import pandas as pd
df = pd.read_csv(r"D:\Downloads\archive (3)\IMDB Dataset.csv") 
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
#Tokenization
df['tokenized_review'] = df['review'].apply(word_tokenize)
print(df[['review', 'tokenized_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                    tokenized_review  
0  [One, of, the, other, reviewers, has, mentione...  
1  [A, wonderful, little, production, ., <, br, /...  
2  [I, thought, this, was, a, wonderful, way, to,...  
3  [Basically, there, 's, a, family, where, a, li...  
4  [Petter, Mattei, 's, ``, Love, in, the, Time, ...  


In [5]:
#Lowercasing
df['lowercased_review'] = df['review'].apply(lambda x: x.lower())
print(df[['review', 'lowercased_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                   lowercased_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production. <br /><br />the...  
2  i thought this was a wonderful way to spend ti...  
3  basically there's a family where a little boy ...  
4  petter mattei's "love in the time of money" is...  


In [6]:
#stop words 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    tokens = word_tokenize(text)  
    filtered_words = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_words)  
df['no_stopwords_review'] = df['lowercased_review'].apply(remove_stopwords)
print(df[['review', 'lowercased_review', 'no_stopwords_review']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shreya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                   lowercased_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production. <br /><br />the...   
2  i thought this was a wonderful way to spend ti...   
3  basically there's a family where a little boy ...   
4  petter mattei's "love in the time of money" is...   

                                 no_stopwords_review  
0  one reviewers mentioned watching 1 oz episode ...  
1  wonderful little production . < br / > < br / ...  
2  thought wonderful way spend time hot summer we...  
3  basically 's family little boy ( jake ) thinks...  
4  petter mattei 's `` love time money '' visuall..

In [7]:

#remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
df['no_punctuation_review'] = df['no_stopwords_review'].apply(remove_punctuation)
print(df[['review', 'no_stopwords_review', 'no_punctuation_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                 no_stopwords_review  \
0  one reviewers mentioned watching 1 oz episode ...   
1  wonderful little production . < br / > < br / ...   
2  thought wonderful way spend time hot summer we...   
3  basically 's family little boy ( jake ) thinks...   
4  petter mattei 's `` love time money '' visuall...   

                               no_punctuation_review  
0  one reviewers mentioned watching 1 oz episode ...  
1  wonderful little production   br    br   filmi...  
2  thought wonderful way spend time hot summer we...  
3  basically s family little boy  jake  thinks s ...  
4  petter mattei s  love time money  visually stu..

In [8]:
#Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()
def apply_stemming(text):
    tokens = word_tokenize(text)  
    stemmed_words = [stemmer.stem(word) for word in tokens]  
    return ' '.join(stemmed_words)  

df['stemmed_review'] = df['no_punctuation_review'].apply(apply_stemming)
print(df[['review', 'no_punctuation_review', 'stemmed_review']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                               no_punctuation_review  \
0  one reviewers mentioned watching 1 oz episode ...   
1  wonderful little production   br    br   filmi...   
2  thought wonderful way spend time hot summer we...   
3  basically s family little boy  jake  thinks s ...   
4  petter mattei s  love time money  visually stu...   

                                      stemmed_review  
0  one review mention watch 1 oz episod ll hook r...  
1  wonder littl product br br film techniqu unass...  
2  thought wonder way spend time hot summer weeke...  
3  basic s famili littl boy jake think s zombi cl...  
4  petter mattei s love time money visual stun fi..

In [10]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['stemmed_review'])
print(f"Shape of BoW matrix: {X_bow.shape}")

Shape of BoW matrix: (50000, 105761)


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment'], test_size=0.1, random_state=200)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86
