In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1 | Data Preprocessing ##
# **Prepare the dataset before training**

# **Loading dataset**

In [4]:
import pandas as pd

# Location of the IMDB reviews CSV on the mounted Google Drive
file_name = "/content/drive/MyDrive/IMDB Dataset.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_name)

In [5]:
# Working handles on the two columns: `tfr` (review text) is what the cleaning
# steps below transform; `tfs` holds the sentiment labels.
tfr = df["review"][:]
tfs = df["sentiment"][:]

In [None]:
# Preview the first 10 rows of the loaded dataset
df.head(10)

In [6]:
# 1.3 Encode output column into binary
# The encoded column is built explicitly and assigned back to `df`. Calling
# `replace(..., inplace=True)` on the `df.sentiment` accessor is a chained
# in-place edit, which is not guaranteed to reach `df` when pandas
# Copy-on-Write is active. Mapping 1 -> 1 and 0 -> 0 as well keeps this cell
# safe to run more than once.
sentiment_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = df['sentiment'].map(sentiment_codes)
# `map` yields NaN for any value missing from the dictionary; fail loudly
# instead of silently training on unlabeled rows.
if encoded_sentiment.isna().any():
    raise ValueError("Unexpected label(s) found in the 'sentiment' column")
df['sentiment'] = encoded_sentiment.astype('int64')
print(f"Dataset head after encoding :\n{df.head(10)}\n")

Dataset head after encoding :
                                              review  sentiment
0  one of the other reviewers has mentioned that ...          1
1  a wonderful little production. <br /><br />the...          1
2  i thought this was a wonderful way to spend ti...          1
3  basically there's a family where a little boy ...          0
4  petter mattei's "love in the time of money" is...          1
5  probably my all-time favorite movie, a story o...          1
6  i sure would like to see a resurrection of a u...          1
7  this show was an amazing, fresh & innovative i...          0
8  encouraged by the positive comments about this...          0
9  if you like original gut wrenching laughter yo...          1



## 2 | Data cleaning ##
Clean the dataset reviews as follows:
1. Remove HTML tags
2. Remove special characters
3. Convert everything to lowercase
4. Remove stopwords
5. Stemming


In [7]:
!pip install bs4
from bs4 import BeautifulSoup

# Remove HTML markup from a review
def strip_html(text):
    """Return ``text`` with its HTML markup removed.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing tags such as ``<br />``.

    Returns
    -------
    str
        The text content of ``text``; pieces of text that were separated
        only by a tag are separated by a single space in the result.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Without a separator, get_text() concatenates neighbouring text nodes
    # directly, so "end<br /><br />Start" would become the single token
    # "endStart" and pollute the vocabulary.
    return soup.get_text(separator=" ")



In [8]:
# Strip the HTML markup from every review, element by element.
tfr = tfr.map(strip_html)

  soup = BeautifulSoup(text, "html.parser")


In [9]:
# 2.2 Remove special characters
def is_special(text):
    """Replace every non-alphanumeric character of ``text`` with a space.

    Despite its name, this function does not return a boolean: it returns a
    cleaned copy of the input with the same length, in which each character
    for which ``str.isalnum()`` is false (punctuation, whitespace, symbols)
    has been replaced by a single space.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string; an empty input yields an empty string.
    """
    # A single join replaces the repeated string concatenation in a loop.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [10]:
# Blank out the non-alphanumeric characters of every review.
tfr = tfr.map(is_special)

In [11]:
# `Series.str.lower()` returns a new Series and leaves `tfr` untouched, so the
# result has to be assigned back for the lowercasing step to take effect.
tfr = tfr.str.lower()
tfr.head(10)

0    one of the other reviewers has mentioned that ...
1    a wonderful little production  the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s  love in the time of money  is...
5    probably my all time favorite movie  a story o...
6    i sure would like to see a resurrection of a u...
7    this show was an amazing  fresh   innovative i...
8    encouraged by the positive comments about this...
9    if you like original gut wrenching laughter yo...
Name: review, dtype: object

In [12]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stop-word corpus, which the stop-word removal step reads
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Inspect NLTK's English stop-word list (note that every entry is lowercase)
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
# Function to remove stop words from a string
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Build the stop-word set once when this cell runs, so it is not re-read
# from the corpus for every single review.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def rem_stopwords(text):
    """Tokenize ``text`` and drop NLTK's English stop words.

    Parameters
    ----------
    text : str
        The review text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in the
        stop-word list, in their original order.

    Notes
    -----
    The membership test is case-sensitive and the stop-word list is
    lowercase, so ``text`` should already be lowercased.
    """
    return [w for w in word_tokenize(text) if w not in _ENGLISH_STOP_WORDS]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# Turn every review into its list of non-stop-word tokens.
tfr = tfr.map(rem_stopwords)

In [16]:
# Preview the first 10 reviews after stop-word removal
tfr.head(10)

0    [one, reviewers, mentioned, watching, 1, oz, e...
1    [wonderful, little, production, filming, techn...
2    [thought, wonderful, way, spend, time, hot, su...
3    [basically, family, little, boy, jake, thinks,...
4    [petter, mattei, love, time, money, visually, ...
5    [probably, time, favorite, movie, story, selfl...
6    [sure, would, like, see, resurrection, dated, ...
7    [show, amazing, fresh, innovative, idea, 70, f...
8    [encouraged, positive, comments, film, looking...
9    [like, original, gut, wrenching, laughter, lik...
Name: review, dtype: object

In [17]:
#Stemming the text
from nltk.stem import SnowballStemmer

# One shared stemmer: constructing a new SnowballStemmer for every review is
# loop-invariant work.
_SNOWBALL_STEMMER = SnowballStemmer('english')

def stem_text(text):
    """Stem each token and join the stems into one space-separated string.

    Parameters
    ----------
    text : iterable of str
        The tokens of one review. Despite the parameter name this is a
        sequence of tokens, not a raw string; passing a plain string would
        stem it character by character.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty input yields
        an empty string.
    """
    return " ".join(_SNOWBALL_STEMMER.stem(w) for w in text)

In [18]:
# Stem the tokens of every review and re-join them into one string per review.
tfr = tfr.map(stem_text)

In [19]:
# Preview the first 10 reviews after stemming
tfr.head(10)

0    one review mention watch 1 oz episod hook righ...
1    wonder littl product film techniqu unassum old...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
5    probabl time favorit movi stori selfless sacri...
6    sure would like see resurrect date seahunt ser...
7    show amaz fresh innov idea 70 first air first ...
8    encourag posit comment film look forward watch...
9    like origin gut wrench laughter like movi youn...
Name: review, dtype: object

## 3 | Model Creation ##
Create the models and fit them to the data

# 3.1 Creating Bag Of Words (BOW)

In [20]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Labels: the sentiment column of `df`, as a NumPy array.
y = np.array(df.sentiment.values)

# Features: term counts computed from the cleaned reviews in `tfr`, limited to
# the 2000 most frequent terms. X comes solely from `tfr`; the raw review
# column of `df` is not used as a feature source.
# NOTE(review): the vocabulary is fitted on all reviews before the train/test
# split, so test-set text influences which terms are kept - confirm this is
# acceptable for the reported scores.
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(tfr).toarray()

# Each feature row must have exactly one label.
assert X.shape[0] == y.shape[0], "feature and label row counts differ"

print("=== Bag of words ===\n")
print(f"BOW X shape : {X.shape}")
print(f"BOW y shape : {y.shape}\n")

=== Bag of words ===

BOW X shape : (50000, 2000)
BOW y shape : (50000,)



# 3.2 Train test split

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)
print(f"Train shapes : X = {X_train.shape}, y = {y_train.shape}")
print(f"Test shapes  : X = {X_test.shape},  y = {y_test.shape}\n")

Train shapes : X = (40000, 2000), y = (40000,)
Test shapes  : X = (10000, 2000),  y = (10000,)



# 3.3 Defining the models and Training them

In [22]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
# Three Naive Bayes variants, all trained on the same bag-of-words features.
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
bnb = BernoulliNB(alpha=1.0, fit_prior=True)

gnb.fit(X_train, y_train)
mnb.fit(X_train, y_train)
# Kept as the last expression so the cell still displays the fitted estimator.
bnb.fit(X_train, y_train)

# 3.4 Make predictions

In [24]:
# Test-set predictions of the Gaussian, Multinomial and Bernoulli models,
# computed in that order.
ypg, ypm, ypb = (clf.predict(X_test) for clf in (gnb, mnb, bnb))

## 4 | Model Evaluation ##
Evaluate model performance

In [25]:
from sklearn.metrics import accuracy_score
# Scale to percent *before* rounding and let the format spec render the value.
# Rounding first and multiplying by 100 afterwards can print floating-point
# artifacts (for example 0.57 * 100 evaluates to 56.99999999999999).
# Whole-percent resolution and the "NN.0 %" layout are kept.
print(f"Gaussian accuracy    =  {round(accuracy_score(y_test, ypg) * 100):.1f} %")
print(f"Multinomial accuracy =  {round(accuracy_score(y_test, ypm) * 100):.1f} %")
print(f"Bernoulli accuracy   =  {round(accuracy_score(y_test, ypb) * 100):.1f} %")

Gaussian accuracy    =  74.0 %
Multinomial accuracy =  84.0 %
Bernoulli accuracy   =  85.0 %
