In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive



## 1 | Data Preprocessing ##
# **Prepare the dataset before training**

# **Loading dataset**

In [2]:
import pandas as pd

# Specify the file name you uploaded
file_name = "/content/drive/MyDrive/IMDB Dataset.csv"

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_name)

In [3]:
tfr=df.review[:]
tfs=df.sentiment[:]

In [4]:
df.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
5,"probably my all-time favorite movie, a story o...",positive
6,i sure would like to see a resurrection of a u...,positive
7,"this show was an amazing, fresh & innovative i...",negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [5]:

# 1.3 Encode output column into binary
df.sentiment.replace('positive', 1, inplace=True)
df.sentiment.replace('negative', 0, inplace=True)
print(f"Dataset head after encoding :\n{df.head(10)}\n")

Dataset head after encoding :
                                              review  sentiment
0  one of the other reviewers has mentioned that ...          1
1  a wonderful little production. <br /><br />the...          1
2  i thought this was a wonderful way to spend ti...          1
3  basically there's a family where a little boy ...          0
4  petter mattei's "love in the time of money" is...          1
5  probably my all-time favorite movie, a story o...          1
6  i sure would like to see a resurrection of a u...          1
7  this show was an amazing, fresh & innovative i...          0
8  encouraged by the positive comments about this...          0
9  if you like original gut wrenching laughter yo...          1



## 2 | Data cleaning ##
Clean dataset reviews as following:
1. Remove HTML tags
2. Remove special characters
3. Convert everything to lowercase
4. Remove stopwords
5. Stemming


In [6]:
!pip install bs4
from bs4 import BeautifulSoup

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [7]:
tfr=tfr.apply(strip_html)

  soup = BeautifulSoup(text, "html.parser")


In [8]:
# 2.2 Remove special characters
def is_special(text):
    rem = ''
    for i in text:
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
    return rem

In [9]:
tfr=tfr.apply(is_special)

In [10]:
tfr.str.lower()
tfr.head(10)

0    one of the other reviewers has mentioned that ...
1    a wonderful little production  the filming tec...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s  love in the time of money  is...
5    probably my all time favorite movie  a story o...
6    i sure would like to see a resurrection of a u...
7    this show was an amazing  fresh   innovative i...
8    encouraged by the positive comments about this...
9    if you like original gut wrenching laughter yo...
Name: review, dtype: object

In [11]:
ft_data=tfr

In [12]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
# Function to remove stop words from a string
nltk.download('punkt')
from nltk.tokenize import word_tokenize
def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
tfr=tfr.apply(rem_stopwords)

In [16]:
tfr.head(10)

0    [one, reviewers, mentioned, watching, 1, oz, e...
1    [wonderful, little, production, filming, techn...
2    [thought, wonderful, way, spend, time, hot, su...
3    [basically, family, little, boy, jake, thinks,...
4    [petter, mattei, love, time, money, visually, ...
5    [probably, time, favorite, movie, story, selfl...
6    [sure, would, like, see, resurrection, dated, ...
7    [show, amazing, fresh, innovative, idea, 70, f...
8    [encouraged, positive, comments, film, looking...
9    [like, original, gut, wrenching, laughter, lik...
Name: review, dtype: object

In [17]:
#Stemming the text
from nltk.stem import SnowballStemmer
def stem_text(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

In [18]:
tfr=tfr.apply(stem_text)

In [19]:
tfr.head(10)

0    one review mention watch 1 oz episod hook righ...
1    wonder littl product film techniqu unassum old...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
5    probabl time favorit movi stori selfless sacri...
6    sure would like see resurrect date seahunt ser...
7    show amaz fresh innov idea 70 first air first ...
8    encourag posit comment film look forward watch...
9    like origin gut wrench laughter like movi youn...
Name: review, dtype: object

## 3 | Model Creation ##
Create model to fit it to the data

# 3.1 Creating Bag Of Words (BOW)

In [20]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
X = np.array(df.iloc[:,0].values)
y = np.array(df.sentiment.values)
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(tfr).toarray()
print(f"=== Bag of words ===\n")
print(f"BOW X shape : {X.shape}")
print(f"BOW y shape : {y.shape}\n")

=== Bag of words ===

BOW X shape : (50000, 2000)
BOW y shape : (50000,)



In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text data into TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_tfidf = tfidf_vectorizer.fit_transform(tfr).toarray()
print("=== TF-IDF Features ===")
print(f"TF-IDF X shape: {X_tfidf.shape}")
print(f"TF-IDF y shape: {y.shape}")

=== TF-IDF Features ===
TF-IDF X shape: (50000, 2000)
TF-IDF y shape: (50000,)


# 3.2 Train test split

In [22]:
from sklearn.model_selection import train_test_split
X_traintf, X_testtf, y_traintf, y_testtf = train_test_split(X_tfidf, y, test_size=0.2, random_state=9)
print(f"Train shapes : X = {X_traintf.shape}, y = {y_traintf.shape}")
print(f"Test shapes  : X = {X_testtf.shape},  y = {y_testtf.shape}\n")

Train shapes : X = (40000, 2000), y = (40000,)
Test shapes  : X = (10000, 2000),  y = (10000,)



In [23]:
from sklearn.model_selection import train_test_split
X_trainbow, X_testbow, y_trainbow, y_testbow = train_test_split(X, y, test_size=0.2, random_state=9)
print(f"Train shapes : X = {X_trainbow.shape}, y = {y_trainbow.shape}")
print(f"Test shapes  : X = {X_testbow.shape},  y = {y_testbow.shape}\n")

Train shapes : X = (40000, 2000), y = (40000,)
Test shapes  : X = (10000, 2000),  y = (10000,)



In [24]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

def evaluate_models(xtrain, ytrain, xtest, ytest):
    models = [GaussianNB(), MultinomialNB(alpha=1.0, fit_prior=True),
              BernoulliNB(alpha=1.0, fit_prior=True), LogisticRegression()]

    for model in models:
        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        accuracy = accuracy_score(ytest, y_pred)
        print(f"{model.__class__.__name__} Accuracy: {accuracy:.2f}")

In [25]:
evaluate_models(X_trainbow, y_trainbow, X_testbow, y_testbow)

GaussianNB Accuracy: 0.74
MultinomialNB Accuracy: 0.84
BernoulliNB Accuracy: 0.85
LogisticRegression Accuracy: 0.87


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
evaluate_models(X_traintf, y_traintf, X_testtf, y_testtf)

GaussianNB Accuracy: 0.80
MultinomialNB Accuracy: 0.84
BernoulliNB Accuracy: 0.85
LogisticRegression Accuracy: 0.88


In [27]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

def evaluate_models_with_ann(X_train, X_test, y_train, y_test):
    # Normalize the input features
    X_train_normalized = tf.keras.utils.normalize(X_train, axis=1)
    X_test_normalized = tf.keras.utils.normalize(X_test, axis=1)

    # Build the ANN model
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(X_train_normalized.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train_normalized, y_train, epochs=20, validation_split=0.2, verbose=0)

    # Evaluate the ANN model on the test set
    y_pred_ann = (model.predict(X_test_normalized) > 0.5).astype("int32")
    accuracy_ann = accuracy_score(y_test, y_pred_ann)
    print(f"ANN Accuracy: {accuracy_ann:.2f}")


In [28]:
evaluate_models_with_ann(X_trainbow, X_testbow, y_trainbow, y_testbow)

ANN Accuracy: 0.86


In [29]:
evaluate_models_with_ann(X_traintf, X_testtf, y_traintf, y_testtf)

ANN Accuracy: 0.86
