### AIM : To Create the model for sentiment analysis using deep learning .

In [None]:
# importing libraries

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import re

import pandas as pd
import numpy as np

# implement ann 
from keras import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, GRU, Dropout
import tensorflow as tf

In [None]:
dataframe = pd.read_csv("IMDB Dataset.csv") # importing the dataset 

In [None]:
dataframe.head() 

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
dataframe.shape

(50000, 2)

In [None]:
positive_data = dataframe[dataframe['sentiment'] == 'positive'] # positive comments
negetive_review = dataframe[dataframe['sentiment'] == 'negative'] # negetive comments 

In [None]:
# selecting 10000 rows for training

training = pd.concat([positive_data.iloc[:45000], negetive_review.iloc[:45000]], axis=0)
test = pd.concat([positive_data.iloc[45000:], negetive_review.iloc[45000:]], axis=0)

print(f"training shape : {training.shape} , testing shape : {test.shape}")


training shape : (50000, 2) , testing shape : (0, 2)


In [7]:
training['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [8]:
stop_words =  set(stopwords.words('english'))

In [9]:
def remove_stop_words(text):
    # function to remove the stopwords .

    text = word_tokenize(text)
    result = []

    for i in text :
        if i not in stop_words :
            result.append(i)

    return " ".join(result)

training['review'] = training['review'].apply(remove_stop_words)

In [None]:
def remove_unwanted_text(text):

    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Keep only letters and spaces
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

training['review'] = training['review'].apply(remove_unwanted_text)

In [11]:
training['review'] = training['review'].apply( lambda x : x.lower())
training['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [12]:
# training['sentiment'] = training['sentiment'].apply(lambda x : 1 if 'positive' else 0)

In [13]:
def transform_review(text):
    if text == 'positive':
        return 1
    else:
        return 0
    
training['sentiment'] = training['sentiment'].apply(transform_review)

In [14]:
training.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ll...,1
1,a wonderful little production br br the filmin...,1
2,i thought wonderful way spend time hot summer ...,1
4,petter mattei s love time money visually stunn...,1
5,probably alltime favorite movie story selfless...,1


### implement Bag-of-words

In [15]:
# max sentence length
max(training['review'].apply(lambda x : len(x.split())))

1532

In [16]:
vectorizer = CountVectorizer(max_features=1000)

x = vectorizer.fit_transform(training['review']).toarray()
y = np.array(training['sentiment'], dtype=np.float32)


In [17]:
x_train, x_test , y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)
print(f'x_train : {x_train.shape}, y_train : {y_train.shape}, x_test : {x_test.shape}, y_test : {y_test.shape}')

x_train : (40000, 1000), y_train : (40000,), x_test : (10000, 1000), y_test : (10000,)


### implement model

In [None]:
# ann model

input_dim = x_train.shape[1]

model = Sequential([
    Dense(16, activation='relu', input_dim=input_dim),
    Dropout(0.3),  # Drops 30% of neurons
    Dense(8, activation='relu'),
    Dropout(0.3),  # Drops 30% of neurons
    Dense(1, activation='sigmoid')
])

with tf.device('/GPU:0'):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=100, batch_size=100, verbose=1)
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    print(f"ANN Accuracy: {accuracy:.4f}")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# lstm 

model = Sequential([
    Embedding(input_dim=input_dim, output_dim=10, input_length=input_dim),
    LSTM(32, return_sequences=True),
    LSTM(16),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=20, verbose=1)
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"LSTM Accuracy: {accuracy:.4f}")

In [None]:
# rnn 

model = Sequential([
        Embedding(input_dim=input_dim, output_dim=10, input_length=input_dim),
        SimpleRNN(16, return_sequences=True),
        SimpleRNN(8),
        Dense(1, activation='sigmoid')
    ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

with tf.device('/GPU:0'):
    model.fit(x_train, y_train, epochs=10, batch_size=2000, verbose=1)
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    print(f"RNN Accuracy: {accuracy:.4f}")

In [None]:
# gru 

model = Sequential([
        Embedding(input_dim=input_dim, output_dim=10, input_length=input_dim),
        GRU(16, return_sequences=True),
        GRU(8),
        Dense(1, activation='sigmoid')
    ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
with tf.device('/GPU:0'):
    model.fit(x_train, y_train, epochs=10, batch_size=200, verbose=1)
    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
    print(f"GRU Accuracy: {accuracy:.4f}")

In [None]:
model.save('model.h5') # exporting model 

INFO:tensorflow:Assets written to: ram://47048aef-6e31-4a65-8d2f-f1406202ffec/assets


In [None]:
# exporting vectorizer 

import pickle as pkl

with open('vectorizer.pkl', 'wb') as f:
    pkl.dump(vectorizer, f)