# Install libraries

In [59]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.callbacks import EarlyStopping

In [60]:
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split

In [61]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import re
from nltk import sent_tokenize
from nltk import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
!wget -O dataset.csv "https://drive.google.com/uc?export=download&id=1s128Fg1-0udJzZb3--3lWY5brFacCQyT&confirm=NEW_FILE_CONFIRM_CODE"

--2024-02-07 01:15:41--  https://drive.google.com/uc?export=download&id=1s128Fg1-0udJzZb3--3lWY5brFacCQyT&confirm=NEW_FILE_CONFIRM_CODE
Resolving drive.google.com (drive.google.com)... 74.125.199.113, 74.125.199.101, 74.125.199.102, ...
Connecting to drive.google.com (drive.google.com)|74.125.199.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1s128Fg1-0udJzZb3--3lWY5brFacCQyT&export=download [following]
--2024-02-07 01:15:41--  https://drive.usercontent.google.com/download?id=1s128Fg1-0udJzZb3--3lWY5brFacCQyT&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 172.253.117.132, 2607:f8b0:400e:c0a::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|172.253.117.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66212309 (63M) [application/octet-stream]
Saving to: ‘dataset.csv’


2024-02-07 01:15:44 (160 MB/s) - ‘

# Initialize Dataset

In [63]:
df = pd.read_csv('dataset.csv')
df = df[:5000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [64]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [65]:
X = df['review']
y = pd.get_dummies(df['sentiment'], drop_first=True)

# Clean Text data

In [66]:
def clean_text_data(sentence):
  sentence = sentence.lower()
  sentence = re.sub('[^a-zA-Z]',' ', sentence)
  sentence = word_tokenize(sentence)
  sentence = [word for word in sentence if word not in set(stopwords.words('english'))]
  lemmatizer = WordNetLemmatizer()
  sentence = [lemmatizer.lemmatize(word) for word in sentence]
  return ' '.join(sentence)

In [67]:
preprocessed_X = [clean_text_data(item) for item in X]

In [68]:
preprocessed_X[0:3]

['one reviewer mentioned watching oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison exper

# Text encoding and padding

In [69]:
vocab_size = 10000
encoded_X = [one_hot(sentence, vocab_size) for sentence in preprocessed_X]

In [70]:
encoded_X[0][1:10]

[1502, 3992, 6616, 1153, 3447, 3573, 9637, 6048, 812]

In [71]:
encoded_X = pad_sequences(encoded_X,maxlen=500, padding='pre', truncating='pre')

# Model training

In [72]:
model = Sequential()
model.add(Embedding(vocab_size,100,input_length=500))
model.add(Dropout(0.3))
model.add(LSTM(200))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_2 (Dropout)         (None, 500, 100)          0         
                                                                 
 lstm_2 (LSTM)               (None, 200)               240800    
                                                                 
 dropout_3 (Dropout)         (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 201       
                                                                 
Total params: 1241001 (4.73 MB)
Trainable params: 1241001 (4.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [73]:
X_train, X_test,y_train, y_test = train_test_split(encoded_X,y,test_size=0.33,random_state=20)

In [78]:
call_back1 = EarlyStopping(monitor='loss', patience=3)
model.fit(X_train, y_train, epochs=20, batch_size=200, callbacks=[call_back1])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<keras.src.callbacks.History at 0x7f4cb9b22e60>

# Model testing

In [79]:
y_pred = model.predict(X_test)
y_pred = np.where(y_pred>0.50,1,0)



In [80]:
accuracy_score(y_pred,y_test)

0.7927272727272727

In [81]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80       830
           1       0.80      0.78      0.79       820

    accuracy                           0.79      1650
   macro avg       0.79      0.79      0.79      1650
weighted avg       0.79      0.79      0.79      1650



In [82]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[665, 165],
       [177, 643]])