<a href="https://colab.research.google.com/github/Krishnaugale353/Sentiment-Analysis-on-IMDB-dataset/blob/main/notebook788dc9ca28.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import re
import pandas as pd
import os

In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from MyDrive further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE(review): this lists a Kaggle input directory, while the data is loaded from the
# Colab Drive mount elsewhere in this notebook — confirm which environment is intended.
os.listdir('/kaggle/input/')

['imdb-dataset']

In [None]:
# Load the training CSV from the mounted Drive into a DataFrame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved index —
# confirm whether it should be dropped or read with index_col=0.
df = pd.read_csv('/content/drive/MyDrive/train.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,review,sentiment
0,That's what I kept asking myself during the ma...,negative
1,I did not watch the entire movie. I could not ...,negative
2,A touching love story reminiscent of In the M...,positive
3,This latter-day Fulci schlocker is a totally a...,negative
4,"First of all, I firmly believe that Norwegian ...",negative


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# Inspect one raw review (entry 4) before any cleaning is applied.
df['review'][4]

"First of all, I firmly believe that Norwegian movies are continually getting better. From the tedious emotional films of the 70's and 80's, movies from this place actually started to contain a bit of humour. Imagine.. Actual comedies were made! Movies were actually starting to get entertaining and funny, as opposed to long, dark, depressing and boring.<br /><br />During the 90's and 00's several really great movies were made by a 'new generation' of filmmakers. Movie after movie were praised by critics and played loads of money. It became the norm!<br /><br />Then came United...<br /><br />*MINOR SPOILERS* It's just simply not funny. Not once. Not ever. But the thing is... We THINK its funny. Because we're used to norwegian movies to be funny. Especially with a cast like this with a few really funny comedians. But.. They neither say nor do anything funny! Where's the humor? Show me the humor! Is it the awkward clerk played by Harald Eia? Is it the overacting totally ridiculously unrea

**Text Preprocessing**

In [None]:
def preprocessing_text(text):
  """Normalise one raw review into lowercase words separated by single spaces.

  Args:
    text: Raw review string; may contain HTML markup such as ``<br />``.

  Returns:
    The review with HTML tags replaced by a space, lowercased, every
    character other than ``a-z`` and space removed, and runs of spaces
    collapsed (no leading or trailing space).
  """
  # Tags must be removed before the letter filter: once '<', '/' and '>' have been
  # stripped, '<br />' can no longer be recognised and survives as the bogus token
  # 'br' glued to the preceding word. The pattern requires a letter after '<' so
  # that comparisons such as "5 < 6 and 7 > 3" are not mistaken for markup.
  text = re.sub('</?[a-zA-Z][^>]*>', ' ', text)
  text = text.lower()
  text = re.sub('[^a-z ]', '', text)
  # Collapse the padding introduced above so the result splits cleanly on ' '.
  return re.sub(' +', ' ', text).strip()

In [None]:
# Clean every review; the raw text in the 'review' column is overwritten.
df['review'] = df['review'].map(preprocessing_text)

In [None]:
# Spot-check one cleaned review.
df['review'][13]

'the cast played shakespearebr br shakespeare lostbr br i appreciate that this is trying to bring shakespeare to the masses but why ruin something so goodbr br is it because the scottish play is my favorite shakespeare i do not know what i do know is that a certain rev bowdler hence bowdlerization tried to do something similar in the victorian erabr br in other words you cannot improve perfectionbr br i have no more to write but as i have to write at least ten lines of text and english composition was never my forte i will just have to keep going and say that this movie as the saying goes just does not cut it'

Tokenizing and converting to Sequences (Integer Encoding)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

2024-07-30 10:09:17.306438: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 10:09:17.306553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 10:09:17.420601: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Word-level tokenizer that splits on single spaces; no vocabulary cap is passed here.
tokenizer = Tokenizer(split=" ")

In [None]:
# Build the tokenizer's word index from all cleaned reviews.
tokenizer.fit_on_texts(df['review'].to_numpy())

In [None]:
# Encode each review as a sequence of integer word indices.
x = tokenizer.texts_to_sequences(df['review'].to_numpy())

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad the encoded reviews into one 2-D array. No maxlen is passed, so the width
# presumably follows the longest sequence (2460 in the recorded run).
X = pad_sequences(x)

In [None]:
# Collect the length of every padded row and report the largest.
length = [len(row) for row in X]
max(length)

2460

In [None]:
# Number of distinct words in the tokenizer's word index.
len(tokenizer.word_index)

176094

Building a Word2Vec model

In [None]:
from gensim.models import Word2Vec

In [None]:
# Tokenised corpus for Word2Vec: one list of whitespace-separated words per review.
reviews = df['review'].str.split().tolist()

In [None]:
# Skip-gram (sg=1) embeddings: 100 dimensions, context window of 5 words.
word2vec_model = Word2Vec(
    sentences=reviews,
    vector_size=100,
    window=5,
    sg=1,
)

In [None]:
# Run 10 training epochs over the same corpus.
# NOTE(review): the constructor was already given `reviews`; per the gensim docs that
# trains the model at construction time, so this is additional training — confirm intended.
word2vec_model.train(reviews, total_examples=word2vec_model.corpus_count, epochs=10)

(84491352, 114571470)

Splitting the data

In [None]:
# Peek at the raw sentiment labels.
df['sentiment'].values

array(['positive', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [None]:
# Binary target: 1 for 'positive', 0 for every other label.
y = (df['sentiment'] == 'positive').astype(int).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Creating the Embedding Matrix

In [None]:
# One 100-dimensional row per tokenizer index, plus one extra row -> (vocab_size, vector_size)
embedding_matrix = np.zeros(shape=(1 + len(tokenizer.word_index), 100))

In [None]:
# Copy each word's Word2Vec vector into its row; words the Word2Vec model does not
# contain get an all-zero row.
for token, row in tokenizer.word_index.items():
  known = token in word2vec_model.wv
  embedding_matrix[row] = word2vec_model.wv[token] if known else np.zeros(100)

Model Building

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import keras_tuner as kt

In [None]:
def build_model(hp):
    """Build and compile the LSTM sentiment classifier for one tuner trial.

    The network is a frozen embedding layer initialised from the notebook-level
    ``embedding_matrix``, a tunable number (1-4) of sequence-returning LSTM
    layers, one final LSTM layer and a single sigmoid output unit.

    Args:
        hp: Keras Tuner hyperparameter container used to sample ``num_layers``,
            ``units<i>``, ``unitsf`` and ``optimizer``.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Derive every dimension from the arrays actually fed to the model instead of
    # repeating the literals 100 and 2460, which silently go stale as soon as the
    # text cleaning, padding or Word2Vec vector size changes.
    vocab_size, embedding_dim = embedding_matrix.shape
    sequence_length = X_train.shape[1]

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length, weights=[embedding_matrix], trainable=False))
    # Intermediate LSTM layers return full sequences so the next LSTM can consume them.
    for i in range(hp.Int('num_layers', min_value=1, max_value=4, step=1)):
        model.add(LSTM(units=hp.Int('units'+str(i), min_value=16, max_value=128, step=8), return_sequences=True))
    # The last LSTM returns only its final state, which feeds the classifier head.
    model.add(LSTM(units=hp.Int('unitsf', min_value=8, max_value=64, step=8)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=hp.Choice('optimizer',values=['adam','nadam']),loss='BinaryCrossentropy', metrics=['accuracy'])
    return model

In [None]:
# Random search over build_model's hyperparameters, ranked by validation accuracy.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    directory='mydir7',
    project_name='IMDB',
)



In [None]:
# Score each trial on a validation slice carved out of the training data. Passing
# (X_test, y_test) here would select hyperparameters on the very samples used for
# the final evaluation, making the reported test score optimistically biased.
tuner.search(X_train, y_train, epochs=2, batch_size=128, validation_split=0.2)

Trial 10 Complete [00h 04m 45s]
val_accuracy: 0.8518999814987183

Best val_accuracy So Far: 0.878000020980835
Total elapsed time: 00h 32m 32s


In [None]:
# Hyperparameter values of the best trial found by the search.
tuner.get_best_hyperparameters()[0].values

{'num_layers': 2,
 'units0': 112,
 'unitsf': 64,
 'optimizer': 'nadam',
 'units1': 96,
 'units2': 96,
 'units3': 40}

In [None]:
# Take the top-ranked model from the search as the model to keep training.
model = tuner.get_best_models(num_models=1)[0]

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Show the layer-by-layer architecture of the selected model.
model.summary()

In [None]:
# Continue training the tuned model. The search ran each trial with epochs=2, so
# training resumes at initial_epoch=2 (logged as "Epoch 3/10"); the previous value
# of 3 skipped one epoch of the 10-epoch schedule. The History is kept in a variable
# so the curves can be inspected and the cell does not echo the object's repr.
history = model.fit(X_train, y_train, batch_size=128, epochs=10, initial_epoch=2, validation_data=(X_test, y_test))

Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 368ms/step - accuracy: 0.8623 - loss: 0.3293 - val_accuracy: 0.8677 - val_loss: 0.3060
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 365ms/step - accuracy: 0.8765 - loss: 0.2977 - val_accuracy: 0.8886 - val_loss: 0.2688
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 364ms/step - accuracy: 0.8868 - loss: 0.2802 - val_accuracy: 0.8605 - val_loss: 0.3280
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 369ms/step - accuracy: 0.8112 - loss: 0.4058 - val_accuracy: 0.8938 - val_loss: 0.2530
Epoch 8/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 372ms/step - accuracy: 0.8975 - loss: 0.2527 - val_accuracy: 0.9062 - val_loss: 0.2315
Epoch 9/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 371ms/step - accuracy: 0.9073 - loss: 0.2314 - val_accuracy: 0.8972 - val_loss: 0.2424
Epoc

<keras.src.callbacks.history.History at 0x7c7b2261fa00>

In [None]:
# Loss and accuracy of the trained model on (X_test, y_test).
model.evaluate(X_test, y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.9108 - loss: 0.2203


[0.21945638954639435, 0.9118000268936157]

In [None]:
# Persist the trained model to 'sentiment.h5' in the working directory.
model.save('sentiment.h5')

In [None]:
import pickle

In [None]:
# Serialise the fitted tokenizer so the same word index can be reloaded later.
with open('tokenizer.pickle', mode='wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)