In [1]:
!unzip IMDB.csv.zip

Archive:  IMDB.csv.zip
  inflating: IMDB Dataset.csv        


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array
import tensorflow as tf

from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences # Use tensorflow.keras
from tensorflow.keras.models import Sequential # Use tensorflow.keras
from tensorflow.keras.layers import Activation, Dropout, Dense # Use tensorflow.keras
from tensorflow.keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM # Use tensorflow.keras
from sklearn.model_selection import train_test_split

In [3]:
movie_reviews = pd.read_csv('IMDB Dataset.csv')

In [4]:
movie_reviews.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
movie_reviews.shape

(50000, 2)

In [6]:
#Checking for null values
movie_reviews.isnull().values.any()

False

In [7]:
movie_reviews['review'][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [8]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
def preprocess(sentence):
    sentence = sentence.lower() #all alphabets changed to lowercase
    sentence = remove_tags(sentence) #HTML-tags removal
    sentence = re.sub('[^a-zA-Z]', ' ', sentence) #punctuation and numbers removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) #single character removal
    sentence = re.sub(r'\s+', ' ', sentence) #extra space removal

    stop_words = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*') #stopwords removal
    sentence = stop_words.sub('', sentence)

    return sentence

In [11]:
preprocessed_reviews = []

for sentence in movie_reviews['review'].values:
    preprocessed_reviews.append(preprocess(sentence))

In [12]:
preprocessed_reviews[1]

'wonderful little production filming technique unassuming old time bbc fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great master comedy life realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwell murals decorating every surface terribly well done '

In [14]:
preprocessed_reviews

['one reviewers mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows dare forget pretty pictures painted mainstream audiences forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitches due lack street skil

In [15]:
y = movie_reviews['sentiment']
for i in range(len(y)):
  if y[i] == 'positive':
    y[i] = 1
  else:
    y[i] = 0

In [16]:
y[2]

1

In [17]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(preprocessed_reviews,y, test_size=0.2, random_state=42)

In [18]:
sen_tokenizer = Tokenizer()
sen_tokenizer.fit_on_texts(X_train)
X_train = sen_tokenizer.texts_to_sequences(X_train)
X_test = sen_tokenizer.texts_to_sequences(X_test)

vocab_size = len(sen_tokenizer.word_index) + 1
vocab_size

92394

In [19]:
max_length = 100
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

In [21]:
from numpy import asarray
from numpy import zeros
embeddings = {}
glove_file = open('/content/a2_glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings[word] = vector_dimensions
glove_file.close()

In [22]:
embedding_matrix = zeros((vocab_size, 100))
for each_word, index in sen_tokenizer.word_index.items():
    embedding_vector = embeddings.get(each_word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [23]:
embedding_matrix.shape

(92394, 100)

In [24]:
nn_model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix])

nn_model.add(embedding_layer)
nn_model.add(Flatten())
#nn_model.add(Dense(100, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

In [25]:
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print(nn_model.summary())

None


In [None]:
# Model Training

# Convert X_train and y_train to float32
X_train_new = np.asarray(X_train).astype('float32')
y_train_new = np.asarray(y_train).astype('float32')

In [None]:
nn_model_history = nn_model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

In [None]:
# Predictions on the Test Set

score_nn = nn_model.evaluate(X_test, y_test, verbose=1)

In [26]:
from keras.layers import LSTM

In [27]:
# Neural Network architecture

lstm_model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length , trainable=False)

lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))

lstm_model.add(Dense(1, activation='sigmoid'))



In [28]:
# Model compiling

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(lstm_model.summary())

None


In [31]:
import numpy as np
# Model Training

# Convert X_train and y_train to float32
X_train_new = np.asarray(X_train).astype('float32')
y_train_new = np.asarray(y_train).astype('float32')

lstm_model_history = lstm_model.fit(X_train_new, y_train_new, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Epoch 1/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - acc: 0.6842 - loss: 0.5879 - val_acc: 0.8101 - val_loss: 0.4354
Epoch 2/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - acc: 0.8087 - loss: 0.4288 - val_acc: 0.8081 - val_loss: 0.4082
Epoch 3/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - acc: 0.8278 - loss: 0.3881 - val_acc: 0.8388 - val_loss: 0.3578
Epoch 4/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - acc: 0.8533 - loss: 0.3478 - val_acc: 0.8407 - val_loss: 0.3544
Epoch 5/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - acc: 0.8609 - loss: 0.3322 - val_acc: 0.8624 - val_loss: 0.3268
Epoch 6/6
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - acc: 0.8678 - loss: 0.3137 - val_acc: 0.8520 - val_loss: 0.3398


In [38]:
X_test_new = np.asarray(X_test).astype('float32')
y_test_new = np.asarray(y_test).astype('float32')

In [39]:
score = lstm_model.evaluate(X_test_new, y_test_new, verbose=1)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - acc: 0.8476 - loss: 0.3440


In [40]:
lstm_model.save(f"./c1_lstm_model_acc_{round(score[1], 3)}.h5", save_format='h5')

