In [None]:
# load libs and dataset


import os
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from numpy import asarray
from numpy import zeros

from keras.layers import LSTM, SimpleRNN
from keras.layers.core import Dense
from keras.layers.embeddings import Embedding

from keras.models import Sequential

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

# list every file mounted under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# read the IMDB reviews into a DataFrame and preview its first five rows
ds = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
ds.head()

In [None]:
# clean dataset

def clean(sentence):
    """Normalize a raw review into lowercase words separated by single spaces.

    Steps, in order: strip HTML tags, drop every non-letter character,
    drop stand-alone single letters, collapse whitespace, lowercase.

    Parameters
    ----------
    sentence : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        Cleaned text containing only ``a-z`` and single spaces, with no
        leading or trailing whitespace. May be empty.
    """

    # remove html tags; substitute a space (not '') so that words separated
    # only by a tag, e.g. "good<br />movie", do not get glued together
    sentence = re.sub(r'<[^>]+>', ' ', sentence)

    # remove punctuations and numbers
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)

    # remove single characters; word boundaries (instead of consuming the
    # surrounding whitespace) also catch consecutive single letters and
    # single letters at the very start or end of the text
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # remove multiple spaces and the padding left at both ends
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    # lowercase sentence
    return sentence.lower()

In [None]:
# build model inputs and targets:
#   x = list of cleaned review strings
#   y = integer array, 1 where the sentiment is "positive" and 0 otherwise

sentences = list(ds['review'])
x = [clean(sentence) for sentence in sentences]

y = ds['sentiment']
y = np.array([1 if label == "positive" else 0 for label in y])

#x
#y

In [None]:
# create train (80%) and test (20%) datasets
# random_state pins the shuffle so the split (and every metric computed from it)
# is reproducible on Restart & Run All; stratify keeps the positive/negative
# ratio the same in both splits

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# fit the word-to-index dictionary on the training reviews only, then encode both
# splits as integer sequences of fixed length (maxlen 100, zeros appended at the end)

maxlen = 100

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=maxlen, padding='post')
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=maxlen, padding='post')

# adding 1 because of reserved 0 index
# NOTE(review): word_index lists every word seen during fitting, not only the
# num_words most frequent ones, so vocab_size is larger than the set of indices
# that can actually occur in the encoded sequences
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
#vocab_size
#x_train
#y_train

In [None]:
# create the feature matrix and provide a dense representation of words that capture something about their meaning
# use Stanford's GloVe 100d word embeddings
# source = https://nlp.stanford.edu/projects/glove/
# info = https://www.kaggle.com/danielwillgeorge/glove6b100dtxt?select=glove.6B.100d.txt

# parse the GloVe text file into a {word: float32 vector} lookup;
# each line is "<word> <v1> <v2> ... <vN>"
embeddings_dictionary = dict()

# the context manager releases the file handle even if a line fails to parse
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # tolerate blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype = 'float32')
        embeddings_dictionary[word] = vector_dimensions

# row i holds the GloVe vector of the word whose tokenizer index is i;
# words absent from GloVe (and the reserved index 0) keep an all-zero row
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
#######
# RNN #
#######

# create RNN model, train and evaluate

model = Sequential()
# embedding layer initialised from the pre-trained matrix and frozen (trainable = False)
embedding_layer = Embedding(vocab_size, 100, weights = [embedding_matrix], input_length = maxlen , trainable = False)
model.add(embedding_layer)
model.add(SimpleRNN(128))
# single sigmoid unit -> probability of the positive class
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output
model.summary()

# 20% of the training data is held out by fit() for per-epoch validation
history = model.fit(x_train, y_train, batch_size = 128, epochs = 6, verbose = 1, validation_split = 0.2)
score = model.evaluate(x_test, y_test, verbose=1)

# score[0] is the loss, score[1] the 'acc' metric requested in compile()
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
# plot RNN model results: one figure per metric, training curve vs. validation curve

for metric, label in (('acc', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='Train')
    # the val_* series is measured on the validation split held out during
    # fit(), not on the test set, so it is labelled accordingly
    ax.plot(history.history['val_' + metric], label='Validation')

    ax.set_title('RNN Model ' + label)
    ax.set_ylabel(label)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper left')
    plt.show()

In [None]:
########
# LSTM #
########

# create LSTM model, train and evaluate

model = Sequential()
# embedding layer initialised from the pre-trained matrix and frozen (trainable = False)
embedding_layer = Embedding(vocab_size, 100, weights = [embedding_matrix], input_length = maxlen , trainable = False)
model.add(embedding_layer)
model.add(LSTM(128))
# single sigmoid unit -> probability of the positive class
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output
model.summary()

# 20% of the training data is held out by fit() for per-epoch validation
history = model.fit(x_train, y_train, batch_size = 128, epochs = 6, verbose = 1, validation_split = 0.2)
score = model.evaluate(x_test, y_test, verbose=1)

# score[0] is the loss, score[1] the 'acc' metric requested in compile()
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
# plot LSTM model results: one figure per metric, training curve vs. validation curve

for metric, label in (('acc', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='Train')
    # the val_* series is measured on the validation split held out during
    # fit(), not on the test set, so it is labelled accordingly
    ax.plot(history.history['val_' + metric], label='Validation')

    ax.set_title('LSTM Model ' + label)
    ax.set_ylabel(label)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper left')
    plt.show()