In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the IMDB reviews CSV inside the Kaggle input mount.
IMDB_CSV_PATH = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# Raw frame; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv(IMDB_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Encode the sentiment string as a binary target: 1 for 'positive', 0 for everything else.
is_positive = df['sentiment'] == 'positive'
df = df.assign(label=is_positive.astype('int64'))
df.head()

In [None]:
import re
# Strip HTML markup from the raw reviews.
# Each tag is replaced by a space rather than by nothing: deleting a tag such as
# "<br />" outright glues the words on either side of it together
# ("good<br /><br />The" -> "goodThe"), which creates bogus tokens downstream.
# The split/join then collapses the resulting whitespace runs to single spaces
# and trims the ends, so the cleaned text contains no empty "words".
html_tag_re = re.compile('<[^<]+?>')
df['review_clean'] = df['review'].map(
    lambda x: ' '.join(html_tag_re.sub(' ', x).split())
)

In [None]:
# Preview the frame again to inspect the new 'review_clean' column.
df.head()

In [None]:
# Number of words in each cleaned review.
# str.split() with no argument splits on any run of whitespace and drops empty
# strings, whereas split(' ') counts an extra "word" for every repeated space
# and treats newlines/tabs as part of a word.
df['review_clean_words_count'] = df['review_clean'].map(lambda x: len(x.split()))
df.head()

In [None]:
# Summary statistics of the frame (e.g. for the label and word-count columns added above).
df.describe()

In [None]:
# Distribution of review lengths in words, in 5 equal-width bins over 0-1200.
# Reviews whose word count falls outside `range` are left out of the histogram.
plt.hist(df['review_clean_words_count'], bins=5, range=(0, 1200))
plt.title('Review length distribution')
plt.xlabel('Words per review')
plt.ylabel('Number of reviews')
# Render the figure explicitly so the cell shows the plot rather than the raw
# (counts, bin_edges, patches) tuple returned by plt.hist.
plt.show()

In [None]:
# Number of rows in the dataset (length of the 'sentiment' column).
df['sentiment'].shape[0]

In [None]:
# Pull the cleaned review strings and their 0/1 targets out of the frame as plain lists.
texts, labels = (df[column].to_list() for column in ('review_clean', 'label'))

In [None]:
import keras
from keras.preprocessing import text, sequence

# Text-encoding hyper-parameters, reused by the padding step and the Embedding layers below.
max_words = 10_000  # vocabulary cap passed to the Tokenizer as num_words
maxlen = 500        # sequence length used for padding and as the Embedding input_length

# Build the vocabulary from the cleaned reviews, then encode each review as a
# list of integer word indices.
# NOTE(review): the tokenizer is fitted on every review, i.e. before the
# train/validation/test split is made further down.
tokenizer = text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Report how many distinct tokens the tokenizer recorded in its word index.
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('Found %s unique tokens.' % vocabulary_size)

# Bring every encoded review to a common length of `maxlen` entries.
data = sequence.pad_sequences(sequences=sequences, maxlen=maxlen)

In [None]:
# The padded reviews are already an array; report its shape first.
print('Shape of data tensor:', data.shape)

# Convert the label list to a NumPy array so it can be indexed alongside `data`.
labels = np.asarray(labels)
print('Shape of label tensor:', labels.shape)

In [None]:
# Shuffle reviews and labels with one shared permutation, then carve out the
# train / validation / test partitions.
# The permutation comes from a seeded generator so that a fresh
# "Restart & Run All" always produces the same split. Re-running only this cell
# permutes the already-shuffled arrays again.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

training_samples = 24000
validation_samples = 6000
test_samples = 20000

# Slicing past the end of an array silently returns a shorter split, so check
# up front that the requested sizes fit in the dataset.
requested_samples = training_samples + validation_samples + test_samples
assert requested_samples <= data.shape[0], (
    'split sizes add up to %d but only %d samples are available'
    % (requested_samples, data.shape[0])
)

# Cumulative boundaries of the three consecutive slices.
train_end = training_samples
val_end = train_end + validation_samples
test_end = val_end + test_samples

X_train = data[:train_end]
y_train = labels[:train_end]

X_val = data[train_end:val_end]
y_val = labels[train_end:val_end]

X_test = data[val_end:test_end]
y_test = labels[val_end:test_end]

In [None]:
# Sanity check: print the shape of each feature/label array, in train, val, test order.
for split_array in (X_train, y_train, X_val, y_val, X_test, y_test):
    print(split_array.shape)


In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss for one training run.

    Two figures are drawn: accuracy per epoch on the current figure, then loss
    per epoch on a new figure. Training values are shown as dots, validation
    values as a line.

    Args:
        history: object with a ``history`` mapping (as returned by ``fit``)
            holding per-epoch lists under ``'loss'``, ``'val_loss'`` and the
            accuracy metric. The accuracy series is looked up under ``'acc'`` /
            ``'val_acc'`` first and falls back to ``'accuracy'`` /
            ``'val_accuracy'``, so the helper works whichever of the two names
            the model was compiled with.

    Raises:
        KeyError: if a required series is missing from ``history.history``.
    """
    logs = history.history

    def pick(*candidates):
        # Return the series stored under the first candidate key that exists.
        for key in candidates:
            if key in logs:
                return logs[key]
        raise KeyError(
            'none of %r found in history (available: %r)'
            % (candidates, sorted(logs))
        )

    acc = pick('acc', 'accuracy')
    val_acc = pick('val_acc', 'val_accuracy')
    loss = logs['loss']
    val_loss = logs['val_loss']

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Start a second figure so the loss curves do not share axes with accuracy.
    plt.figure()

    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Set the embedding layer
from keras import models, layers

# Baseline classifier: a 128-dimensional embedding per token, flattened and fed
# through one hidden dense layer into a single sigmoid unit (binary label).
model = models.Sequential([
    layers.Embedding(max_words, 128, input_length=maxlen),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.summary()

In [None]:
# Train the baseline for 10 epochs, tracking loss/accuracy on the validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Learning curves of the embedding + dense baseline.
plot_history(history)

In [None]:
# Score the baseline on the held-out test split (loss plus the compiled 'acc' metric).
model.evaluate(X_test, y_test)

Let's try a simple RNN now.

In [None]:
# Recurrent variant: 32-dimensional embeddings read by a single SimpleRNN layer
# of 32 units, followed by a sigmoid output unit.
simple_rnn = models.Sequential([
    layers.Embedding(max_words, 32, input_length=maxlen),
    layers.SimpleRNN(32),
    layers.Dense(1, activation='sigmoid'),
])

simple_rnn.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

simple_rnn.summary()

In [None]:
# Train the SimpleRNN model for 10 epochs with batches of 128 reviews.
simple_rnn_history = simple_rnn.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Learning curves of the SimpleRNN model.
plot_history(simple_rnn_history)

In [None]:
# Score the SimpleRNN model on the held-out test split.
simple_rnn.evaluate(X_test, y_test)

LSTM

In [None]:
# Same layout as the SimpleRNN model, with the recurrent layer swapped for an LSTM.
lstm = models.Sequential([
    layers.Embedding(max_words, 32, input_length=maxlen),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid'),
])

lstm.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

lstm.summary()

In [None]:
# Train the single-layer LSTM for 10 epochs with batches of 128 reviews.
lstm_history = lstm.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Learning curves of the single-layer LSTM.
plot_history(lstm_history)

In [None]:
# Score the single-layer LSTM on the held-out test split.
lstm.evaluate(X_test, y_test)

What about a stacked LSTM with dropout?

In [None]:
# Two stacked LSTM layers with input dropout (0.2) and recurrent dropout (0.5).
# The first LSTM returns its full output sequence so the second one has a
# sequence to read.
# Rebinding `lstm` here replaces the single-layer LSTM model built earlier.
# NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout is not
# eligible for the cuDNN kernel, so training may be slow - confirm on the
# target runtime.
lstm = models.Sequential([
    layers.Embedding(max_words, 32, input_length=maxlen),
    layers.LSTM(32, dropout=0.2, recurrent_dropout=0.5, return_sequences=True),
    layers.LSTM(32, dropout=0.2, recurrent_dropout=0.5),
    layers.Dense(1, activation='sigmoid'),
])

lstm.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

lstm.summary()

In [None]:
# Train the model currently bound to `lstm` for 30 epochs with batches of 128
# reviews; the result overwrites the previous `lstm_history`.
lstm_history = lstm.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=30,
    validation_data=(X_val, y_val),
)

In [None]:
# Learning curves of the most recent `lstm` training run.
plot_history(lstm_history)

In [None]:
# Score the model currently bound to `lstm` on the held-out test split.
lstm.evaluate(X_test, y_test)