In [None]:
import pandas as pd
import nltk
# Fetch the NLTK stopword corpus that the cleaning cell reads through nltk.corpus.stopwords
nltk.download('stopwords')

In [None]:
# Load the dataset from a CSV in the working directory; later cells read its
# 'review' and 'sentiment' columns
df = pd.read_csv('imdb_dataset.csv')
# Preview the first 20 rows
df.head(20)

## Cleaning the data

Let's:
- Convert everything to lowercase
- Remove HTML tags
- Smartly remove punctuation
- Remove stopwords (as they don't truly contribute to the sentiment of the sentence)

In [None]:
from nltk.corpus import stopwords
import string

# English stopword list from NLTK; clean_df drops every token found in it
stop_words = stopwords.words('english')
# Punctuation characters from the standard-library string module; clean_df replaces them with spaces
punctuations = string.punctuation

def clean_df(df):
    """Normalise the text in ``df['review']`` and return ``df``.

    The 'review' column is overwritten on the DataFrame that was passed in
    (the same object is returned), with each review:

    1. lower-cased;
    2. stripped of HTML tags (each tag becomes a space so that the words on
       either side of e.g. ``<br />`` are not glued together);
    3. stripped of punctuation, except that ``.``, ``,`` and ``-`` are kept
       when a digit sits directly before or after them (so ``7.5`` survives);
    4. stripped of stopwords, with tokens re-joined by single spaces.

    Args:
        df: DataFrame with a string column named 'review'.

    Returns:
        The same DataFrame, with 'review' replaced by the cleaned text.
    """
    # Characters whose removal is decided by the digit-aware regex below
    digit_aware = '.,-'

    reviews = df['review'].str.lower()
    reviews = reviews.str.replace('<[^>]*>', ' ', regex=True)

    # Raw string so that \d is a regex escape rather than an (invalid) Python string escape
    reviews = reviews.str.replace(r'(?<!\d)[.,-](?!\d)', ' ', regex=True)

    for p in punctuations:
        # Skip . , - here: a blanket replacement would undo the digit-aware rule above
        if p in digit_aware:
            continue
        # regex=False: every character is a literal, whatever the installed pandas default is
        reviews = reviews.str.replace(p, ' ', regex=False)

    # Set membership is O(1); the stopword list would be scanned once per token otherwise
    stop_set = set(stop_words)
    df['review'] = reviews.apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_set)
    )

    return df

# Clean the reviews. clean_df writes to the 'review' column of the frame it is
# given and returns that same frame, so df is updated either way.
df = clean_df(df)

In [None]:
# Tokenise every cleaned review on single spaces, discarding empty tokens.
# The result is a list of token lists (one inner list per review), which is
# what the Word2Vec cell is trained on.
sentences = [
    [token for token in review.split(' ') if token != '']
    for review in df['review']
]

## Embed, encode and pad the data

I will use Word2Vec to create the embeddings, then encode and pad the reviews and encode the labels.


In [None]:
# Import the Word2Vec embedding model and train it on the tokenised reviews.
# vector_size=100 is the embedding width, window=5 the context window, and
# min_count=10 leaves words seen fewer than 10 times out of the vocabulary
# (generate_sequence later skips any word without a key_to_index entry).
# NOTE(review): no seed is passed and workers=8, so the embeddings presumably
# differ from run to run - confirm whether reproducibility matters here.
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences, vector_size=100, window=5, min_count=10, workers=8)

In [None]:
# Sanity check: the words whose vectors are most similar to 'good'
word2vec.wv.most_similar(positive='good')

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random
import numpy as np

# Embedding matrix learned by Word2Vec; it is reused later as the weights of the Keras Embedding layer
w2v_weights = word2vec.wv.vectors
# Rows are vocabulary entries, columns are embedding dimensions
vocab_size, embedding_size = w2v_weights.shape

print("Vocabulary Size: {} - Embedding Dim: {}".format(vocab_size, embedding_size))

In [None]:
# A helper function to visualize embeddings in a 2D space
# Shivom - Find a way to reuse this in other projects
def visualize_embeddings(w2v_model, n_samples=500):
    """Scatter-plot a random sample of a Word2Vec model's word vectors in 2D.

    The sampled vectors are projected to two dimensions with t-SNE and drawn
    on a 32x32 figure, each point annotated with its word.

    Args:
        w2v_model: trained model exposing ``wv.index_to_key`` and
            ``wv.get_vector``; nothing else on it is read.
        n_samples: number of words to plot. Capped at the model's vocabulary
            size so that small vocabularies do not make ``random.sample`` raise.
    """
    # Size the sample from the model that was passed in rather than from the
    # notebook-level vocab_size, so the helper works for any model.
    n_words = len(w2v_model.wv.index_to_key)
    n_samples = min(n_samples, n_words)

    # Sample random words from model dictionary
    random_i = random.sample(range(n_words), n_samples)
    random_w = [w2v_model.wv.index_to_key[i] for i in random_i]

    # Generate Word2Vec embeddings of each word
    word_vecs = np.array([w2v_model.wv.get_vector(w) for w in random_w])

    # Apply t-SNE to Word2Vec embeddings, reducing to 2 dims
    tsne = TSNE()
    tsne_e = tsne.fit_transform(word_vecs)

    # Plot t-SNE result; colours only encode sampling order
    plt.figure(figsize=(32, 32))
    plt.scatter(tsne_e[:, 0], tsne_e[:, 1], marker='o', c=range(len(random_w)), cmap=plt.get_cmap('Spectral'))

    for label, x, y in zip(random_w, tsne_e[:, 0], tsne_e[:, 1]):
        plt.annotate(label,
                     xy=(x, y), xytext=(0, 15),
                     textcoords='offset points', ha='right', va='bottom',
                     bbox=dict(boxstyle='round, pad=0.2', fc='yellow', alpha=0.1))

In [None]:
# Plot a random sample of the learned embeddings (n_samples defaults to 500)
visualize_embeddings(word2vec)

In [None]:
# Let's encode and pad the sentences with Word2Vec, and the sentiment with a label encoder

from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Sequences will be padded or truncated to this length; it is also the input
# length declared for the model further down
MAX_SEQUENCE_LENGTH = 200

def generate_sequence(df):
    """Yield ``(token_ids, sentiment)`` for every row of ``df``.

    Each review is split on single spaces, words missing from the Word2Vec
    vocabulary are dropped, and the remaining words are mapped to their
    ``key_to_index`` ids. At most the first ``MAX_SEQUENCE_LENGTH`` ids are kept.

    Args:
        df: DataFrame with 'review' (cleaned text) and 'sentiment' columns.

    Yields:
        Tuple of (numpy array of word ids, the row's 'sentiment' value).
    """
    key_to_index = word2vec.wv.key_to_index

    # Iterate over the two columns directly; iterrows builds a Series per row, which is much slower
    for review, sentiment in zip(df['review'], df['sentiment']):
        sentence = [key_to_index[w] for w in review.split(' ') if w != '' and w in key_to_index]
        # Truncate after the vocabulary filter, so out-of-vocabulary words do not
        # use up the length budget. Capping here (rather than leaving it to
        # pad_sequences) keeps the start of the review; pad_sequences truncates
        # from the front by default.
        yield np.array(sentence[:MAX_SEQUENCE_LENGTH]), sentiment

# Collect the encoded reviews and their labels into two parallel lists
review_set, sentiment_set = [], []
for token_ids, label in generate_sequence(df):
    review_set.append(token_ids)
    sentiment_set.append(label)

# Left-pad every sequence with 0 up to MAX_SEQUENCE_LENGTH.
# NOTE(review): the ids come straight from word2vec.wv.key_to_index, which
# appears to be 0-based, so id 0 would be a real word that cannot be told apart
# from padding (and the model's Embedding uses mask_zero=True). Confirm, and
# consider shifting the ids by one together with the embedding matrix.
review_set = pad_sequences(
    review_set,
    value=0,
    padding='pre',
    maxlen=MAX_SEQUENCE_LENGTH,
)

# Map the sentiment labels to integer class codes
label_encoder = LabelEncoder()
sentiment_set = label_encoder.fit_transform(sentiment_set)

## Splitting the dataset

In [None]:
# Split the dataset into train and val

import random

TRAIN_TEST_SPLIT = 0.15  # fraction of the samples held out for validation
SPLIT_SEED = 42          # fixed seed so the split is identical on every run

total_samples = df.shape[0]
n_val = int(TRAIN_TEST_SPLIT * total_samples)
n_train = total_samples - n_val

# Shuffle all row positions with a private seeded generator, which leaves the
# global `random` state untouched, then cut the permutation into train / val.
random_i = random.Random(SPLIT_SEED).sample(range(total_samples), total_samples)
train_x = review_set[random_i[:n_train]]
train_y = sentiment_set[random_i[:n_train]]
val_x = review_set[random_i[n_train: n_train + n_val]]
val_y = sentiment_set[random_i[n_train: n_train + n_val]]

print("Train Shapes - X: {} - Y: {}".format(train_x.shape, train_y.shape))
print("Val Shapes - X: {} - Y: {}".format(val_x.shape, val_y.shape))


def _plot_category_distribution(labels, title):
    """Bar-chart how many samples of each encoded class ``labels`` contains.

    Args:
        labels: array of integer class codes produced by ``label_encoder``.
        title: figure title.

    Returns:
        The ``(categories, counts)`` pair computed by ``np.unique``.
    """
    categories, ccount = np.unique(labels, return_counts=True)
    plt.figure(figsize=(16, 8))
    plt.title(title)
    # Tick labels come from the fitted encoder (the previously used cat_dict was
    # never defined). Indexing by `categories` keeps ticks, names and bars
    # aligned even if a class is absent from this subset.
    plt.xticks(categories, label_encoder.classes_[categories])
    plt.bar(categories, ccount, align='center')
    plt.show()
    return categories, ccount


# Let's look at the distribution of categories in both sets
categories, ccount = _plot_category_distribution(train_y, "Training Set - Category Distribution")
categories, ccount = _plot_category_distribution(val_y, "Validation Set - Category Distribution")

n_categories = len(categories)

## Creating the model, compiling it and training

- I will create a Sequential model with an Input layer, an Embedding layer and a Dropout layer.
- Next, we have a bidirectional LSTM (studies show they work better than just LSTMs).
- And then a Dense output layer with 1 sigmoid neuron (the output is a value between 0 and 1).

In [None]:
from keras.models import Sequential
from keras.layers import Activation, Bidirectional, Dense, Dropout, Embedding, Input, LSTM

# Architecture: frozen Word2Vec embedding -> dropout -> bidirectional LSTM -> one sigmoid unit.
# NOTE(review): mask_zero=True makes the Embedding treat id 0 as padding; if
# id 0 is also a real vocabulary entry, that word is masked too - confirm
# against the encoding cell.
model = Sequential([
    Input(shape=(MAX_SEQUENCE_LENGTH,)),
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        weights=[w2v_weights],
        mask_zero=True,
        trainable=False,
    ),
    Dropout(0.5),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Let's compile the model with the adam optimizer and the binary_crossentropy loss function because our output will be binary.
# For metrics we will observe accuracy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Baseline training run. It has to execute: the plotting cell that follows
# reads `history`, which is otherwise undefined on a fresh kernel.
history = model.fit(train_x, train_y, epochs=10, batch_size=32, validation_data=(val_x, val_y), verbose=1)

In [None]:
# Plotting Loss and Accuracy Graphs
# One figure per metric, each overlaying the training curve and the validation curve
for train_key, val_key, chart_title in (
    ('loss', 'val_loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
):
    plt.figure(figsize=(6, 6))
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(chart_title)
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

In [None]:
# Let's automate the hyperparameter tuning process and come back later
from keras.models import clone_model

epoch_options = [5, 10, 20]
batch_sizes = [16, 32, 64, 128]
tuning_results = []


def _fresh_model(template):
    """Return a newly initialised, compiled copy of ``template``.

    clone_model rebuilds the architecture with fresh weights, so the weights
    of frozen layers (the pretrained embedding) are copied across explicitly.
    """
    fresh = clone_model(template)
    for src_layer, dst_layer in zip(template.layers, fresh.layers):
        if not src_layer.trainable:
            dst_layer.set_weights(src_layer.get_weights())
    fresh.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return fresh


for epochs in epoch_options:
    for batch_size in batch_sizes:
        print(f"Epochs: {epochs}, Batch Size: {batch_size}")
        # Train every configuration from scratch. Re-fitting the shared `model`
        # would continue from the previous configuration's weights and make
        # the runs incomparable.
        candidate = _fresh_model(model)
        history = candidate.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, validation_data=(val_x, val_y), verbose=1)
        tuning_results.append([epochs, batch_size, history])

In [None]:
import json

# Make the tuning results JSON-serialisable: each entry is
# [epochs, batch_size, History], and the History object is swapped for its
# plain `.history` metrics dict.
# A comprehension keeps the loop variables local, so the notebook-level
# `history` (read by the plotting cell) is not rebound to a list.
res = [[epochs, batch_size, run.history] for epochs, batch_size, run in tuning_results]

In [None]:
# Persist the tuning metrics as JSON in the working directory
with open('res1.json', 'w') as out_file:
    serialized = json.dumps(res)
    out_file.write(serialized)

# Next steps

1. Publish the results of the 'quick' optimization for-loops
2. Look at hyperparameter optimization techniques
    - Grid Search
    - Random Search
    - Bayesian Optimization