### Reference
- https://github.com/WillKoehrsen/recurrent-neural-networks/blob/master/notebooks/Deep%20Dive%20into%20Recurrent%20Neural%20Networks.ipynb

In [10]:
import pandas as pd
import numpy as np

# Read the CSV file into a DataFrame (the path is relative to the working directory).
data = pd.read_csv('Users/vism/data/mpst_full_data.csv')
# NOTE(review): this bare expression is not the last line of the cell, so its preview
# is not rendered in the notebook output.
data.head()

# Extract the `plot_synopsis` column; later cells read it through `synopsis`.
synopsis = data['plot_synopsis']
# Number of synopsis entries (shown as the cell output).
len(synopsis)

14828

In [15]:
!pip install -r Users/vism/requirements.txt

Collecting charset-normalizer==3.1.0
  Using cached charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (195 kB)
Collecting ipykernel==6.22.0
  Using cached ipykernel-6.22.0-py3-none-any.whl (149 kB)
Collecting ipython==8.12.0
  Using cached ipython-8.12.0-py3-none-any.whl (796 kB)
Collecting joblib==1.2.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting keras==2.12.0
  Using cached keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
Collecting logger==1.4
  Using cached logger-1.4.tar.gz (1.2 kB)
Collecting matplotlib==3.7.1
  Using cached matplotlib-3.7.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.2 MB)
Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Collecting packaging==23.1
  Using cached packaging-23.1-py3-none-any.whl (48 kB)
Collecting pandas==2.0.1
  Using cached pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
Collecting 

In [11]:
## Global parameters
# Configuration constants defined once so later cells can refer to them by name.

RANDOM_STATE = 50  # seed passed to sklearn's shuffle in create_training_data
EPOCHS = 10  # number of epochs passed to model.fit
BATCH_SIZE = 128  # batch size passed to model.fit
MAX_WORDS = 10000  # NOTE(review): not referenced in the visible cells - confirm usage
EMBEDDING_DIM = 100  # NOTE(review): not referenced in the visible cells - confirm usage
MAX_LEN = 100  # NOTE(review): not referenced in the visible cells - confirm usage
VERBOSE = 1  # verbosity level passed to model.fit
SAVE_MODEL = True  # NOTE(review): not referenced in the visible cells - confirm usage


In [12]:
from keras.preprocessing.text import Tokenizer
# Small hand-written sample used to try out the tokenizer before running on real data.
sampleText = "This is a short sentence (1) with one reference to an image. This next sentence, while non-sensical, does not have image and has two commas."
# The characters listed in `filters` (which include '.' and ',') are handed to the
# Tokenizer as characters to filter out of the text.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts([sampleText])
# Encode the sample as integer ids, then map the ids back to words to inspect the result.
s = tokenizer.texts_to_sequences([sampleText])[0]
print(' '.join(tokenizer.index_word[i] for i in s))
# Words in the fitted vocabulary (shown as the cell output).
tokenizer.word_index.keys()

ModuleNotFoundError: No module named 'keras'

In [None]:
import re

def format_text(input):
    """Prepare raw text so punctuation can be tokenised as separate words.

    Steps, in order:
      1. Insert a space before each of ``. , ; ?`` when the preceding character is
         neither whitespace nor a digit (so decimals such as ``3.5`` stay intact).
      2. Remove parenthesised numeric references such as ``(1)``.
      3. Collapse every run of two or more whitespace characters into one space.

    The previous implementation finished by stripping whitespace in front of
    punctuation, which silently undid step 1, and it only collapsed whitespace
    in pairs (three spaces became two).

    Args:
        input: The text to format. The name shadows the built-in ``input`` but is
            kept so keyword callers keep working.

    Returns:
        The formatted string.
    """
    # Separate punctuation from the word it follows.
    input = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', input)
    # Remove references to figures, e.g. "(1)".
    input = re.sub(r'\((\d+)\)', r'', input)
    # Collapse whitespace runs of any length (also cleans up the gap left by step 2).
    input = re.sub(r'\s{2,}', ' ', input)
    return input
# Run the formatter on the sample sentence; the bare name shows the result as cell output.
f = format_text(sampleText)
f

In [None]:
# Re-fit a tokenizer on the formatted sample. Compared with the earlier filter string,
# ',', '-' and '.' are no longer listed, so they are not filtered out of the text.
tokenizer = Tokenizer(filters='!"#$%&()*+/:;<=>?@[\\]^_`{|}~\t\n', lower=True)
tokenizer.fit_on_texts([f])
# Encode to integer ids and decode back to words to inspect the tokenisation.
s = tokenizer.texts_to_sequences([f])[0]
print(' '.join(tokenizer.index_word[i] for i in s))
print(tokenizer.word_index.keys())

In [None]:
# Format only the first 10 synopses (note the [:10] slice), not the whole dataset.
formatted = [format_text(s) for s in synopsis[:10]]  
# Number of formatted texts (shown as the cell output).
len(formatted)

In [None]:
def make_sequences(texts, training_lengths=50, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    """Tokenise `texts` and cut them into fixed-length next-word training examples.

    A Tokenizer is fitted on `texts`, every text is encoded as a list of integer
    ids, and texts with `training_lengths` tokens or fewer are dropped. Each
    remaining sequence is scanned with a sliding window of
    `training_lengths + 1` tokens: the first `training_lengths` ids form the
    features and the last id is the label.

    Args:
        texts: Indexable collection of strings.
        training_lengths: Number of tokens in each feature sequence.
        lower: Forwarded to the Tokenizer.
        filters: Forwarded to the Tokenizer.

    Returns:
        Tuple of (training_sequences, labels, word2idx, idx2word, num_words,
        word_counts, new_texts, new_sequences), where `num_words` is
        ``len(word2idx) + 1`` and `new_texts` / `new_sequences` are the texts and
        encoded sequences that passed the length filter.
    """
    fitted = Tokenizer(lower=lower, filters=filters)
    fitted.fit_on_texts(texts)

    # Lookup tables and counts exposed by the fitted tokenizer.
    word2idx, idx2word = fitted.word_index, fitted.index_word
    word_counts = fitted.word_counts
    num_words = len(word2idx) + 1

    print(f'There are {num_words} unique words.')

    encoded = fitted.texts_to_sequences(texts)

    # Positions of the texts that are long enough to yield at least one example.
    keep = [pos for pos, seq in enumerate(encoded) if len(seq) > training_lengths]
    new_texts = [texts[pos] for pos in keep]
    new_sequences = [encoded[pos] for pos in keep]

    training_sequences, labels = [], []
    for seq in new_sequences:
        for end in range(training_lengths, len(seq)):
            # Window of training_lengths features followed by one label token.
            window = seq[end - training_lengths:end + 1]
            training_sequences.append(window[:-1])
            labels.append(window[-1])

    print(f'There are {len(training_sequences)} training sequences.')
    return (training_sequences, labels, word2idx, idx2word,
            num_words, word_counts, new_texts, new_sequences)

In [None]:
# Number of tokens in each feature sequence; also used as the model's input length.
TRAINING_LENGTH = 50
# Unlike make_sequences' default filter string, this one does not list ',', '-', '.',
# ';' or '?', so those characters are not filtered out of the text.
filters = '!"#$%&()*+/:<=>@[\\]^_`{|}~\t\n'
features, labels, word2idx, idx2word, num_words, word_counts, new_texts, new_sequences = make_sequences(formatted, TRAINING_LENGTH, lower=True, filters=filters)

In [None]:
# Index of the training example to inspect.
n=2
def find_answers(index):
    """Print the feature words and the label word of training example `index`.

    Reads the notebook-level `features`, `labels` and `idx2word`.
    """
    print('Features=' + ' '.join(idx2word[i] for i in features[index]))
    print('Label=' + idx2word[labels[index]])
find_answers(n)
# The first training examples are cut from the first text that passed the length
# filter in make_sequences, i.e. new_texts[0]; formatted[0] is only the same text
# when it was not filtered out.
print('Original Text' + new_texts[0][:400])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def create_training_data(features, labels, num_words, train_fraction=0.7):
    """Shuffle the examples and split them into training and validation arrays.

    Features and labels are shuffled together (seeded with RANDOM_STATE); the
    first `train_fraction` of the examples becomes the training set and the
    rest the validation set. Labels are one-hot encoded into int8 arrays with
    `num_words` columns.

    Args:
        features: Sequence of integer feature sequences.
        labels: Sequence of integer labels, aligned with `features`.
        num_words: Width of the one-hot label encoding.
        train_fraction: Fraction of examples assigned to the training set.

    Returns:
        Tuple (train_x, y_train, valid_x, y_valid) of numpy arrays.
    """
    features, labels = shuffle(features, labels, random_state=RANDOM_STATE)

    # Index at which the shuffled data is cut into train / validation.
    cut = int(len(features) * train_fraction)

    def one_hot(targets):
        # One row per target, with a single 1 in the column given by the label.
        encoded = np.zeros((len(targets), num_words), dtype=np.int8)
        for row, word in enumerate(targets):
            encoded[row, word] = 1
        return encoded

    train_x = np.array(features[:cut])
    valid_x = np.array(features[cut:])
    y_train = one_hot(labels[:cut])
    y_valid = one_hot(labels[cut:])

    return train_x, y_train, valid_x, y_valid

In [None]:
# Shuffle and split the examples: 70% for training, the remainder for validation.
train_x, train_y, valid_x, valid_y =  create_training_data(features, labels, num_words, train_fraction=0.7)

In [None]:
# Sanity check: features and labels should have matching lengths within each split.
len(train_x), len(train_y), len(valid_x), len(valid_y)  

In [None]:
# Shapes of the feature arrays for the training and validation splits.
print(train_x.shape)
print(valid_x.shape)

In [None]:
import os
from keras.utils import get_file
import numpy as np

# Vectors to use
# Raw strings keep the Windows backslashes literal. In a normal string `\p` and `\g`
# are invalid escape sequences, which Python warns about (the resulting path value is
# the same either way).
glove_vectors = r'c:\pre-trained-embeddings\glove.6B.zip'

# Download word embeddings if they are not present
# wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip

# Load in unzipped file (this rebinds glove_vectors to the extracted text file)
glove_vectors = r'c:\pre-trained-embeddings\glove.6B\glove.6B.100d.txt'
# Every field is read as a string; comments=None stops '#' from being treated as the
# start of a comment.
glove = np.loadtxt(glove_vectors, encoding='utf-8', dtype='str', comments=None)

In [None]:
# Column 0 of the loaded table holds the word; the remaining columns are cast to float
# and kept as the embedding vectors.
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]
# Drop the name of the string table now that it has been split into two arrays.
del glove

In [None]:
# Compare the size of the pre-trained table with the size of our own vocabulary.
print(vectors.shape)
print(words.shape)
print(num_words)

In [None]:
# Create the embedding matrix for the words in our vocabulary, using GloVe embeddings.
# Row r holds the vector of the word whose index in word2idx is r; rows for words
# without a GloVe vector (and row 0, which no word maps to) stay all zeros.
word_lookup = {word: vector for word, vector in zip(words, vectors)}
embedding_matrix = np.zeros((num_words, vectors.shape[1]))
not_found = 0
# Use the index stored in word2idx instead of the enumeration position + 1, so the
# row assignment does not depend on the iteration order of the dictionary.
for word, idx in word2idx.items():
    vector = word_lookup.get(word, None)
    if vector is not None:
        embedding_matrix[idx, :] = vector
    else:
        not_found += 1
print(f'{not_found} words not found.')

In [None]:
# Drop the name of the large GloVe array and ask the garbage collector to run.
# NOTE(review): word_lookup was built from rows of `vectors` and still exists, so the
# underlying memory may stay alive until word_lookup is deleted as well - confirm.
import gc
gc.enable()
del vectors
gc.collect()

In [None]:
# Shape of the embedding matrix (shown as the cell output).
embedding_matrix.shape

In [None]:
# Scale every row to unit L2 norm so that a dot product between two rows is their
# cosine similarity. All-zero rows have norm 0; dividing only where the norm is
# non-zero leaves them as zeros directly, instead of producing NaN (with a
# RuntimeWarning) and converting it back to 0 afterwards.
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
embedding_matrix = np.divide(
    embedding_matrix,
    row_norms,
    out=np.zeros_like(embedding_matrix),
    where=row_norms != 0,
)

In [None]:
def find_closest(query, embedding_matrix=embedding_matrix, word2idx=word2idx, idx2word=idx2word, n=10):
    """Print the `n` vocabulary words whose embeddings score highest against `query`.

    The score is the dot product between the query's row and every row of
    `embedding_matrix`; it is reported as cosine similarity, which assumes the
    rows have been scaled to unit length beforehand.

    Args:
        query: Word to look up.
        embedding_matrix: 2-D array with one row per word index.
        word2idx: Mapping from word to row index.
        idx2word: Mapping from row index to word.
        n: Maximum number of neighbours to print.

    Returns:
        None. A message is printed instead when `query` is not in `word2idx`
        or its embedding row is all zeros.
    """
    idx = word2idx.get(query, None)
    if idx is None:
        print(f'{query} not found in vocab.')
        return None

    vector = embedding_matrix[idx]
    if np.all(vector == 0):
        print(f'{query} has no pre-trained embedding.')
        return None

    dist = np.dot(embedding_matrix, vector)

    # Walk the rows from highest to lowest score, keeping only indices that map to a
    # word. Rows such as index 0 have no entry in idx2word and previously raised a
    # KeyError whenever they ranked inside the top n.
    idxs = []
    for i in np.argsort(dist)[::-1]:
        if len(idxs) >= n:
            break
        if int(i) in idx2word:
            idxs.append(int(i))
    sorted_dist = dist[idxs]
    closest = [idx2word[i] for i in idxs]

    print(f'Query: {query}\n')
    # default=0 keeps the call safe when no neighbour was collected.
    max_len = max((len(word) for word in closest), default=0)
    for word, similarity in zip(closest, sorted_dist):
        print(f'{word:{max_len + 2}} Cosine similarity {similarity:.4f}')
    
# Spot-check the embeddings with two example queries, separated by a divider line.
find_closest('the')  
print('-'*100)
find_closest('movie') 

In [None]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from keras.optimizers import Adam
from keras.utils import plot_model

# Vocabulary size used for the embedding input and the output layer of the model.
print(num_words)

In [None]:
def create_model(lstms=1, lstm_cells=64):
    """Build and compile the word-level next-word prediction model.

    Architecture: frozen embedding layer initialised from the notebook-level
    `embedding_matrix` -> masking -> `lstms` stacked LSTM layers -> Dense(128,
    relu) -> Dropout(0.5) -> softmax over `num_words` classes.

    Two defects of the earlier version are fixed here:
      * the hidden Dense layer used a softmax activation, which squashes the 128
        hidden units into a probability distribution before the real output
        layer; it now uses relu and softmax is applied only at the output;
      * the loop added `lstms` sequence-returning layers *plus* the final LSTM,
        so `lstms=2` built three LSTM layers; `lstms` is now the total count.

    Args:
        lstms: Total number of LSTM layers (values below 1 still yield one layer).
        lstm_cells: Number of units in each LSTM layer.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam with
        learning rate 0.001, accuracy metric).
    """
    model = Sequential()
    # Pre-trained, non-trainable embeddings; reads num_words, embedding_matrix and
    # TRAINING_LENGTH from the notebook namespace.
    model.add(Embedding(num_words, embedding_matrix.shape[1], input_length=TRAINING_LENGTH, weights=[embedding_matrix], trainable=False))
    # Mask timesteps whose embedding vector is entirely 0.0.
    model.add(Masking(mask_value=0.0))
    # All LSTM layers except the last must return full sequences so the next
    # LSTM layer receives one vector per timestep.
    for _ in range(lstms - 1):
        model.add(LSTM(lstm_cells, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
    model.add(LSTM(lstm_cells, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return model

# Build the model with lstms=2 and 128 units per LSTM layer.
model = create_model(lstms=2, lstm_cells=128)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train on the training split and evaluate on the validation split after each epoch;
# epochs, batch size and verbosity come from the global parameter cell. The returned
# object is kept in `history`.
history = model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    validation_data=(valid_x, valid_y))