In [1]:
import os
import tensorflow as tf

import pandas as pd 
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses

import zipfile

from pathlib import Path
import pickle
from collections import OrderedDict
import re

import nltk
from collections import Counter

In [2]:
np.random.seed(seed=1)

In [3]:
print(tf.__version__)


2.13.1


In [4]:
# make sure you run the baseline notebook that prepares this data
# from the original imdb dataset downloaded from the web.
df = pd.read_csv("./imdb_full_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,14149,I had two reasons for watching this swashbuckl...,0
1,8946,"This is, in my opinion, a very good film, espe...",1
2,22378,I knew this film was supposed to be so bad it ...,0
3,12162,"When the US entered World War I, the governmen...",1
4,4879,Few movies can be viewed almost 60 years later...,1


In [5]:
print("class one average", np.mean(df["label"]))

class one average 0.5


In [6]:
# downloading glove vector from the web to local:
# this may take a while
# inside of this there is only 1 file
# glove.42B.300d.txt
# according to the website:https://nlp.stanford.edu/projects/glove/ 
# this one is uncased. 

# PLEASE MAKE SURE YOU UNCOMMENT AND RUN THE 
# COMMAND BELOW WHEN RUNNING THE NOTEBOOK FOR 
# THE FIRST TIME OR REST OF NOTEBOOK WON'T WORK

# !wget https://nlp.stanford.edu/data/glove.42B.300d.zip ./glove.42B.300d.zip
 

In [7]:
# once downloaded unzip the file
# !unzip glove.42B.300d.zip

In [8]:
# we can also download other variations
# I downloaded this one in particular seems like it
# has various dimension sizes 50D, 100D etc...
# inside of it there are 4 files:
# glove.6B.100d.txt
# glove.6B.200d.txt
# glove.6B.300d.txt
# glove.6B.50d.txt

# WE MAY EXPERIMENT WITH SMALLER VECS LATER
# FOR NOW WE DON'T NEED TO DOWNLOAD THIS ONE

# !wget https://nlp.stanford.edu/data/glove.6B.zip ./glove.6B.zip

In [9]:
# check `glove.42B.300d.txt` and `imdb_full_dataset.csv` exists or see above comments 
!ls

aclImdb		   glove.42B.300d.txt  glove_vecs.bin
aclImdb_v1.tar.gz  glove.42B.300d.zip  imdb_full_dataset.csv
baseline.ipynb	   glove.6B.zip        using_glove.ipynb


In [10]:
# loading glove vectors into dictionary of word -> vec
# this may take a while for first run, once run
# it will cache to a local pickle file and will run faster

def _get_glove_index_dict(unzipped_file="glove.42B.300d.txt"):
    _LOCAL_FILE = "./glove_vecs.bin"
    local_file = Path(_LOCAL_FILE)
    
    if local_file.is_file():
        with open(_LOCAL_FILE, "rb") as fp:
            glove_index = pickle.load(fp)
        
        return glove_index
    
    glove_index = OrderedDict()
    
    with open(unzipped_file) as fp:
        for line in fp.read().splitlines():
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            glove_index[word] = coefs
    
    with open(_LOCAL_FILE, 'wb') as fp:
        pickle.dump(glove_index, fp, pickle.HIGHEST_PROTOCOL)
    
    return glove_index 

In [11]:
glove_index = _get_glove_index_dict()

In [12]:
print("num unique words in glove", len(glove_index))

num unique words in glove 1917494


In [13]:
def preprocess_text(input_text: str) -> str:
    s = input_text.lower()
    s = (s
         .replace('<br />', ' ')
         .replace('`', "'")
         .replace('´',"'")
         .replace(" '", ' "')
         .replace("-", " - ")
         .replace("/", " ")
         .replace("_", " ")
        )
    return s

In [14]:
df["text"] = df["text"].map(preprocess_text)

In [15]:
tokenizer_func = nltk.word_tokenize

In [16]:
def _tokenize_and_create_embedding_matrix(corpus, glove_index, tok_func, max_vocab=300000):
    
    token_dict = Counter()
    for text in corpus:
        tokens = tok_func(text)
        token_dict.update(tokens)
    
    print(f"found {len(token_dict)} unique tokens in corpus")
    
    tokens_glove = Counter()
    glove_oov = Counter()
    
    for k, v in token_dict.items():
        if k in glove_index:
            tokens_glove[k] = v
        else:
            glove_oov[k] = v
    
    print(f"{len(tokens_glove)} of them are in glove")
    
    vocab_counts = tokens_glove.most_common(max_vocab)
    
    vocabulary_index = {}
    
    for index, tup in enumerate(vocab_counts):
        word, _ = tup
        # 1 index is reserved for OOV
        # 0 index is reserved for padding
        vocabulary_index[word] = index + 2
    
    num_words = len(vocabulary_index) + 2
    
    # need to change this if other dim glove vecs are used
    embedding_dim = 300 
    embedding_matrix = np.random.uniform(-0.05, 0.05, size=(num_words, embedding_dim))
    
    embedding_matrix[0] = np.zeros(embedding_dim)
    
    for word, i in vocabulary_index.items():
        embedding_vector = glove_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # count the number of words not in embeddings_index
            raise ValueError("impossible!")
    
    return embedding_matrix, vocabulary_index, num_words, glove_oov

In [17]:
embedding_matrix, vocabulary_index, num_words, glove_oov = _tokenize_and_create_embedding_matrix(
    df["text"], 
    glove_index, 
    tokenizer_func,
)

found 80668 unique tokens in corpus
70393 of them are in glove


In [18]:
print(num_words)

70395


In [19]:
print(glove_oov.most_common(100))

[('*', 7061), ('..', 1624), ('\x96', 1338), ('......', 126), ('.......', 67), ('.the', 58), ('\x97', 55), ("'the", 42), ('........', 40), ('hackenstein', 30), ('mraovich', 28), ('.i', 24), ('.........', 23), ('a+', 22), ('b+', 21), ('hundstage', 21), ('protée', 20), ('maléfique', 19), ('.it', 19), ('volckman', 18), ('...........', 18), ('burgade', 18), ('\x8ei\x9eek', 18), ('.but', 16), ('..........', 15), ('soutendijk', 15), ('d+', 15), ('firode', 15), ('danelia', 14), ('·', 14), ('no.1', 13), ('1973.', 13), ('mcdoakes', 13), ('guetary', 13), ('scuddamore', 13), ('1971.', 13), ('.there', 12), ('bressart', 12), ('1959.', 12), ('polarisdib', 12), ('lassick', 12), ("rock'n'roll", 12), ('venantini', 12), ('mcphillip', 12), ('1979.', 11), ('goyokin', 11), ('1969.', 11), ('sjoman', 11), ('\x95', 11), ('unisols', 11), ("k'sun", 11), ('.............', 11), ("did'nt", 11), ('summersisle', 11), ('\x91the', 11), ('tetsurô', 11), ('pâquerette', 10), ('hickcock', 10), ('gwizdo', 10), ('yonica', 10

In [20]:
def texts_to_sequences(corpus, vocabulary_index, tok_func):
    corpus_tokens = []
    for text in corpus:
        tokens = tok_func(text)
        indicies = [vocabulary_index.get(x, 1) for x in tokens]
        corpus_tokens.append(indicies)
    return corpus_tokens

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], 
    df["label"], 
    test_size=0.2,
    random_state=1,
)

In [23]:
X_train = texts_to_sequences(text_train, vocabulary_index, tokenizer_func)
X_test = texts_to_sequences(text_test, vocabulary_index, tokenizer_func)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=512)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=512)

In [24]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=num_words, 
                     output_dim=300, 
                     input_length=512, 
                     weights=[embedding_matrix], 
                     trainable=True),
    layers.Dropout(0.9),
    layers.Dense(16, activation="relu"),
    layers.Dropout(0.3),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(1, activation="sigmoid")]
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 300)          21118500  
                                                                 
 dropout (Dropout)           (None, 512, 300)          0         
                                                                 
 dense (Dense)               (None, 512, 16)           4816      
                                                                 
 dropout_1 (Dropout)         (None, 512, 16)           0         
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                        

In [25]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.5))


In [26]:
epochs = 80
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=512,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80


Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [27]:
import json

with open('vocab_index.json', 'w') as fp:
    json.dump(vocabulary_index, fp)

In [29]:
model.save('glove_word_averaging_model.keras')

In [30]:
X_train = texts_to_sequences(text_train, vocabulary_index, tokenizer_func)
X_test = texts_to_sequences(text_test, vocabulary_index, tokenizer_func)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=256)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=256)

In [31]:
model = tf.keras.Sequential([
    layers.Embedding(input_dim=num_words, 
                     output_dim=300, 
                     input_length=256, 
                     weights=[embedding_matrix], 
                     trainable=True),
    layers.Dropout(0.9),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.1),
    layers.Dense(16, activation="relu"),
    layers.Dropout(0.1),
    layers.Dense(1, activation="sigmoid")]
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 256, 300)          21118500  
                                                                 
 dropout_3 (Dropout)         (None, 256, 300)          0         
                                                                 
 dense_2 (Dense)             (None, 256, 64)           19264     
                                                                 
 dropout_4 (Dropout)         (None, 256, 64)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 16)               

In [32]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.5))


In [33]:
epochs = 80
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=512,
)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80


Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [34]:
model.save('glove_lstm_model.keras')