In [1]:
import os
import tensorflow as tf

import pandas as pd 
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses

import zipfile

from pathlib import Path
import pickle
from collections import OrderedDict


In [2]:
# Print the installed TensorFlow version.
print(tf.__version__)


2.13.1


In [3]:
# make sure you run the baseline notebook that prepares this data
# from the original imdb dataset downloaded from the web.
# NOTE(review): the preview below shows "Unnamed: 0"-style columns, which looks
# like a DataFrame index that was written into the CSV -- confirm, and if so
# consider read_csv(..., index_col=0) or saving with index=False upstream.
df = pd.read_csv("./imdb_full_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,14149,I had two reasons for watching this swashbuckl...,0
1,8946,"This is, in my opinion, a very good film, espe...",1
2,22378,I knew this film was supposed to be so bad it ...,0
3,12162,"When the US entered World War I, the governmen...",1
4,4879,Few movies can be viewed almost 60 years later...,1


In [4]:
# Fraction of reviews carrying label 1 (the mean of the 0/1 label column).
print("class one average", df["label"].mean())

class one average 0.5


In [5]:
# Download the GloVe vectors from the web to the local directory
# (this may take a while).
# The archive contains a single file:
# glove.42B.300d.txt
# According to the website (https://nlp.stanford.edu/projects/glove/),
# this set is uncased.

# PLEASE MAKE SURE YOU UNCOMMENT AND RUN THE 
# COMMAND BELOW WHEN RUNNING THE NOTEBOOK FOR 
# THE FIRST TIME OR REST OF NOTEBOOK WON'T WORK

# !wget https://nlp.stanford.edu/data/glove.42B.300d.zip -O ./glove.42B.300d.zip
 

In [6]:
# once downloaded unzip the file
# !unzip glove.42B.300d.zip

In [7]:
# Other GloVe variants can be downloaded as well.
# I downloaded this one in particular because it comes in
# several dimension sizes (50D, 100D, etc.).
# The archive contains 4 files:
# glove.6B.100d.txt
# glove.6B.200d.txt
# glove.6B.300d.txt
# glove.6B.50d.txt

# WE MAY EXPERIMENT WITH SMALLER VECS LATER
# FOR NOW WE DON'T NEED TO DOWNLOAD THIS ONE

# !wget https://nlp.stanford.edu/data/glove.6B.zip -O ./glove.6B.zip

In [8]:
# check that `glove.42B.300d.txt` and `imdb_full_dataset.csv` exist; if not, see the comments above
!ls

aclImdb		   glove.42B.300d.txt  glove_vecs.bin
aclImdb_v1.tar.gz  glove.42B.300d.zip  imdb_full_dataset.csv
baseline.ipynb	   glove.6B.zip        using_glove.ipynb


In [9]:
# loading glove vectors into dictionary of word -> vec
# this may take a while for first run, once run
# it will cache to a local pickle file and will run faster

def _get_glove_index_dict(unzipped_file="glove.42B.300d.txt"):
    _LOCAL_FILE = "./glove_vecs.bin"
    local_file = Path(_LOCAL_FILE)
    
    if local_file.is_file():
        with open(_LOCAL_FILE, "rb") as fp:
            glove_index = pickle.load(fp)
        
        return glove_index
    
    glove_index = OrderedDict()
    
    with open(unzipped_file) as fp:
        for line in fp.read().splitlines():
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            glove_index[word] = coefs
    
    with open(_LOCAL_FILE, 'wb') as fp:
        pickle.dump(glove_index, fp, pickle.HIGHEST_PROTOCOL)
    
    return glove_index 

In [10]:
# Parse the GloVe file, or load the pickle cache if it already exists (see _get_glove_index_dict).
glove_index = _get_glove_index_dict()

In [11]:
# Number of entries in the word -> vector mapping.
print(f"num unique words in glove {len(glove_index)}")

num unique words in glove 1917494


In [12]:
def preprocess_text(input_text: str) -> str:
    """Lowercase a review and replace every ``<br />`` tag with a single space."""
    return input_text.lower().replace('<br />', ' ')

In [13]:
# Normalise every review in place (lowercase, <br /> tags replaced by spaces).
df["text"] = df["text"].apply(preprocess_text)

In [14]:
def _tokenize_and_create_embedding_matrix(corpus, glove_index, embedding_dim=None):
    """Fit a Keras tokenizer on ``corpus`` and build a GloVe-initialised embedding matrix.

    Args:
        corpus: Texts passed to ``Tokenizer.fit_on_texts``.
        glove_index: Mapping ``word -> vector``; queried with ``.get(word)``.
        embedding_dim: Number of columns of the embedding matrix. When ``None``
            it is taken from the first vector in ``glove_index`` (falling back
            to 300 if the mapping is empty), so other GloVe sizes work without
            editing this function.

    Returns:
        Tuple ``(embedding_matrix, tok, num_words, glove_oov)``:
        ``embedding_matrix`` has shape ``(num_words, embedding_dim)``; rows of
        words found in ``glove_index`` hold the GloVe vector, all other rows
        keep their uniform(-0.05, 0.05) initialisation. ``tok`` is the fitted
        tokenizer, ``num_words`` is ``len(tok.word_index) + 1`` and
        ``glove_oov`` lists the tokenizer words missing from ``glove_index``
        (this includes the ``<UNK>`` token when GloVe has no such entry).
    """
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=1000000, oov_token="<UNK>")
    tok.fit_on_texts(corpus)

    # word_index is 1-based, so one extra row is needed for index 0, which is
    # never assigned to a word and is what pad_sequences fills padding with.
    num_words = len(tok.word_index) + 1

    if embedding_dim is None:
        first_vector = next(iter(glove_index.values()), None)
        embedding_dim = 300 if first_vector is None else len(first_vector)

    embedding_matrix = np.random.uniform(-0.05, 0.05, size=(num_words, embedding_dim))

    # Corpus words without a pretrained vector keep their random row.
    glove_oov = []

    for word, i in tok.word_index.items():
        embedding_vector = glove_index.get(word)
        if embedding_vector is None:
            glove_oov.append(word)
        else:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix, tok, num_words, glove_oov

In [15]:
# Fit the tokenizer on the cleaned reviews and build the matching embedding matrix.
(
    embedding_matrix,
    tok,
    num_words,
    glove_oov,
) = _tokenize_and_create_embedding_matrix(df["text"], glove_index)

In [16]:
# How many tokenizer words had no GloVe vector.
print(len(glove_oov), "words from the corpus didn't exist in glove vecs")

20822 words from the corpus didn't exist in glove vecs


In [17]:
# Inspect the first 60 corpus words that have no GloVe vector.
print(glove_oov[:60])

['<UNK>', "isn't", "wasn't", "couldn't", '\x96', "they're", "won't", "wouldn't", "film's", "aren't", "haven't", "who's", "let's", "'the", "we're", "weren't", "movie's", "80's", "you'd", "hasn't", "man's", "shouldn't", "70's", "character's", "hadn't", "today's", "here's", "we've", "director's", "they've", "one's", "would've", "he'd", "ain't", "father's", "children's", "people's", "could've", "90's", "60's", "woman's", "they'd", "world's", "they'll", "50's", "show's", "1950's", "we'll", "he'll", "girl's", "mother's", "it'll", "hollywood's", "characters'", "someone's", "everyone's", "1970's", "disney's", "king's", "family's"]


In [18]:
# The OOV list above is dominated by contractions and possessives
# ("isn't", "film's", "they're", ...), which seems strange at first.
# We probably need to split off pieces such as `'s`, `'ll`, `n't`:
# those pieces do exist in the GloVe vectors (confirmed by the lookups below),
# so "wouldn't" should be tokenized to ["would", "n't"].
# TODO: revisit the tokenization; the default Keras tokenizer keeps these words whole.

# Sanity check: each lookup raises KeyError if the piece is missing from GloVe.
_, _, _ = glove_index["'s"], glove_index["n't"], glove_index["'re"]

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the reviews; random_state=1 fixes the shuffle.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=1
)

In [21]:
# Turn each split into integer id sequences with the fitted tokenizer, then
# pad at the end to 512 ids per review (longer reviews are cut to 512 by
# pad_sequences; its default `truncating` setting applies).
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        tok.texts_to_sequences(texts), maxlen=512, padding="post"
    )
    for texts in (text_train, text_test)
)

In [22]:
# Averaged-embedding classifier: GloVe-initialised trainable embedding, a
# per-token Dense projection, a mean over the sequence axis and a sigmoid output.
# NOTE(review): the Embedding sets no mask, so the pooled average also covers
# the padded positions of reviews shorter than 512 tokens.
model = tf.keras.Sequential()
model.add(
    layers.Embedding(
        input_dim=num_words,
        output_dim=300,
        input_length=512,
        weights=[embedding_matrix],
        trainable=True,
    )
)
model.add(layers.Dropout(0.9))
model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dropout(0.3))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 300)          26575200  
                                                                 
 dropout (Dropout)           (None, 512, 300)          0         
                                                                 
 dense (Dense)               (None, 512, 16)           4816      
                                                                 
 dropout_1 (Dropout)         (None, 512, 16)           0         
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                        

In [23]:
# Adam optimizer, binary cross-entropy on the sigmoid output, accuracy at a 0.5 cut-off.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(),
    metrics=tf.metrics.BinaryAccuracy(threshold=0.5),
)


In [24]:
# Train for 40 epochs in batches of 512.
# NOTE(review): the held-out test split is also what is passed as validation data.
epochs = 40
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [25]:
# Re-encode both splits, this time padded at the end to 128 ids per review.
# This overwrites the 512-long X_train / X_test built earlier.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        tok.texts_to_sequences(texts), maxlen=128, padding="post"
    )
    for texts in (text_train, text_test)
)

In [26]:
# LSTM classifier on top of the GloVe-initialised, trainable embedding.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=num_words, 
                     output_dim=300, 
                     input_length=128, 
                     weights=[embedding_matrix], 
                     # Id 0 only ever comes from pad_sequences (word ids start
                     # at 1). Masking it makes the LSTM skip the trailing
                     # padding of short reviews, so its final state reflects
                     # the last real token instead of a run of pad steps.
                     mask_zero=True,
                     trainable=True),
    layers.Dropout(0.9),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    # return_sequences=False: only the last (unmasked) step's output is kept.
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.1),
    layers.Dense(16, activation="relu"),
    layers.Dropout(0.1),
    layers.Dense(1, activation="sigmoid")]
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 300)          26575200  
                                                                 
 dropout_3 (Dropout)         (None, 128, 300)          0         
                                                                 
 dense_2 (Dense)             (None, 128, 64)           19264     
                                                                 
 dropout_4 (Dropout)         (None, 128, 64)           0         
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 16)               

In [27]:
# Adam optimizer, binary cross-entropy on the sigmoid output, accuracy at a 0.5 cut-off.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(),
    metrics=tf.metrics.BinaryAccuracy(threshold=0.5),
)


In [28]:
# Train for 40 epochs in batches of 512.
# NOTE(review): the held-out test split is also what is passed as validation data.
epochs = 40
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
