In [1]:
# Getting the dataset from stanford's website
from pathlib import Path
import tensorflow as tf

# Base URL of the directory that hosts the IMDB sentiment archive.
DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"

# Exact archive name: it is appended to DOWNLOAD_ROOT to form the download URL
# and is also used as the name of the locally cached file.
FILENAME = "aclImdb_v1.tar.gz"

# get_file downloads the archive (extracting it because extract=True) and returns the path of the
# downloaded file. By default it lands in the Keras cache (~/.keras/datasets, see the output below).
filepath = tf.keras.utils.get_file(
    fname=FILENAME,
    origin=DOWNLOAD_ROOT + FILENAME,
    extract=True,
)
# The extracted "aclImdb" folder sits next to the downloaded archive.
path = Path(filepath).with_name("aclImdb")
path

PosixPath('/home/ryuparish/.keras/datasets/aclImdb')

In [2]:
# For comparison with the `/` join above: building the same location from strings gives a plain str,
# whereas `Path / "name"` gives a Path object.
f"{Path(filepath).parent}/aclImdb"

'/home/ryuparish/.keras/datasets/aclImdb'

In [3]:
# Walking through the folder and printing out all the names of the files and the folders
import os

# os.walk yields, for every directory under `path`: the directory's own path, the names of its
# subdirectories, and the names of its files.
for dirpath, _subdirs, filenames in os.walk(path):
    current_dir = Path(dirpath)
    # Depth relative to the dataset root: aclImdb -> 0, aclImdb/test -> 1, aclImdb/test/neg -> 2 ...
    depth = len(current_dir.parts) - len(path.parts)
    # Print the directory name followed by the path separator so it reads as a folder.
    print("    " * depth + current_dir.parts[-1] + os.sep)
    child_prefix = "    " * (depth + 1)
    ordered_files = sorted(filenames)
    # Show at most the first three files of each directory ...
    for filename in ordered_files[:3]:
        print(child_prefix + filename)
    # ... and an ellipsis when more files were left out.
    if len(ordered_files) > 3:
        print(child_prefix + "...")

aclImdb/
    README
    imdb.vocab
    imdbEr.txt
    test/
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
    train/
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...


In [4]:
# Getting the filepaths of all the movie reviews and then loading them to their respective sets
def review_paths(dirpath):
    """Return the paths of all ``*.txt`` files directly inside ``dirpath`` as strings.

    Args:
        dirpath: a ``pathlib.Path`` pointing at a directory of review files.

    Returns:
        A list of ``str`` paths, in the order produced by ``Path.glob``.
    """
    # Path.glob yields Path objects lazily; convert each to str and collect them in a list.
    return list(map(str, dirpath.glob("*.txt")))

# Collect the review file paths for each (split, label) folder of the dataset.
train_dir, test_dir = path / "train", path / "test"
train_pos, train_neg = (review_paths(train_dir / label) for label in ("pos", "neg"))
# The official test folder is later divided into a test part and a validation part.
test_valid_pos, test_valid_neg = (review_paths(test_dir / label) for label in ("pos", "neg"))

# Sanity check: number of reviews found in each folder.
len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)

(12500, 12500, 12500, 12500)

In [5]:
# NOTE: carving a validation set out of the test data is bad practice (it shrinks the held-out
# test set); it is done here only because the dataset ships without a validation split.
import numpy as np

# Fixed seed so that the test/validation split is identical after a kernel restart.
split_rng = np.random.RandomState(42)

# glob() returns files in an arbitrary, filesystem-dependent order, so sort first to get a
# deterministic starting point, then shuffle. Both classes are shuffled, and the shuffling is done
# on copies so test_valid_pos / test_valid_neg stay untouched and this cell can be re-run safely.
shuffled_pos = sorted(test_valid_pos)
shuffled_neg = sorted(test_valid_neg)
split_rng.shuffle(shuffled_pos)
split_rng.shuffle(shuffled_neg)

# First 5000 reviews of each class form the test set, the remainder the validation set.
test_pos = shuffled_pos[:5000]
test_neg = shuffled_neg[:5000]
valid_pos = shuffled_pos[5000:]
valid_neg = shuffled_neg[5000:]

In [20]:
# Loading the datasets that are just lists of filepaths into
# tf.data.TextLineDataset takes filepaths that have one example per line in the file
def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):
    """Build a labeled dataset of reviews from positive and negative review files.

    Args:
        filepaths_positive: paths of the files holding positive reviews.
        filepaths_negative: paths of the files holding negative reviews.
        n_read_threads: passed to TextLineDataset as ``num_parallel_reads``.

    Returns:
        A tf.data.Dataset of ``(review, label)`` pairs: all positive lines (label 1)
        followed by all negative lines (label 0).
    """
    def labeled_lines(filepaths, label):
        # TextLineDataset yields one element per line of the given files.
        lines = tf.data.TextLineDataset(filepaths, num_parallel_reads=n_read_threads)
        # map returns a new dataset in which every line is paired with its label.
        return lines.map(lambda review: (review, label))

    positives = labeled_lines(filepaths_positive, 1)  # 1 means positive
    negatives = labeled_lines(filepaths_negative, 0)  # 0 means negative
    # Append the negative examples after the positive ones.
    return positives.concatenate(negatives)

In [74]:
# Getting the concatenated datasets that are shuffled, batched, and prefetched for speed
batch_size = 32

# Only the training set is shuffled; the buffer of 25000 covers all 12500 + 12500 training reviews.
train_set = (
    imdb_dataset(train_pos, train_neg)
    .shuffle(25000)
    .batch(batch_size)
    .prefetch(1)
)
# Validation and test sets are only batched and prefetched.
valid_set = (
    imdb_dataset(valid_pos, valid_neg)
    .batch(batch_size)
    .prefetch(1)
)
test_set = (
    imdb_dataset(test_pos, test_neg)
    .batch(batch_size)
    .prefetch(1)
)

In [75]:
def preprocess(X_batch, n_words=50):
    """Convert a batch of raw review strings into a fixed-width tensor of word tokens.

    Args:
        X_batch: 1-D string tensor holding one review per element.
        n_words: number of tokens kept per review (the width of the result).

    Returns:
        A string tensor of shape [len(X_batch), n_words]; reviews with fewer tokens are
        padded on the right with b"<pad>", longer ones are cut off.
    """
    # Keep only the first 300 characters of each review, then lowercase them.
    text = tf.strings.lower(tf.strings.substr(X_batch, 0, 300))
    # Replace HTML line breaks such as "<br>", "<br/>" or "<br   />" with a space.
    text = tf.strings.regex_replace(text, b"<br\\s*/?>", b" ")
    # Replace every character that is not a lowercase letter with a space.
    text = tf.strings.regex_replace(text, b"[^a-z]", b" ")
    # Split on whitespace: gives a ragged tensor with one row of tokens per review.
    tokens = tf.strings.split(text)

    # Target shape [number of reviews, n_words], built from the dynamic batch size.
    n_reviews = tf.shape(X_batch)[0]
    target_shape = tf.stack([n_reviews, n_words])
    return tokens.to_tensor(shape=target_shape, default_value=b"<pad>")

# Two short reviews used as a toy batch to check preprocess(): punctuation is removed, text is
# lowercased and each row is padded with b"<pad>" up to the default 50 tokens (see output below).
X_example = tf.constant(["It's a great, great movie! I loved it.", "It was terrible, run away!!!"])
preprocess(X_example)

<tf.Tensor: shape=(2, 50), dtype=string, numpy=
array([[b'it', b's', b'a', b'great', b'great', b'movie', b'i', b'loved',
        b'it', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
       [b'it', b'was', b'terrible', b'run', b'away', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
        b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'

In [76]:
# Counting all of the words and then getting the 1000 most common words from the sample
from collections import Counter

def get_vocabulary(data_sample, max_size=1000):
    """Build a vocabulary from the most frequent words of a sample of reviews.

    Args:
        data_sample: batch of raw review strings, passed to preprocess().
        max_size: maximum number of words kept, not counting the padding token.

    Returns:
        A list of byte strings: b"<pad>" first (index 0), followed by up to
        ``max_size`` words ordered from most to least frequent.
    """
    pad_token = b"<pad>"
    # Rows of tokens as a NumPy array, one row per review.
    token_rows = preprocess(data_sample).numpy()
    # Count every token except the padding token.
    frequencies = Counter(
        token for row in token_rows for token in row if token != pad_token
    )
    top_words = [token for token, _ in frequencies.most_common(max_size)]
    return [pad_token] + top_words

# Vocabulary of the two toy reviews: b"<pad>" first, then the words by decreasing frequency.
get_vocabulary(X_example)

[b'<pad>',
 b'it',
 b'great',
 b's',
 b'a',
 b'movie',
 b'i',
 b'loved',
 b'was',
 b'terrible',
 b'run',
 b'away']

In [77]:
# Custom Keras layer that turns raw review strings into padded sequences of word ids
class TextVectorization(tf.keras.layers.Layer):
    """Preprocess raw review strings and encode each token as an integer id.

    adapt() must be called before the layer is used: it builds the vocabulary and the
    lookup table that call() relies on.
    """

    def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):
        """Store the vocabulary size limit and the number of out-of-vocabulary buckets."""
        super().__init__(dtype=dtype, **kwargs)
        self.max_vocabulary_size = max_vocabulary_size
        self.n_oov_buckets = n_oov_buckets

    def adapt(self, data_sample):
        """Build the vocabulary and the word -> id lookup table from a sample of reviews."""
        # b"<pad>" followed by (at most) the max_vocabulary_size most frequent words.
        self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)
        # Each word maps to its position in the vocabulary, so b"<pad>" gets id 0.
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(self.vocab),
            values=tf.range(len(self.vocab), dtype=tf.int64),
        )
        # Words missing from the vocabulary are assigned to one of n_oov_buckets extra ids.
        self.table = tf.lookup.StaticVocabularyTable(initializer, self.n_oov_buckets)

    def call(self, inputs):
        """Tokenize the reviews with preprocess() and replace every token by its id."""
        tokens = preprocess(inputs)
        return self.table.lookup(tokens)

In [78]:
# Example text_vectorization
# Build the layer with its default settings, fit its vocabulary on the two toy reviews,
# then encode those same reviews: every word becomes its vocabulary index and padding becomes 0.
text_vectorization = TextVectorization()

text_vectorization.adapt(X_example)
text_vectorization(X_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[ 1,  3,  4,  2,  2,  5,  6,  7,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  8,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

In [83]:
max_vocabulary_size = 1000
n_oov_buckets = 100

# train_set yields batches of (reviews, labels); keep only the reviews.
sample_review_batches = train_set.map(lambda review, label: review)

# Each batch arrives as a 1-D NumPy array of byte strings; joining them along axis 0 gives a
# single array holding every training review.
sample_reviews = np.concatenate(
    [review_batch for review_batch in sample_review_batches.as_numpy_iterator()],
    axis=0,
)

# Re-create the layer (replacing the toy one) and fit its vocabulary on the training reviews.
text_vectorization = TextVectorization(
    max_vocabulary_size,
    n_oov_buckets,
    input_shape=[],
)
text_vectorization.adapt(sample_reviews)

In [84]:
# Encode the toy reviews again, now with the vocabulary fitted on the training reviews.
text_vectorization(X_example)

<tf.Tensor: shape=(2, 50), dtype=int64, numpy=
array([[  9,  14,   2,  64,  64,  12,   5, 256,   9,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  9,  13, 269, 532, 334,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>

In [85]:
# First ten vocabulary entries: the padding token, then the most frequent words.
text_vectorization.vocab[:10]

[b'<pad>', b'the', b'a', b'of', b'and', b'i', b'to', b'is', b'this', b'it']

In [88]:
# Stand-alone demonstration of the bag-of-words computation used by the BagOfWords layer below
simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])
# One-hot encode every id with depth 4: each row of 5 ids becomes 5 vectors of length 4
print(tf.one_hot(simple_example, 4))
# Summing the one-hot vectors over axis 1 counts how many times each id occurs in each row
tf.reduce_sum(tf.one_hot(simple_example, 4), axis=1)

tf.Tensor(
[[[0. 1. 0. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]], shape=(2, 5, 4), dtype=float32)


<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[2., 2., 0., 1.],
       [3., 0., 2., 0.]], dtype=float32)>

In [89]:
class BagOfWords(tf.keras.layers.Layer):
    """Turn sequences of word ids into per-review word counts, ignoring id 0 (padding)."""

    def __init__(self, n_tokens, dtype=tf.int32, **kwargs):
        """Store ``n_tokens``, the number of distinct ids (including id 0) to one-hot encode."""
        super().__init__(dtype=dtype, **kwargs)
        self.n_tokens = n_tokens

    def call(self, inputs):
        """Return, for each row of ids, the number of occurrences of ids 1 .. n_tokens - 1."""
        # One-hot encode each id, then sum over the word axis to get a count per id.
        counts = tf.reduce_sum(tf.one_hot(inputs, self.n_tokens), axis=1)
        # Column 0 counts the <pad> id, so it is dropped.
        return counts[:, 1:]

In [91]:
# Same toy ids as above, run through the layer: the result has 3 columns (ids 1 to 3)
# because the count for id 0 is dropped.
bag_of_words = BagOfWords(n_tokens=4)
bag_of_words(simple_example)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[2., 0., 1.],
       [0., 2., 0.]], dtype=float32)>

In [92]:
# Total number of distinct ids the vectorizer can emit: vocabulary words, OOV buckets and <pad>.
n_tokens = max_vocabulary_size + n_oov_buckets + 1 # add 1 for <pad>
bag_of_words = BagOfWords(n_tokens)

In [93]:
# Build, compile and train a bag-of-words sentiment classifier.
# Raw strings -> word ids -> word counts -> one hidden layer -> probability of the positive label.
hidden_layer = tf.keras.layers.Dense(100, activation="relu")
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")
model = tf.keras.models.Sequential(
    [text_vectorization, bag_of_words, hidden_layer, output_layer]
)
model.compile(
    optimizer="nadam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_set, validation_data=valid_set, epochs=5)
# Author's note: already overfitting

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f186300f820>

In [94]:
# Now, instead of bag-of-words features, we use word embeddings.
# Note: I stopped following the book's exercise solution at this point; only the
# mean-embedding helper below is implemented.
def compute_mean_embedding(inputs):
    """Sum the word embeddings of each review and divide by sqrt(number of words).

    A word whose embedding vector is entirely zero is treated as padding and is not
    counted as a word.

    Args:
        inputs: float tensor indexed as [review, word, embedding dimension].

    Returns:
        A float tensor indexed as [review, embedding dimension]. A review that contains
        only padding yields a row of zeros.
    """
    # Non-zero components per word vector; 0 means the whole vector is zero (padding).
    nonzero_per_word = tf.math.count_nonzero(inputs, axis=-1)
    # Number of non-padding words per review, kept as a column so it broadcasts below.
    n_words = tf.math.count_nonzero(nonzero_per_word, axis=-1, keepdims=True)
    # Clamp to 1: for an all-padding review the sum is 0, and 0 / sqrt(0) would give NaN.
    safe_n_words = tf.maximum(n_words, 1)
    sqrt_n_words = tf.math.sqrt(tf.cast(safe_n_words, tf.float32))
    return tf.reduce_sum(inputs, axis=1) / sqrt_n_words

# Toy batch of 2 reviews x 3 words x 3 embedding dimensions; all-zero rows act as padding,
# so the first review has 2 real words and the second has 1.
another_example = tf.constant([[[1., 2., 3.], [4., 5., 0.], [0., 0., 0.]],
                               [[6., 0., 0.], [0., 0., 0.], [0., 0., 0.]]])
compute_mean_embedding(another_example)

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[3.535534 , 4.9497476, 2.1213205],
       [6.       , 0.       , 0.       ]], dtype=float32)>