<a href="https://colab.research.google.com/github/pgosar/AlphaHacks/blob/main/Movie_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import os
import shutil
import re
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [None]:
# Fetch the IMDB review archive into the current directory and unpack it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The reviews live in an 'aclImdb' folder beside the path returned above.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['README', 'imdbEr.txt', 'train', 'imdb.vocab', 'test']

In [None]:
# Path to the training split inside the dataset folder; list it to see
# which sub-folders and files it contains.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt',
 'neg',
 'labeledBow.feat',
 'pos']

In [None]:
# Delete the 'unsup' folder from the training split so that it is not picked
# up as a class directory when the dataset is loaded.
remove_dir = os.path.join(train_dir, 'unsup')
# Guard the removal so this cell can be re-run after the folder is already
# gone (an unguarded rmtree raises FileNotFoundError on the second run).
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [None]:
# Build the training and validation datasets from the labeled review folders.
# Both calls use the same directory, validation_split and seed so that the two
# subsets come from one consistent 80/20 partition of the files.
batch_size = 1024
seed = 123
# Use train_dir (computed from the download location) instead of a hard-coded
# relative path, so loading does not depend on the current working directory.
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [None]:
# Peek at the first five (label, review) pairs of one training batch.
for text_batch, label_batch in train_ds.take(1):
  labels = label_batch.numpy()
  texts = text_batch.numpy()
  for i in range(5):
    print(labels[i], texts[i])

1 b'It\'s always nice to see Angela Bassett getting to do a role that she can really sink her teeth into. She is at times intense, funny and even sexy in her role as Lena, a "colored" woman forced to make a home on a desolate mudbank just outside of Cape Town, South Africa. Danny Glover is also good in a not entirely sympathetic role as her partner, Boesman. Willie Jonah gives a finely nuanced performance as the stranger that discovers Boesman and Lena\'s new living area. It\'s not often that you get a chance to see an intelligent film dealing with mature themes. Although it is based on a play, the late director John Berry (who also directed Claudine) opens the material up by having the film shot in the widescreen Cinemascope format. He also keeps things visually interesting through the creative blocking of actors and by showing us things only mentioned in the play. Just like Diahann Carroll in Claudine, John Berry may have directed Angela Bassett into an Academy Award nomination. This

In [None]:
# Cache both datasets and prefetch batches, letting tf.data pick the
# prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

train_ds, val_ds = (
    ds.cache().prefetch(buffer_size=AUTOTUNE) for ds in (train_ds, val_ds)
)

In [None]:
# Toy example: an embedding table with 1000 entries of 5 dimensions each.
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=5)

# Look up the vectors for the integer ids 1, 2 and 3.
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

array([[ 0.00973222, -0.01957756,  0.04395263,  0.0133597 , -0.01553816],
       [-0.00042682,  0.02788993, -0.01582443, -0.00634535, -0.02841049],
       [ 0.03374845, -0.04762805,  0.00262735, -0.03352543,  0.01539638]],
      dtype=float32)

In [None]:
# Custom standardization used by the text vectorization layer.
def custom_standardization(input_data):
  """Lowercase text, replace '<br />' tags with a space, drop punctuation.

  Args:
    input_data: Text to clean; it is passed directly to `tf.strings.lower`.

  Returns:
    The result of the final `tf.strings.regex_replace` call: the lowercased
    text with every '<br />' replaced by a single space and every character
    in `string.punctuation` removed.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text


# Size of the vocabulary and the number of tokens kept per review.
vocab_size = 10000
sequence_length = 100

# The text vectorization layer normalizes each string with the custom
# standardization function, splits it into tokens, and maps the tokens to
# integer ids. Reviews differ in length, so output_sequence_length is set to
# give every output the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [None]:
# Dimensionality of each learned word vector.
embedding_dim = 16

# Raw strings -> token ids -> embeddings -> average over the sequence ->
# small dense head producing one output value per review.
model = Sequential(layers=[
    vectorize_layer,
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=1),
])

In [None]:
# TensorBoard callback configured to write its logs under the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# The model's last layer is a Dense(1) with no activation, so the loss is
# configured with from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 15 epochs, evaluating on the validation split after each epoch
# and reporting to the TensorBoard callback.
model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=15,
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f5771451350>

In [None]:
# Download the GloVe archive (glove.6B.zip) into the current directory and
# extract it. The URL uses HTTPS rather than plain HTTP so the archive, which
# is unpacked locally, is not fetched over an unauthenticated connection.
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_embeddings = tf.keras.utils.get_file(
    "glove.6B.zip", glove_url, extract=True,
    cache_dir='.', cache_subdir='')

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip
