# Module 8: Deep Learning and NLP

## **Setup**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import os
import pathlib

### Preload word embeddings


In [None]:
# Load pre-trained word embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip.1 -d glove.6B

### **The data**

In [None]:
# Fetch the 20 Newsgroups archive and let Keras unpack it (untar=True)
data_path = keras.utils.get_file(
    fname="news20.tar.gz",
    origin="http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

In [None]:
# The corpus directory sits next to the path returned by get_file
download_root = pathlib.Path(data_path).parent
data_dir = download_root / "20_newsgroup"

# List the entries of the corpus directory
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

# Look inside one of them as an example
graphics_dir = data_dir / "comp.graphics"
fnames = os.listdir(graphics_dir)
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

In [None]:
# Get an example of what the dataset contains.
# read_text() opens and closes the file for us (the original open(...).read()
# left the handle unclosed); latin-1 matches the encoding used when the whole
# corpus is loaded in the preprocessing step.
example_path = data_dir / "comp.graphics" / "38987"
print(example_path.read_text(encoding="latin-1"))

### Preprocessing

In [None]:
# Load every file of every class directory into memory.
# `samples` collects the texts, `labels` the integer class index of each text,
# and `class_names[k]` is the directory name that class index k stands for.
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    # Sort the file names as well: os.listdir() returns entries in arbitrary,
    # filesystem-dependent order, which would make the seeded shuffle in the
    # next step produce a different split from one machine to another.
    fnames = sorted(os.listdir(dirpath))
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # The context manager closes each file immediately instead of leaking
        # one open handle per message.
        with open(fpath, encoding="latin-1") as f:
            content = f.read()
        # Drop the first 10 lines of every file
        # (presumably the message header -- confirm against the example above).
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

In [None]:
# Create a training and validation dataset

# Shuffle the data. Both lists are shuffled in place with identically seeded
# generators, so they receive the same permutation and samples stay aligned
# with their labels. That only holds when both lists have the same length.
assert len(samples) == len(labels), "samples and labels must be aligned"
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice with an explicit split index rather than a negative offset: when
# num_validation_samples is 0, `samples[:-0]` is empty and `samples[-0:]` is
# the whole list, which would silently swap the roles of the two sets.
split_index = len(samples) - num_validation_samples
train_samples = samples[:split_index]
val_samples = samples[split_index:]
train_labels = labels[:split_index]
val_labels = labels[split_index:]

## **Exercise 8.1: Vectorization**

In [None]:
from tensorflow.keras.layers import TextVectorization

# Wrap the raw training texts in a dataset, served in batches of 128
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
# Exercise: create a TextVectorization layer with max_tokens=20000 and
# output_sequence_length=200, then adapt it to the training texts (`text_ds`)
vectorizer = None # TODO

In [None]:
# Peek at the first five entries of the vectorizer's vocabulary
vectorizer.get_vocabulary()[:5]

In [None]:
# Vectorize a single example sentence and show the first six entries of the result
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

## **Exercise 8.2: Embeddings**

In [None]:
# Load the pre-trained GloVe word vectors from the extracted archive.
# We use the 50-dimensional file, the smallest embedding available.
path_to_glove_file = 'glove.6B/glove.6B.50d.txt'

# Every line of the file contains a word followed by its coefficients.
# Collect them in a dictionary mapping word -> coefficient vector.
embeddings_index = {}
# The GloVe file is UTF-8 text; pass the encoding explicitly so that loading
# does not depend on (or fail under) the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word; the remainder of the line is the vector
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

The embedding matrix is a simple NumPy array in which the entry at index `i` is the pre-trained vector for the word with index `i` in our vectorizer's vocabulary.

In [None]:
# Build the matrix that will later be handed to the embedding layer:
# row i holds the pre-trained vector of the word with vocabulary index i.

# Map every word of the vectorizer's vocabulary to its index
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

embedding_dim = 50
num_tokens = len(voc) + 2
hits, misses = 0, 0

# All rows start out as zeros, so words that are not found in the embedding
# index stay all-zero (per the original note, this includes the
# representation for "padding" and "OOV").
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        misses += 1
        continue
    embedding_matrix[row] = pretrained_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Exercise: create the embedding layer.
# Initialize its weights from our embedding matrix and keep them fixed:
# use embeddings_initializer=keras.initializers.Constant(embedding_matrix)
# and trainable=False
embedding_layer = None # TODO

## **Exercise 8.3: Modeling**

In [None]:
# Build a convolutional model that takes the vectorized words as input

model = None # TODO
# Create the input with shape=(None, )
# TODO
# Add the previously created embedding layer next
# TODO
# Stack a couple of Conv1D and MaxPooling1D layers
# TODO
# Add the Dense layer
# TODO

model.summary()

In [None]:
def _vectorize_texts(texts):
    """Run `texts` through the vectorizer and return the result as a NumPy array.

    Each text is wrapped in its own single-element row before being passed to
    the vectorizer, which is expected to bring every text to the same length
    (see the output_sequence_length requested in Exercise 8.1).
    """
    column = np.array([[text] for text in texts])
    return vectorizer(column).numpy()


# Vectorized inputs for training and validation
x_train = _vectorize_texts(train_samples)
x_val = _vectorize_texts(val_samples)

# Integer class labels as NumPy arrays
y_train, y_val = np.array(train_labels), np.array(val_labels)

In [None]:
# Compile the model
# TODO

In [None]:
# Fit the model
# With 20 epochs this should take about 3 minutes
# TODO

## **Bonus: Query the model**

In [None]:
# Create an end-to-end model: raw strings in, predictions of `model` out
# Symbolic input holding one string per example
string_input = keras.Input(shape=(1,), dtype="string")
# First stage: the vectorizer from Exercise 8.1
x = vectorizer(string_input)
# Second stage: the model built in Exercise 8.3
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

In [None]:
# TODO: replace the placeholder with the text you want to classify
# (a list containing one example, which itself holds one string)
sentence = [['Put your text here']]

In [None]:
# Score the sentence, then look up the name of the highest-scoring class
probabilities = end_to_end_model.predict(sentence)
best_class = np.argmax(probabilities[0])

class_names[best_class]

## **Exercise 8.4: Recurrent Models**

In [None]:
# Exercise: build a recurrent model following the steps below
model = None # TODO
# Create the Input for variable-length sequences of integers
# TODO
# Create the Embedding layer
# TODO
# Add bidirectional LSTM layers
# TODO
# Add Dense layer(s)
# TODO
model.summary()

In [None]:
# Compile the model
# TODO

In [None]:
# Fit the model
# TODO

## **Bonus: Query the model**

In [None]:
# Create an end-to-end model: raw strings in, predictions of `model` out
# Symbolic input holding one string per example
string_input = keras.Input(shape=(1,), dtype="string")
# First stage: the vectorizer from Exercise 8.1
x = vectorizer(string_input)
# Second stage: the recurrent model built in Exercise 8.4
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

In [None]:
# TODO: replace the placeholder with the text you want to classify
# (a list containing one example, which itself holds one string)
sentence = [['Put your text here']]

In [None]:
# Score the sentence, then look up the name of the highest-scoring class
probabilities = end_to_end_model.predict(sentence)
best_class = np.argmax(probabilities[0])

class_names[best_class]