## Doing NLP with Keras and embeddings

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np

2021-09-14 08:19:03.141343: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-14 08:19:03.141457: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### **Preprocessing**

```python
tf.keras.layers.TextVectorization(
    max_tokens=None,  
    standardize='lower_and_strip_punctuation',  
    split='whitespace',  
    ngrams=None, output_mode='int',  
    output_sequence_length=None,  
    pad_to_max_tokens=False,  
    vocabulary=None, **kwargs  
)
```

In [2]:
# Toy corpus: three one-word documents, wrapped as a tf.data.Dataset of strings
corpus = ["foo", "bar", "baz"]
text_dataset = tf.data.Dataset.from_tensor_slices(corpus)

2021-09-14 08:19:06.162544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-14 08:19:06.162608: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-14 08:19:06.162629: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (CE43859): /proc/driver/nvidia/version does not exist
2021-09-14 08:19:06.164311: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Preprocessing layer that turns raw strings into fixed-length integer sequences.
# Its vocabulary is empty until adapt() is called on a corpus.
vectorizer = layers.TextVectorization(
    max_tokens=1000,               # upper bound on the vocabulary size
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',             # one integer index per token
    output_sequence_length=10,     # every output row has exactly 10 positions
    pad_to_max_tokens=False,
    vocabulary=None,               # no preset vocabulary; it is learned via adapt()
)

In [4]:
# Build the vocabulary from the corpus, fed in batches of up to 32 strings
batched_corpus = text_dataset.batch(32)
vectorizer.adapt(batched_corpus)

2021-09-14 08:19:06.934532: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [5]:
# Inspect the first five vocabulary entries; the list position of a token is the
# integer the vectorizer emits for it ('' and '[UNK]' occupy positions 0 and 1).
learned_vocabulary = vectorizer.get_vocabulary()
learned_vocabulary[:5]

['', '[UNK]', 'foo', 'baz', 'bar']

In [6]:
# Vectorize a one-sentence batch and show the first six positions of its row.
# 'hippo' was not in the adapted corpus.
sample_batch = [['baz bar hippo']]
output = vectorizer(sample_batch)
output.numpy()[0, :6]

array([3, 4, 1, 0, 0, 0])

### **Embedding**

In [7]:
# Start from an empty Sequential model; the cells below append the input,
# the text vectorizer, an embedding and further layers one at a time.
model = keras.Sequential()

```python
tf.keras.layers.Embedding(
    input_dim,
    output_dim,
    embeddings_initializer="uniform",
    embeddings_regularizer=None,
    activity_regularizer=None,
    embeddings_constraint=None,
    mask_zero=False,
    input_length=None,
    **kwargs
)
```

In [8]:
# The model consumes raw text: one string per example
raw_text_input = keras.Input(shape=(1,), dtype=tf.string)
model.add(raw_text_input)

In [9]:
# Reuse the already-adapted TextVectorization layer as the model's first layer,
# so the model maps raw strings to integer token sequences itself.
model.add(vectorizer)

In [10]:
# Trainable lookup table: one 64-dimensional vector per token index.
# input_dim mirrors the vectorizer's max_tokens (1000) and input_length mirrors
# its output_sequence_length (10).
embedding_layer = layers.Embedding(input_dim=1000, output_dim=64, input_length=10)
model.add(embedding_layer)

In [11]:
# No training happens here: the untrained model is compiled and then run on two
# sample sentences to show the per-token embedding vectors it produces.
sample_sentences = [['foo nice'], ['bar trumpet']]
model.compile(optimizer='rmsprop', loss='mse')
model.predict(sample_sentences)

array([[[-0.00563084,  0.0093014 , -0.01204   , ..., -0.00744009,
         -0.04828423, -0.02074393],
        [ 0.0312037 , -0.03578164,  0.0426073 , ..., -0.04811623,
         -0.04159303,  0.01463152],
        [-0.00287954,  0.01475239, -0.02184997, ...,  0.03118464,
         -0.03309283,  0.02958623],
        ...,
        [-0.00287954,  0.01475239, -0.02184997, ...,  0.03118464,
         -0.03309283,  0.02958623],
        [-0.00287954,  0.01475239, -0.02184997, ...,  0.03118464,
         -0.03309283,  0.02958623],
        [-0.00287954,  0.01475239, -0.02184997, ...,  0.03118464,
         -0.03309283,  0.02958623]],

       [[-0.02593101, -0.03805034,  0.00801908, ...,  0.02699113,
         -0.01994164, -0.01764347],
        [ 0.0312037 , -0.03578164,  0.0426073 , ..., -0.04811623,
         -0.04159303,  0.01463152],
        [-0.00287954,  0.01475239, -0.02184997, ...,  0.03118464,
         -0.03309283,  0.02958623],
        ...,
        [-0.00287954,  0.01475239, -0.02184997, ...,  

In [12]:
# Extend the vectorizer + embedding stack into a small CNN classifier

cnn_head = [
    # Two width-1 convolutions (2 filters each), then a max over the sequence axis
    layers.Conv1D(filters=2, kernel_size=1, activation="relu"),
    layers.Conv1D(filters=2, kernel_size=1, activation="relu"),
    layers.GlobalMaxPooling1D(),
    # Densely connected hidden layer, regularised with dropout
    layers.Dense(units=12, activation="relu"),
    layers.Dropout(rate=0.5),
    # Single output unit squashed by a sigmoid
    layers.Dense(units=1, activation="sigmoid", name="predictions"),
]
for cnn_layer in cnn_head:
    model.add(cnn_layer)

# Training is omitted in this walkthrough. To train, compile with
# loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
# and then call model.fit(...) on labelled data.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 10)                0         
_________________________________________________________________
embedding (Embedding)        (None, 10, 64)            64000     
_________________________________________________________________
conv1d (Conv1D)              (None, 10, 2)             130       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10, 2)             6         
_________________________________________________________________
global_max_pooling1d (Global (None, 2)                 0         
_________________________________________________________________
dense (Dense)                (None, 12)                36        
_________________________________________________________________
dropout (Dropout)            (None, 12)                0

In [13]:
# Alternative architecture: a stacked bidirectional LSTM classifier built on the
# same adapted vectorizer. `model` is rebound to a fresh Sequential here, so the
# CNN built above is no longer reachable through that name.

model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
model.add(vectorizer)
model.add(tf.keras.layers.Embedding(
    input_dim=1000,
    output_dim=64,
    input_length=10,
    # The vectorizer right-pads every sentence to 10 positions with index 0.
    # Masking that index lets the LSTMs skip the padding timesteps instead of
    # treating them as real tokens. Shapes and parameter counts are unchanged.
    mask_zero=True,
))
# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final state.
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(64)))
# Single sigmoid unit for binary classification
model.add(layers.Dense(1, activation="sigmoid", name="predictions"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 64)            64000     
_________________________________________________________________
bidirectional (Bidirectional (None, 10, 128)           66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
predictions (Dense)          (None, 1)                 129       
Total params: 228,993
Trainable params: 228,993
Non-trainable params: 0
_________________________________________________________________
