## NLP tutorial

In [1]:
import os, pathlib ,shutil , random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np

### 1. preparing dataset

In [2]:
base_dir = "aclImdb"
train_dir = base_dir+"\\train"
val_dir = base_dir+"\\val"
test_dir = base_dir+"\\test"

### 1.1 create validation folder

In [None]:
os.makedirs(val_dir)
os.makedirs(val_dir+"\\pos")
os.makedirs(val_dir+"\\neg")

### 1.2 move 0.2 of random positive train files to validation folder

In [None]:
pos_files = os.listdir(train_dir+"\\pos")
random.Random(1402).shuffle(pos_files)
num_val_samples = int(0.2*len(pos_files))
val_pos_files = pos_files[:num_val_samples]
for fname in val_pos_files:
    shutil.move(train_dir+"\\pos\\"+fname,val_dir+"\\pos\\"+fname)

### 1.3 move 0.2 of random negative train files to validation folder

In [None]:
pos_files = os.listdir(train_dir+"\\neg")
random.Random(1402).shuffle(pos_files)
num_val_samples = int(0.2*len(pos_files))
val_pos_files = pos_files[:num_val_samples]
for fname in val_pos_files:
    shutil.move(train_dir+"\\neg\\"+fname,val_dir+"\\neg\\"+fname)

#### 1.4 read text files and make a dataset

In [3]:
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(train_dir,batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(val_dir,batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory(test_dir,batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [None]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

# 1. 1-gram Multi-Hot vectorization

In [None]:
text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot"
)

In [4]:
text_only_train_ds = train_ds.map(lambda x,y: x)

In [None]:
text_vectorization.adapt(text_only_train_ds)
binary_1gram_train_ds = train_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
binary_1gram_val_ds = val_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
binary_1gram_test_ds = test_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)

In [None]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

## 1.1  Model

In [None]:
def get_model(max_tokens=20000 , hidden_dim = 16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim,activation = 'relu')(inputs)
    outputs = layers.Dense(1,activation='sigmoid')(x)
    model = keras.Model(inputs,outputs)
    model.compile(optimizer="rmsprop" , loss= "binary_crossentropy" , metrics = ["accuracy"])
    return model

In [None]:
model = get_model()
model.summary()

In [None]:
callbacks=[keras.callbacks.ModelCheckpoint("binary_1gram.keras",save_best_only=True)]

In [None]:
model.fit(binary_1gram_train_ds.cache(),validation_data=binary_1gram_val_ds.cache(),epochs=10 , callbacks=callbacks)

In [None]:
model = keras.models.load_model("binary_1gram.keras")
print(f"Test Acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

### 1-gram Multi-Hot Test Result: 0.887 

# 2. Bi-gram Molti-Hot 

In [None]:
text_vectorizarion = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot"
)

In [None]:
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
binary_2gram_val_ds = val_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
binary_2gram_test_ds = test_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)

In [None]:
model = get_model()
model.summary()
callback=[keras.callbacks.ModelCheckpoint("binary_2gram.keras",save_best_only=True)]

In [None]:
model.fit(binary_2gram_train_ds, validation_data=binary_2gram_val_ds,epochs=10 , callbacks=callback)

In [None]:
model = keras.models.load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

### 1-gram Multi-Hot Test Result: 0.900

# 3. TF-IDF Bigram vectorization 

In [None]:
text_vectorizarion = TextVectorization(
    ngrams=2,
    max_tokens = 20000,
    output_mode='tf_idf'
)

In [None]:
text_vectorization.adapt(text_only_train_ds)
tfidf_2gram_train_ds = train_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
tfidf_2gram_val_ds = val_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)
tfidf_2gram_test_ds = test_ds.map(lambda x,y:(text_vectorization(x),y),num_parallel_calls=8)

In [None]:
model = get_model()
model.summary()
callback=[keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",save_best_only=True)]

In [None]:
model.fit(tfidf_2gram_train_ds, validation_data=tfidf_2gram_val_ds,epochs=10 , callbacks=callback)

In [None]:
model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

### TF-IDF Bigram Test Result: 0.877

# 4. Processing words as a sequence

In [5]:
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

In [6]:
text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(lambda x,y: (text_vectorization(x),y), num_parallel_calls=8)
int_val_ds = val_ds.map(lambda x,y: (text_vectorization(x),y), num_parallel_calls=8)
int_test_ds = test_ds.map(lambda x,y: (text_vectorization(x),y), num_parallel_calls=8)

In [None]:
def get_bidirectional_lstm():
    inputs = keras.Input(shape=(None,),dtype="int64")
    embedded = tf.one_hot(inputs,depth = max_tokens)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1,activation="sigmoid")(x)
    model = keras.Model(inputs=inputs,outputs=outputs)
    model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["accuracy"])
    return model

In [None]:
model = get_bidirectional_lstm()
model.summary()

In [None]:
callbacks = [keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",save_best_only=True)]
model.fit(int_train_ds,validation_data=int_val_ds,epochs=10, callbacks=callbacks)

### one-hot bidi-lstm Result: Too slow

# 5. Word Embeddings

In [None]:
def get_embedding_bidi_lstm():
    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["accuracy"])
    return model


In [None]:
model = get_embedding_bidi_lstm()
model.summary()

In [None]:
callbacks = [keras.callbacks.ModelCheckpoint("embedding_bidi_lstm.keras",save_best_only=True)]
model.fit(int_train_ds,validation_data=int_val_ds,epochs=10 , callbacks=callbacks)

In [None]:
model=keras.models.load_model("embedding_bidi_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

###  Embedings bidir-lstm  Result: 0.864

In [None]:
def get_embedding_bidi_lstm_mask():
    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = layers.Embedding(input_dim=max_tokens, output_dim=256,mask_zero=True)(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["accuracy"])

    return model


In [None]:
model = get_embedding_bidi_lstm_mask()
model.summary()

In [None]:
callbacks = [keras.callbacks.ModelCheckpoint("embedding_bidi_lstm_mask.keras",save_best_only=True)]
model.fit(int_train_ds,validation_data=int_val_ds,epochs=10 , callbacks=callbacks)

In [None]:
model=keras.models.load_model("embedding_bidi_lstm_mask.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

###  Embedings bidir-lstm with mask  Result: 0.878

# 6. Using pretrained word emdeddings

### 6.1 download pretrained GloVe word embeddings (100-dimensional is used)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

### 6.2 parsing the GloVe word-embeddings file

In [58]:
glove_file = "glove.6B.100d.txt"
embeddings_index = {}
with open(glove_file,encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        word,coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs,"f",sep=" ")
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors")

Found 400000 word vectors


### 6.3 preparing the GloVe word-embeddings matrix

In [60]:
## text vectorization based on max_lenght = 600 , max_tokens = 20000 and output_mode = int
embeddings_dim = 100
text_vectorization.adapt(text_only_train_ds)
vocabulary = text_vectorization.get_vocabulary(include_special_tokens=False)
word_index = dict(zip(vocabulary,range(len(vocabulary))))
embeddings_matrix = np.zeros((max_tokens,embeddings_dim))

for i , word in enumerate(vocabulary):
    if i < max_tokens:
        embeddings_vector=embeddings_index.get(word)
    if embeddings_vector is not None:
        embeddings_matrix[i]=embeddings_index.get(word)
        
embeddings_layer = layers.Embedding(
    max_tokens,embeddings_dim,
    embeddings_initializer=keras.initializers.Constant(embeddings_matrix),
    trainable = False,
    mask_zero=True
)    

In [66]:
def get_GloVe_embeding_model():
    inputs = keras.Input(shape=(None,),dtype="int64")
    embedded = embeddings_layer(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs,outputs)
    model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["accuracy"])
    return model

In [67]:
model = get_GloVe_embeding_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         2000000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               34048     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,034,113
Trainable params: 34,113
Non-trainable params: 2,000,000
______________________________________________

In [69]:
callbacks = [keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True)]
model.fit(int_train_ds,validation_data=int_val_ds,epochs=10,callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a51e65ddb0>

In [70]:
model=keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.771


###  GloVe Embedings bidir-lstm with mask  Result: 0.771

# 7. Transformer

In [7]:
class TransformerEncoder(layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
        [layers.Dense(dense_dim,activation="relu"),
         layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        
    def call(self,inputs,mask=None):
        if mask is not None:
            mask =mask[:,tf.newaxis,:]
        attention_output = self.attention(inputs,inputs,attention_mask=mask)
        proj_input =self.layernorm1(inputs+attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm2(proj_input+proj_output)
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim":self.embed_dim,
            "num_heads":self.num_heads,
            "dense_dim":self.dense_dim,
        })
        return config

        

In [14]:
class PositionalEmbedding(layers.Layer):
    def __init__(self,sequence_length, input_dim , output_dim , **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=input_dim,output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length,output_dim=output_dim)
        
        self.sequence_length=sequence_length
        self.input_dim=input_dim
        self.output_dim=output_dim
    
    def call(self,inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0,limit=length,delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens+embedded_positions
    
    def compute_mask(self,inputs , mask=None):
        return tf.math.not_equal(inputs,0)
    
    def get_config(self):
            config = super().get_config()
            config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            })
            return config    

In [9]:
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_head = 2
dense_dim = 32
inputs = keras.Input(shape=(None,),dtype="int64")
x = PositionalEmbedding(sequence_length,vocab_size,embed_dim)(inputs)
x = TransformerEncoder(embed_dim,dense_dim,num_head)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1,activation="sigmoid")(x)
model = keras.Model(inputs,outputs)
model.compile(optimizer="rmsprop",loss="binary_crossentropy",metrics=["accuracy"])

In [10]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 positional_embedding (Posit  (None, None, 256)        5273600   
 ionalEmbedding)                                                 
                                                                 
 transformer_encoder (Transf  (None, None, 256)        543776    
 ormerEncoder)                                                   
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                             

In [11]:
callbacks = [keras.callbacks.ModelCheckpoint("transformer_encoder.keras",save_best_only=True)]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b0749f8100>

In [19]:
model = keras.models.load_model("transformer_encoder.keras",custom_objects={"TransformerEncoder": TransformerEncoder, "PositionalEmbedding":PositionalEmbedding})
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.884


###  Transformer Result: 0.884

# The end.