## Bigram Encoding

In [4]:
import os, pathlib, shutil, random
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [5]:
batch_size = 32

train = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


### Processing words as a set: The bag-of-words approach

In [6]:
text_vectorization = TextVectorization(ngrams=2, max_tokens=20000, output_mode="multi_hot")
text_only_train = train.map(lambda x, y: x)
text_vectorization.adapt(text_only_train)

In [7]:
binary_2gram_train = train.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_val = validation.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_test = test.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

**Model Definition**

In [8]:
def get_model(max_tokens=20000, hidden_dim=16):
    
    inputs = Input(shape=(max_tokens,))
    x = Dense(hidden_dim, activation="relu")(inputs)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation="sigmoid")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    
    return model

**Training and testing the binary Bigram model**

In [9]:
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [10]:
callbacks = [ModelCheckpoint("binary_2gram.keras", save_best_only=True)]
model.fit(binary_2gram_train.cache(), validation_data=binary_2gram_val.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x126393bfbb0>

In [11]:
model = load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test)[1]:.3f}")

Test acc: 0.500
