## Unigram Encoding

In [36]:
import os, pathlib, shutil, random
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [37]:
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

In [38]:
def data_generator():

    for category in ("neg", "pos"):
    
        os.makedirs(val_dir / category)
        files = os.listdir(train_dir / category)
        random.Random(1337).shuffle(files)
        num_val_samples = int(0.2 * len(files))
        val_files = files[-num_val_samples:]

        for fname in val_files:
            shutil.move(train_dir / category / fname, val_dir / category / fname)

In [39]:
batch_size = 32

train = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


**Displaying the shapes and dtypes of the first batch**

In [40]:
for inputs, targets in train:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b"In Panic In The Streets Richard Widmark plays U.S. Navy doctor who has his week rudely interrupted with a corpse that contains plague. As cop Paul Douglas properly points out the guy died from two bullets in the chest. That's not the issue here, the two of them become unwilling partners in an effort to find the killers and anyone else exposed to the disease.<br /><br />As was pointed out by any number of people, for some reason director Elia Kazan did not bother to cast the small parts with anyone that sounds like they're from Louisiana. Having been to New Orleans where the story takes place I can personally attest to that. Richard Widmark and his wife Barbara Bel Geddes can be excused because as a Navy doctor he could be assigned there, but for those that are natives it doesn't work.<br /><br />But with plague out there and the news being kept a secret, the N

### Processing words as a set: The bag-of-words approach

In [41]:
text_vectorization = TextVectorization(max_tokens=20000, output_mode="multi_hot")
text_only_train = train.map(lambda x, y: x)
text_vectorization.adapt(text_only_train)

In [47]:
binary_1gram_train = train.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_val = validation.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_test = test.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

**Inspecting the output of our binary unigram dataset**

In [42]:
for inputs, targets in binary_1gram_train:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(2, shape=(), dtype=int32)


**Our model-building utility**

In [43]:
def get_model(max_tokens=20000, hidden_dim=16):
    
    inputs = Input(shape=(max_tokens,))
    x = Dense(hidden_dim, activation="relu")(inputs)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation="sigmoid")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    
    return model

**Training and testing the binary unigram model**

In [44]:
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [45]:
callbacks = [ModelCheckpoint("binary_1gram.keras", save_best_only=True)]
model.fit(binary_1gram_train.cache(), validation_data=binary_1gram_val.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1eff898a3a0>

In [46]:
model = load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test)[1]:.3f}")

Test acc: 0.500
