## TFIDF

In [14]:
import os, pathlib, shutil, random
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import tensorflow as tf

In [7]:
batch_size = 32

train = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


**Model Definition**

In [3]:
def get_model(max_tokens=20000, hidden_dim=16):
    
    inputs = Input(shape=(max_tokens,))
    x = Dense(hidden_dim, activation="relu")(inputs)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation="sigmoid")(x)
    model = Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    
    return model

**Training and testing the binary Bigram model**

In [4]:
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


#### Bigrams with TF-IDF encoding

In [8]:
text_vectorization = TextVectorization(ngrams=2, max_tokens=20000, output_mode="tf_idf")
text_only_train = train.map(lambda x, y: x)
text_vectorization.adapt(text_only_train)

**Training and testing the TF-IDF bigram model**

In [9]:
tfidf_2gram_train = train.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_val = validation.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_test = test.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [10]:
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [12]:
callbacks = [ModelCheckpoint("tfidf_2gram.keras", save_best_only=True)]
model.fit(tfidf_2gram_train.cache(), validation_data=tfidf_2gram_val.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2557427fca0>

In [13]:
model = load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test)[1]:.3f}")

Test acc: 0.500


In [15]:
inputs = Input(shape=(1,), dtype="string")
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = Model(inputs, outputs)

In [16]:
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data)
print(f"{float(predictions[0] * 100):.2f} percent positive")

100.00 percent positive
