In [1]:
from datasets import load_dataset, DatasetDict
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score


# 1) Load datasets
az = load_dataset('mteb/amazon_polarity')
rt = load_dataset("rotten_tomatoes")


def to_tf_dataset(hfds, text_key="text", label_key="label", batch_size=64, shuffle=False):
    # Convert HF columns to plain Python lists / numpy
    texts = list(hfds[text_key])
    labels = np.array(hfds[label_key], dtype=np.int32)

    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(20000, len(texts)), seed=42, reshuffle_each_iteration=True)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

train_movie = to_tf_dataset(rt["train"], shuffle=True)
val_movie  = to_tf_dataset(rt["validation"], shuffle=False)
test_movie  = to_tf_dataset(rt["test"], shuffle=False)

split = az["train"].train_test_split(test_size=0.1, seed=42)
az = DatasetDict({
    "train": split["train"],
    "validation": split["test"],
    "test": az["test"],
})

train_ds = to_tf_dataset(az["train"], shuffle=True)
val_ds   = to_tf_dataset(az["validation"], shuffle=False)
test_ds  = to_tf_dataset(az["test"], shuffle=False)

best_cfg = {"rnn_type":"lstm","embed_dim":128, "rnn_units":256, "dropout":0.4, "lr":1e-3, "bidirectional":True}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def build_rnn_model(
    rnn_type="gru",
    vocab_size=20000,
    seq_len=50,
    embed_dim=128,
    rnn_units=128,
    dropout=0.3,
    lr=1e-3,
    bidirectional=False,
):
    # TextVectorization layer
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        output_sequence_length=seq_len
    )

    # Adapt vectorizer on training text only
    train_text_only = rt["train"]["text"]
    vectorizer.adapt(train_text_only)

    inputs = tf.keras.Input(shape=(1,), dtype=tf.string)
    x = vectorizer(inputs)
    x = tf.keras.layers.Embedding(vocab_size, embed_dim, mask_zero=True)(x)

    RNN = tf.keras.layers.GRU if rnn_type.lower() == "gru" else tf.keras.layers.LSTM
    rnn_layer = RNN(rnn_units)

    if bidirectional:
        x = tf.keras.layers.Bidirectional(rnn_layer)(x)
    else:
        x = rnn_layer(x)

    x = tf.keras.layers.Dropout(dropout)(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(dropout)(x)
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model

In [3]:
def train_one(config,epochs,train_ds,val_ds):
    model = build_rnn_model(**config)
    cb = [
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=2, restore_best_weights=True
        )
    ]
    print('starting training ...')
    hist = model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=cb, verbose=2)
    val_best = max(hist.history["val_accuracy"])
    return model, hist, val_best

In [None]:
movie_model = tf.keras.models.load_model('best_model.keras')

## train movie model initial

In [None]:
movie_model,hist,val_best = train_one(config=best_cfg,epochs=10,train_ds=train_movie,val_ds=val_movie)

starting training ...
Epoch 1/10


## Movie model on amazon data (initial, no training/finetuning)

In [None]:
y_true = test_ds
y_prob = movie_model.predict(test_ds)

y_pred = np.argmax(y_prob, axis=1)

print(classification_report(y_true, y_pred))

print("f1 (macro):", f1_score(y_true, y_pred, average="macro"))
print("recall (macro):", recall_score(y_true, y_pred, average="macro"))
print("precision (macro):", precision_score(y_true, y_pred, average="macro"))

## finetune model

In [None]:
def finetune(
    model,
    train_ds,
    *,
    val_ds=None,
    limit=1000,
    epochs=1,
    batch_size=32,
    shuffle=True,
    seed=42,
    monitor="val_accuracy",
    mode="max",
    best_weights_path="best.weights.h5",
    **fit_kwargs,
):
    train_ds = train_ds.take(limit)
    if shuffle:
        train_ds = train_ds.shuffle(min(limit, 10_000), seed=seed, reshuffle_each_iteration=True)
    train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    if val_ds is None:
        return model.fit(train_ds, epochs=epochs, **fit_kwargs)

    val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    ckpt = tf.keras.callbacks.ModelCheckpoint(
        filepath=best_weights_path,
        monitor=monitor,
        mode=mode,
        save_best_only=True,
        save_weights_only=True,
        verbose=0,
    )

    callbacks = list(fit_kwargs.pop("callbacks", []) or [])
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=callbacks + [ckpt],
        **fit_kwargs,
    )

    model.load_weights(best_weights_path)
    return model,history

model, hist = finetune(movie_model, train_ds=train_ds, val_ds=val_ds, limit=250, epochs=10, batch_size=64)

Epoch 1/12


ValueError: Exception encountered when calling TextVectorization.call().

[1mWhen using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2[0m

Arguments received by TextVectorization.call():
  • inputs=tf.Tensor(shape=(None, None), dtype=string)

## train new model on amazon reviews

In [None]:
amazon_model,hist,val_best = train_one(config=best_cfg,epochs=10,train_ds=train_ds,val_ds=val_ds)
val_best