In [None]:
!pip install -q transformers datasets accelerate tensorboard spacy tqdm
!python -m spacy download en_core_web_sm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, re, random, numpy as np, pandas as pd, tensorflow as tf, spacy, tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import (BertTokenizerFast,
                          TFBertForSequenceClassification,
                          create_optimizer)
# One seed shared by every random number generator the notebook touches
# (Python's `random`, NumPy and TensorFlow) so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Small English spaCy pipeline with the named-entity recogniser and the
# dependency parser switched off; the remaining components are left enabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])


In [None]:
import pandas as pd

In [None]:
# Load the review data and tidy it in a single chain:
#   * drop "Unnamed: 0" if it is present (presumably a saved index column);
#   * store Rating as integers.
df = (
    pd.read_csv("balanced_data_for_DL.csv")
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .assign(Rating=lambda frame: frame["Rating"].astype(int))
)
# Sanity check: rows per rating class, then the first few rows.
print(df["Rating"].value_counts(), "\n", df.head())


Rating
4    23000
5    23000
2    23000
3    23000
1    23000
Name: count, dtype: int64 
    Rating                                             Review
0       4  I love the fabric of this shirt -- it's super ...
1       5  Very tasty cheese flavor. Because of crunch an...
2       5  My little Yorkie loves these. She eats the swe...
3       2  The product is not bad, but the olive taste is...
4       2  IT WAS FREE WITH A GREAT MEMORY FOAM MATTRESS ...


In [None]:
def spacy_lemma(texts, batch_size=1024):
    """Reduce each text to its lower-cased lemmas, dropping noise tokens.

    Every text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is purely alphabetic (``is_alpha``) and is not a stop
    word (``is_stop``); the kept tokens' lemmas are lower-cased and joined
    with single spaces.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...).
        batch_size: Number of texts spaCy buffers per batch in ``nlp.pipe``.

    Returns:
        list[str]: One cleaned string per input text, in input order. A text
        with no surviving tokens yields an empty string.

    NOTE(review): the cleaned text is later fed to a BERT tokenizer. Stop-word
    removal presumably also strips negations such as "not" -- confirm that
    this is intended for rating prediction.
    """
    # Only sized inputs can give the progress bar a total; for generators and
    # other unsized iterables fall back to an open-ended bar instead of failing.
    try:
        total = len(texts)
    except TypeError:
        total = None

    docs = nlp.pipe(texts, batch_size=batch_size)
    return [
        " ".join(
            tok.lemma_.lower()
            for tok in doc
            if tok.is_alpha and not tok.is_stop
        )
        for doc in tqdm.tqdm(docs, total=total)
    ]

# Blank out missing reviews before casting: astype(str) on its own turns NaN
# into the literal text "nan", which would then be lemmatised like a real word.
df["clean_review"] = spacy_lemma(df["Review"].fillna("").astype(str))


100%|██████████| 115000/115000 [11:45<00:00, 163.09it/s]


In [None]:
# Hold out 20 % of the rows. The split is stratified on Rating and uses the
# shared SEED so it is repeatable.
train_df, test_df = train_test_split(
    df[["clean_review", "Rating"]],
    stratify=df["Rating"],
    test_size=0.2,
    random_state=SEED,
)

# Count of distinct rating values present in the training split.
NUM_LABELS = train_df["Rating"].nunique()


In [None]:
# Fixed sequence length handed to the tokenizer as `max_length`.
# The original author notes that about 95 % of reviews fit within it (not
# re-checked in this notebook) and that truncation keeps training fast.
MAX_LEN = 128

# Fast tokenizer for the same "bert-base-uncased" checkpoint that the model
# cell loads.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def encode(example_batch):
    """Tokenize the ``clean_review`` column of ``example_batch``.

    The column is converted to a plain list of strings and passed to the
    module-level ``tokenizer`` with truncation enabled and padding set to
    ``"max_length"``, both governed by ``MAX_LEN``.

    Args:
        example_batch: Object whose ``["clean_review"]`` item has a
            ``.tolist()`` method (a pandas DataFrame in this notebook).

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    reviews = example_batch["clean_review"].tolist()
    tokenizer_options = {
        "max_length": MAX_LEN,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(reviews, **tokenizer_options)

def to_tf_dataset(df_subset, shuffle=False, batch=32):
    """Build a batched, prefetching ``tf.data.Dataset`` from a review frame.

    Args:
        df_subset: DataFrame with a ``clean_review`` column (consumed by
            ``encode``) and a ``Rating`` column.
        shuffle: When true, shuffle with a buffer as large as the frame,
            seeded with ``SEED``.
        batch: Number of examples per batch.

    Returns:
        A dataset of ``(features, label)`` pairs, where ``features`` is the
        dict form of the ``encode`` output and ``label`` is ``Rating - 1``.
    """
    features = dict(encode(df_subset))
    # Shift ratings down by one so the lowest rating maps to class id 0.
    targets = df_subset["Rating"].values - 1

    dataset = tf.data.Dataset.from_tensor_slices((features, targets))
    if shuffle:
        dataset = dataset.shuffle(len(df_subset), seed=SEED)

    batched = dataset.batch(batch)
    return batched.prefetch(tf.data.AUTOTUNE)

# Materialise both splits with the same batch size. Only the training split is
# shuffled; the held-out split keeps its row order.
train_ds = to_tf_dataset(df_subset=train_df, batch=32, shuffle=True)
test_ds = to_tf_dataset(df_subset=test_df, batch=32, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import create_optimizer
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Number of rating classes the classification head has to output.
NUM_LABELS = df["Rating"].nunique()

# Length of training, used to size the learning-rate schedule.
# `train_ds` is already batched, so its length is the real number of optimizer
# steps per epoch. Counting batches (instead of `len(train_df) // 32`) includes
# a final partial batch and stays correct if the batch size is changed.
# `num_epochs` has to match the `epochs=` value passed to `model.fit`.
num_epochs = 5
steps_per_epoch = len(train_ds)
num_train_steps = steps_per_epoch * num_epochs

# Distribution strategy; the model and optimizer are created inside its scope.
# The original author notes that it falls back to CPU when no GPU is present.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Pretrained BERT encoder with a fresh NUM_LABELS-way classification head.
    # Class id i corresponds to the rating string str(i + 1).
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=NUM_LABELS,
        id2label={i: str(i + 1) for i in range(NUM_LABELS)},
        label2id={str(i + 1): i for i in range(NUM_LABELS)},
    )

    # Optimizer plus its learning-rate schedule: initial rate 2e-5, with the
    # first 10 % of all training steps used for warm-up, and weight decay 0.01.
    optimizer, schedule = create_optimizer(
        init_lr=2e-5,
        num_warmup_steps=int(0.1 * num_train_steps),
        num_train_steps=num_train_steps,
        weight_decay_rate=0.01
    )

    # The loss is told the model outputs are raw logits (from_logits=True) and
    # the labels are integer class ids (sparse loss).
    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"]
    )


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training callbacks.

# Checkpoint: write weights only, and only when validation accuracy reaches a
# new maximum.
ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_bert_model",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
)

# Early stopping: watches validation accuracy with a patience of 2 epochs and
# restores the best weights when it triggers.
early = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    restore_best_weights=True,
    patience=2,
)

# Plateau-based learning-rate reduction (factor 0.5, patience 1, on val_loss).
# It is defined here but is not in the callback list passed to model.fit.
# NOTE(review): the optimizer is built with its own learning-rate schedule,
# which is presumably why this callback is left out -- confirm before enabling.
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    patience=1,
    factor=0.5,
)


In [None]:
# Fine-tune for up to 5 epochs with checkpointing and early stopping.
# NOTE(review): `test_ds` doubles as the validation set, and both callbacks
# monitor val_accuracy, so the held-out split influences which epoch's weights
# are kept. Consider a separate validation split if an unbiased test score is
# needed.
history = model.fit(
    train_ds,
    epochs=5,
    callbacks=[ckpt, early],
    validation_data=test_ds,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5