In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [None]:
!pip -q install "transformers>=4.42.0" "datasets>=2.19.0" accelerate scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/

In [9]:
import os, random, numpy as np, torch
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from sklearn.metrics import f1_score, accuracy_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, Trainer, TrainingArguments, set_seed)

# Config
SEED = 42; set_seed(SEED)
TEXT, LABEL = "text", "label"
NUM_LABELS = 2
MAX_LEN = 192
SUBSET_SIZE = 1000
EPOCHS_SUB = 2
EPOCHS_FULL = 5
BS = 16
LR = 2e-5

MODEL_IDS = {
    "bert"     : "bert-base-uncased",
    "roberta"  : "roberta-base",
    "deberta"  : "microsoft/deberta-v3-base",
    "electra"  : "google/electra-base-discriminator",
    "distilbert": "distilbert-base-uncased",
}

WK = "/kaggle/working" if os.path.exists("/kaggle/working") else "."
OUT = os.path.join(WK, "runs"); BEST_DIR = os.path.join(WK, "best")
os.makedirs(OUT, exist_ok=True); os.makedirs(BEST_DIR, exist_ok=True)

# Metrics
def f1_macro(y_true, y_pred): 
    return f1_score(y_true, y_pred, average="macro", zero_division=0)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {"accuracy": float(accuracy_score(labels, preds)),
            "f1_macro": float(f1_macro(labels, preds))}

# Data loader
def load_imdb() -> DatasetDict:
    kaggle_csv = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    try:
        ds = load_dataset("imdb")
        split = ds["train"].train_test_split(test_size=0.1, seed=SEED, stratify_by_column="label")
        return DatasetDict(train=split["train"], validation=split["test"], test=ds["test"])
    except Exception:
        import pandas as pd
        if not os.path.exists(kaggle_csv):
            raise FileNotFoundError("Add 'IMDB Dataset of 50K Movie Reviews' to Inputs or enable internet.")
        df = pd.read_csv(kaggle_csv)
        df["label"] = (df["sentiment"].str.lower()=="positive").astype(int)
        df.rename(columns={"review":"text"}, inplace=True)
        df = df[[TEXT, LABEL]]
        from sklearn.model_selection import train_test_split
        tr, tmp = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df[LABEL])
        va, te = train_test_split(tmp, test_size=0.5, random_state=SEED, stratify=tmp[LABEL])
        return DatasetDict(
            train=Dataset.from_pandas(tr.reset_index(drop=True)),
            validation=Dataset.from_pandas(va.reset_index(drop=True)),
            test=Dataset.from_pandas(te.reset_index(drop=True)),
        )

# Subset
def stratified_subset(dataset, n, label_col=LABEL):
    if n >= len(dataset): return dataset
    labels = dataset[label_col]; classes = sorted(set(labels))
    per = max(1, n // len(classes))
    idx_by_c = {c: [] for c in classes}
    for i, lab in enumerate(labels): idx_by_c[lab].append(i)
    rng = np.random.default_rng(SEED); chosen = []
    for c in classes:
        ii = idx_by_c[c]; rng.shuffle(ii); chosen += ii[:per]
    rem = n - len(chosen)
    if rem > 0:
        rest = list(set(range(len(dataset))) - set(chosen)); rng.shuffle(rest); chosen += rest[:rem]
    return dataset.select(sorted(chosen))

def tokenize(batch, tok): 
    return tok(batch[TEXT], truncation=True, max_length=MAX_LEN)


In [10]:
raw = load_imdb()
small_train = stratified_subset(raw["train"], SUBSET_SIZE)
ds_small = DatasetDict(train=small_train, validation=raw["validation"])
print("Train small:", len(ds_small["train"]), "| Val:", len(ds_small["validation"]), "| Test:", len(raw["test"]))


Train small: 1000 | Val: 2500 | Test: 25000


In [11]:
def train_on_subset(model_id: str):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok_ds = ds_small.map(lambda x: tokenize(x, tok), batched=True, remove_columns=[TEXT])
    coll = DataCollatorWithPadding(tokenizer=tok)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=NUM_LABELS)
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    args = TrainingArguments(
        output_dir=os.path.join(OUT, model_id.replace("/", "_")+"_subset"),
        per_device_train_batch_size=BS,
        per_device_eval_batch_size=BS,
        num_train_epochs=EPOCHS_SUB,
        learning_rate=LR,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=50,
        save_strategy="no",
    )
    tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"], 
                 eval_dataset=tok_ds["validation"], tokenizer=tok, 
                 data_collator=coll, compute_metrics=compute_metrics)
    tr.train()
    m = tr.evaluate(tok_ds["validation"])
    return {"id": model_id, "f1_macro": m["eval_f1_macro"]}

def train_full(model_id: str):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok_ds = raw.map(lambda x: tokenize(x, tok), batched=True, remove_columns=[TEXT])
    coll = DataCollatorWithPadding(tokenizer=tok)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=NUM_LABELS)
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    merged = concatenate_datasets([tok_ds["train"], tok_ds["validation"]])
    args = TrainingArguments(
        output_dir=BEST_DIR,
        per_device_train_batch_size=BS,
        per_device_eval_batch_size=BS,
        num_train_epochs=EPOCHS_FULL,
        learning_rate=LR,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=50,
        save_strategy="no",
    )
    tr = Trainer(model=model, args=args, train_dataset=merged, 
                 eval_dataset=tok_ds["test"], tokenizer=tok, 
                 data_collator=coll, compute_metrics=compute_metrics)
    tr.train()
    test_metrics = tr.evaluate(tok_ds["test"])
    tr.save_model(BEST_DIR); tok.save_pretrained(BEST_DIR)
    return {"tok": tok, "test": test_metrics}


In [12]:
res_bert = train_on_subset(MODEL_IDS["bert"])
print("BERT:", res_bert)


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],


Step,Training Loss
50,0.5448


BERT: {'id': 'bert-base-uncased', 'f1_macro': 0.8495074410565475}


In [13]:
res_roberta = train_on_subset(MODEL_IDS["roberta"])
print("RoBERTa:", res_roberta)

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],


Step,Training Loss
50,0.605


RoBERTa: {'id': 'roberta-base', 'f1_macro': 0.8935975484875172}


In [14]:
res_deberta = train_on_subset(MODEL_IDS["deberta"])
print("DeBERTa:", res_deberta)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],


Step,Training Loss
50,0.4959


DeBERTa: {'id': 'microsoft/deberta-v3-base', 'f1_macro': 0.9115937620558506}


In [13]:
# Drop this in a new cell BEFORE training ModernBERT
import os, torch
os.environ["TORCHDYNAMO_DISABLE"] = "1"   # fully disable torch.compile/dynamo

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments

def train_on_subset_modernbert(model_id):
    # clone of your train_on_subset but with compile turned off in config
    cfg = AutoConfig.from_pretrained(model_id)
    for attr in ("reference_compile", "torch_compile", "use_compiled_kernels"):
        if hasattr(cfg, attr):
            setattr(cfg, attr, False)

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok_ds = ds_small.map(lambda x: tokenize(x, tok), batched=True, remove_columns=[TEXT])
    coll = DataCollatorWithPadding(tokenizer=tok)

    model = AutoModelForSequenceClassification.from_pretrained(model_id, config=cfg, num_labels=NUM_LABELS)
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    args = TrainingArguments(
        output_dir=os.path.join(OUT, model_id.replace("/", "_")+"_subset"),
        per_device_train_batch_size=BS,
        per_device_eval_batch_size=BS,
        num_train_epochs=EPOCHS_SUB,
        learning_rate=LR,
        fp16=torch.cuda.is_available(),
        report_to="none",
        seed=SEED,
        logging_steps=50,
        save_strategy="no",
    )
    tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],
                 eval_dataset=tok_ds["validation"], tokenizer=tok,
                 data_collator=coll, compute_metrics=compute_metrics)
    tr.train()
    m = tr.evaluate(tok_ds["validation"])
    return {"id": model_id, "f1_macro": m["eval_f1_macro"]}


In [16]:
res_electra = train_on_subset(MODEL_IDS["electra"])
print("Electra:", res_electra)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],


Step,Training Loss
50,0.6063


Electra: {'id': 'google/electra-base-discriminator', 'f1_macro': 0.8838926982760388}


In [17]:
res_distilbert = train_on_subset(MODEL_IDS["distilbert"])
print("DistilBERT:", res_distilbert)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=tok_ds["train"],


Step,Training Loss
50,0.6383


DistilBERT: {'id': 'distilbert-base-uncased', 'f1_macro': 0.8367895545314901}


In [18]:
# Collect only the results you actually ran
all_res = []
for r in [globals().get("res_bert"),
          globals().get("res_roberta"),
          globals().get("res_deberta"),
          globals().get("res_electra"),
          globals().get("res_distilbert")]:
    if isinstance(r, dict) and "f1_macro" in r:
        all_res.append(r)

# If you only ran DistilBERT, this will just pick that.
best = max(all_res, key=lambda d: d["f1_macro"])
best_id = best["id"]
print("Best on subset:", best_id, "| f1_macro:", best["f1_macro"])


Best on subset: microsoft/deberta-v3-base | f1_macro: 0.9115937620558506


In [20]:
from transformers import pipeline, AutoTokenizer

# assumes you've already run the "pick best" cell and have best_id
print("Sanity-checking pipeline on:", best_id)

clf_pipeline = pipeline(
    task="text-classification",
    model=best_id,                 # if you didn't save subset fine-tune, this loads base weights
    tokenizer=best_id,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    return_all_scores=True,
)

# Try a few validation samples just to confirm everything runs end-to-end
sample_idxs = list(range(len(ds_small["validation"])))[:5]
texts  = [ds_small["validation"][i][TEXT]  for i in sample_idxs]

preds = clf_pipeline(texts, batch_size=16)
for i, (t, pr) in enumerate(zip(texts, preds), 1):
    scores = {d["label"]: d["score"] for d in pr}
    p_neg = float(scores.get("LABEL_0", scores.get("NEGATIVE", 0.0)))
    p_pos = float(scores.get("LABEL_1", scores.get("POSITIVE", 0.0)))
    pred  = "positive" if p_pos >= p_neg else "negative"
    conf  = max(p_neg, p_pos)
    snippet = t if len(t) < 180 else t[:177] + "..."
    print(f"\n[{i}] {snippet}\nPred: {pred} | Conf: {conf:.3f} | P(neg)={p_neg:.3f}, P(pos)={p_pos:.3f}")


Sanity-checking pipeline on: microsoft/deberta-v3-base


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



[1] I have watched 3 episodes of Caveman, and I have no idea why I continue except maybe waiting for it to get better. <br /><br />To me this show is just pumping itself off the com...
Pred: negative | Conf: 0.517 | P(neg)=0.517, P(pos)=0.483

[2] The fluttering of butterfly wings in the Atlantic can unleash a hurricane in the Pacific. According to this theory (somehow related to the Chaos Theory, I'm not sure exactly how...
Pred: negative | Conf: 0.515 | P(neg)=0.515, P(pos)=0.485

[3] Judging from this film and THE STRONG MAN, made the same year, I would not place Harry Langdon at the top of the list of great silent screen comedians. There simply is not enoug...
Pred: negative | Conf: 0.517 | P(neg)=0.517, P(pos)=0.483

[4] Earlier today I got into an argument on why so many people complain about modern films in which I encountered a curious statement: "the character development in newer movies jus...
Pred: negative | Conf: 0.520 | P(neg)=0.520, P(pos)=0.480

[5] this movie is such 

In [None]:
bundle = train_full(best_id)
print("Final TEST metrics:", bundle["test"])

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=merged,


Step,Training Loss
50,0.5452
100,0.2809


In [None]:
from transformers import pipeline

# Use the just-saved model + tokenizer directory
MODEL_PATH = BEST_DIR

clf_pipeline = pipeline(
    task="text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    return_all_scores=True,
)
clf_pipeline


In [None]:
import random, numpy as np

# sample 10 texts from the test split you loaded earlier
idxs = list(range(len(raw["test"])))
random.Random(SEED).shuffle(idxs)
idxs = idxs[:10]

texts  = [raw["test"][i][TEXT]  for i in idxs]
labels = [raw["test"][i][LABEL] for i in idxs]  # 0=neg, 1=pos

# batch inference via pipeline
preds = clf_pipeline(texts, batch_size=32)

for i, (t, pr, y) in enumerate(zip(texts, preds, labels), 1):
    # pr is a list like [{'label': 'LABEL_0', 'score': ...}, {'label': 'LABEL_1', 'score': ...}]
    scores = {d["label"]: d["score"] for d in pr}
    p_neg = float(scores.get("LABEL_0", scores.get("NEGATIVE", 0.0)))
    p_pos = float(scores.get("LABEL_1", scores.get("POSITIVE", 0.0)))
    pred  = 1 if p_pos >= p_neg else 0
    conf  = max(p_neg, p_pos)

    snippet = t if len(t) < 280 else t[:277] + "..."
    print(f"\n[{i}] {snippet}")
    print(f"Pred: {'positive' if pred==1 else 'negative'} | True: {'positive' if y==1 else 'negative'}")
    print(f"Confidence: {conf:.3f} | P(neg)={p_neg:.3f}, P(pos)={p_pos:.3f}")
