In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from torch.nn import functional as F
from torch import nn

2025-08-21 17:43:58.002279: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755798238.183012      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755798238.242017      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# =========================
# Cell 1 — Data + labels (mirror teacher encoding)
# =========================
import pandas as pd
import torch, random, numpy as np
from sklearn.model_selection import train_test_split
from transformers import set_seed, AutoConfig

# --- paths & columns ---
DATA_PATH   = r"/kaggle/input/edaaaaa-tergol5/my_eda.csv"
TEXT_COL    = "Tweet"
TARGET_COL  = "Sentiment"
TEACHER_DIR = "/kaggle/input/covidbertttttttttt/pytorch/default/1"  # <-- use this

# --- seeds ---
SEED = 42
set_seed(SEED)
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

MAX_LENGTH = 128

# --------- Load CSV ---------
df = pd.read_csv(DATA_PATH, encoding="latin1")

# Keep original text if present; use normalized for training
if "Tweet" in df.columns:
    df = df.rename(columns={"Tweet": "Original"})
if "normalized_tweet" in df.columns:
    df = df.rename(columns={"normalized_tweet": "Tweet"})

assert TEXT_COL in df.columns and TARGET_COL in df.columns, f"Missing {TEXT_COL}/{TARGET_COL} in CSV."

# --------- Load teacher label maps from the valid folder ---------
cfg = AutoConfig.from_pretrained(TEACHER_DIR)
label2id = dict(cfg.label2id or {})
id2label = dict(cfg.id2label or {})

# If the teacher config doesn't define them, derive from data (stable order)
if not label2id or not id2label:
    uniq = sorted(df[TARGET_COL].astype(str).unique())
    label2id = {lbl: i for i, lbl in enumerate(uniq)}
    id2label = {i: lbl for lbl, i in label2id.items()}

def to_label_id(y):
    # pass-through if already an int id
    try:
        yi = int(y)
        if yi in id2label:
            return yi
    except Exception:
        pass
    ys = str(y)
    if ys not in label2id:
        raise ValueError(f"Label '{y}' not in mapping: {list(label2id.keys())}")
    return label2id[ys]

df["label"] = df[TARGET_COL].apply(to_label_id)
num_labels = len(id2label)

print("Teacher label2id:", label2id)
print("id2label:", id2label)
print("num_labels =", num_labels)
print("Label dist:\n", df["label"].value_counts().sort_index())

# --------- Stratified split ---------
train_df, eval_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)
print("Train/Eval sizes:", len(train_df), len(eval_df))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Teacher label2id: {'Extremely Negative': 0, 'Extremely Positive': 1, 'Negative': 2, 'Neutral': 3, 'Positive': 4}
id2label: {0: 'Extremely Negative', 1: 'Extremely Positive', 2: 'Negative', 3: 'Neutral', 4: 'Positive'}
num_labels = 5
Label dist:
 label
0     5480
1     6619
2     9907
3     7673
4    11413
Name: count, dtype: int64
Train/Eval sizes: 32873 8219


In [3]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score

# =========================
# 2) Build HF DatasetDict
# =========================
cols = [TEXT_COL, "label"]
train_ds = Dataset.from_pandas(train_df[cols].rename(columns={"label": "labels"}), preserve_index=False)
eval_ds  = Dataset.from_pandas(eval_df[cols].rename(columns={"label": "labels"}),  preserve_index=False)
raw_datasets = DatasetDict({"train": train_ds, "test": eval_ds})

# =========================
# 3) Tokenizer (teacher) & tokenization
# =========================
teacher_NAME = "/kaggle/input/covidbertttttttttt/pytorch/default/1"  # <-- set your student here
tokenizer = AutoTokenizer.from_pretrained(teacher_NAME, use_fast=True)

MAX_LENGTH = 96  # or whatever you used in Cell 1

def tokenize_function(batch):
    return tokenizer(
        batch[TEXT_COL],   # uses TEXT_COL from global
        truncation=True,
        max_length=MAX_LENGTH,
    )

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=[TEXT_COL],
)

# Ensure expected columns
assert "labels" in tokenized_datasets["train"].column_names

# Dynamic padding collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Torch format
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# =========================
# Metrics (5-class safe)
# =========================
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):  # some trainers return (logits, ...)
        preds = preds[0]
    preds = preds.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }

tokenized_datasets


Map:   0%|          | 0/32873 [00:00<?, ? examples/s]

Map:   0%|          | 0/8219 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 32873
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8219
    })
})

In [4]:
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/finetuned-bertweet/pytorch/default/1/finetuned_from_old").to(device)
print(device)

cuda


In [5]:
# define compute_metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    accuracy = (preds == labels).astype(float).mean().item()
    return {"accuracy": accuracy}




In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,  # nice for fp16
)

args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=1,
    save_strategy="no",
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    report_to=[],
    logging_strategy="no",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].select(range(100)),
    eval_dataset=tokenized_datasets["test"].select(range(100)),
    compute_metrics=compute_metrics,
    data_collator=data_collator,   # <<<<<< IMPORTANT
)

trainer.train()
metrics = trainer.evaluate()             # <- show eval after training
print(metrics)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.569053,0.23


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_loss': 1.5690526962280273, 'eval_accuracy': 0.23, 'eval_runtime': 0.9739, 'eval_samples_per_second': 102.675, 'eval_steps_per_second': 2.054, 'epoch': 1.0}


In [7]:
# Train a small student model using a larger teacher
teacher = model
student = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=5) # Smaller model of my choice

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, temperature=2.0, alpha=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.temperature = temperature
        self.alpha = alpha

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        outputs_student = model(**inputs)
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)

        loss_ce = F.cross_entropy(outputs_student.logits, inputs["labels"])
        loss_kl = F.kl_div(
            F.log_softmax(outputs_student.logits / self.temperature, dim=-1),
            F.softmax(outputs_teacher.logits / self.temperature, dim=-1),
            reduction="batchmean") * (self.temperature ** 2)
        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kl
        
        return (loss, outputs_student) if return_outputs else loss

In [9]:
# revert to Python lists (no framework formatting)
tokenized_datasets.reset_format()          # <- or: tokenized_datasets.set_format(type=None)
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None,
)

trainer_distill = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer_distill.train()
print("\nDistillation complete. Student model trained.")
print("Student model size:", sum(p.numel() for p in student.parameters()))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.513866,0.834043


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Distillation complete. Student model trained.
Student model size: 66957317


In [10]:
OUTPUT_DIR = "./student_model"

student.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved student model + tokenizer to", OUTPUT_DIR)


Saved student model + tokenizer to ./student_model


In [11]:
import os, shutil

# 1. Pick an output folder
OUTPUT_DIR = "/kaggle/working/student_model"

# 2. Save model + tokenizer
student.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# 3. Zip it up
shutil.make_archive(OUTPUT_DIR, "zip", OUTPUT_DIR)

print("Saved + zipped to:", OUTPUT_DIR + ".zip")


Saved + zipped to: /kaggle/working/student_model.zip
