In [2]:
# !pip install -U transformers accelerate trl bitsandbytes datasets evaluate 
# !pip install -U peft scikit-learn
# !pip install hf_transfer

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, classification_report

In [3]:
# Load the AI-vs-human text detection dataset and materialise each split as a
# pandas DataFrame.
# NOTE(review): the following cell rebinds `dataset` and all three frames from
# 'codefactory4791/amazon_test', and this cell's recorded output ends in a
# KeyboardInterrupt — confirm whether this load is still needed.
dataset = load_dataset('codefactory4791/ai-human-text-detection-balanced')
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

data/train-00000-of-00022.parquet:   0%|          | 0.00/281M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [4]:
# Load 'codefactory4791/amazon_test' and convert every split to a DataFrame.
dataset = load_dataset('codefactory4791/amazon_test')
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

# Integer ids follow the sorted order of the label strings present in train.
labels = sorted(train_dataset['labels'].unique())
label2id = dict(zip(labels, range(len(labels))))

# Attach the integer target next to the string label in each split.
# Labels absent from the training split would map to NaN here.
train_dataset = train_dataset.assign(label_id=train_dataset['labels'].map(label2id))
validation_dataset = validation_dataset.assign(label_id=validation_dataset['labels'].map(label2id))
test_dataset = test_dataset.assign(label_id=test_dataset['labels'].map(label2id))



In [5]:
# Preview the first rows of the training frame (text, string label, integer id).
train_dataset.head()

Unnamed: 0,text,labels,label_id
0,Buying 2nd pair. I was just using the pair I h...,arts-crafts-and-sewing_crafting,0
1,"Not a toy, but a good made in America alternat...",arts-crafts-and-sewing_crafting,0
2,Beautiful (with a bit of elbow grease. Ok out ...,arts-crafts-and-sewing_crafting,0
3,Wow. I wasn't expecting this stuff to be very ...,arts-crafts-and-sewing_crafting,0
4,Absolutely wonderful. I have several matching ...,arts-crafts-and-sewing_crafting,0


In [6]:
# Fixed seed so the shuffled row order is identical on every run.
SEED = 42

# Shuffle each split and rebuild a clean 0..n-1 index.
train_dataset = train_dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)
validation_dataset = validation_dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)
test_dataset = test_dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)


# Inverse-frequency class weights, normalised to sum to 1.
# `sort_index()` orders the counts by label string, which is the same order
# used to assign integer ids (sorted unique training labels), so position i
# of the tensor is the weight of class id i.
class_weights = (1 / train_dataset['labels'].value_counts(normalize=True).sort_index()).tolist()
class_weights = torch.tensor(class_weights)
class_weights = class_weights / class_weights.sum()
class_weights

tensor([0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435,
        0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435, 0.0435,
        0.0435, 0.0435, 0.0435, 0.0435, 0.0435])

In [7]:
from datasets import DatasetDict, Dataset

# Wrap the text and integer-label columns of each split in a Hugging Face Dataset.
dataset_train = Dataset.from_pandas(train_dataset[['text', 'label_id']])
dataset_val = Dataset.from_pandas(validation_dataset[['text', 'label_id']])
dataset_test = Dataset.from_pandas(test_dataset[['text', 'label_id']])

# Bundle the three splits under the keys 'train', 'val' and 'test'.
dataset = DatasetDict({
    'train': dataset_train,
    'val': dataset_val,
    'test': dataset_test,
})

# Short aliases used by the tokenisation and training cells.
train_ds, val_ds, test_ds = dataset['train'], dataset['val'], dataset['test']



In [8]:
# Display the training split's feature names and row count.
train_ds

Dataset({
    features: ['text', 'label_id'],
    num_rows: 322000
})

In [9]:
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification

# 4-bit NF4 quantisation settings with double quantisation and bfloat16 compute.
# NOTE(review): this config is built but never passed to `from_pretrained`
# below, so the model is loaded without quantisation. An earlier, disabled
# variant of this cell passed it together with `device_map='auto'` and a
# hard-coded `num_labels=30` — confirm which behaviour is intended.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Sequence-classification model with one output per label; the id<->label
# mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    label2id=label2id,
    id2label=dict(enumerate(labels)),
)

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter settings (rank 16, alpha 32, 5% dropout, no bias training)
# for a sequence-classification task.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# NOTE(review): `prepare_model_for_kbit_training` is applied even though the
# model in this notebook is loaded without a quantisation config — confirm
# the call is still wanted.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [11]:
from transformers import AutoTokenizer

# Same checkpoint id as the model; repeated here so this cell is self-contained.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
# Reuse the end-of-sequence token as the padding token (both the id and the
# token string are set explicitly).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# presumably turns off the key/value cache, which is only useful for generation — TODO confirm
model.config.use_cache = False
# NOTE(review): confirm this attribute is actually read by the Qwen2 model class.
model.config.pretraining_tp = 1

In [13]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch, truncating to 512 tokens.

    Padding is intentionally NOT applied here. With ``batched=True`` a
    ``padding=True`` call would pad every row to the longest row of its map
    batch, storing the padding on disk and hiding the true sequence lengths
    from length-based batching. Padding is left to the
    ``DataCollatorWithPadding`` passed to the trainer, which pads each
    training batch to its own longest sequence.

    Args:
        batch: Mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (unpadded, truncated sequences).
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=512,
    )
    

# Tokenize both splits in batches.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# The integer target column is renamed to "labels", the key read by the loss.
train_ds = train_ds.rename_column("label_id", "labels")
val_ds = val_ds.rename_column("label_id", "labels")

# Return only the model inputs and targets, as torch tensors.
train_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
val_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/322000 [00:00<?, ? examples/s]

Map:   0%|          | 0/34500 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def compute_metrics(evaluations):
    """Compute accuracy and balanced accuracy from an evaluation result.

    Args:
        evaluations: Pair ``(logits, label_ids)``; ``logits`` is indexed as
            (example, class) and ``label_ids`` holds the true class ids.

    Returns:
        Dict with ``'balanced_accuracy'`` and ``'accuracy'``.
    """
    logits, label_ids = evaluations
    predicted_ids = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Order matters for balanced
    # accuracy, which averages recall over the *true* classes.
    return {
        'balanced_accuracy': balanced_accuracy_score(label_ids, predicted_ids),
        'accuracy': accuracy_score(label_ids, predicted_ids),
    }

In [16]:
import torch
import torch.nn.functional as F
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Build the trainer and store the class weights.

        Args:
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
            class_weights: Optional tensor or sequence with one weight per
                class. ``None`` selects the unweighted loss.
        """
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            # Detach and clone so the stored tensor is independent of the
            # caller's object and carries no autograd history.
            weights = torch.as_tensor(class_weights, dtype=torch.float32).detach().clone()
            self.class_weights = weights.to(self.args.device)

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
        **kwargs,
    ):
        """Compute (optionally class-weighted) cross-entropy for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; not used.
            **kwargs: Accepted and ignored so additional keyword arguments
                from the Trainer do not raise.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The logits may live on a different device than the one recorded at
        # construction time, so align labels and weights with them.
        labels = labels.to(logits.device)
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)

        # weight=None gives the plain, unweighted cross-entropy.
        loss = F.cross_entropy(logits, labels, weight=weight)

        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainingArguments


training_args = TrainingArguments(
    output_dir="intent_classification",
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    # Batching: 16 per device with 4 accumulation steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Mixed precision
    fp16=True,
    # Evaluate and checkpoint on the same 1000-step cadence
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    # Logging
    logging_steps=100,
    report_to="wandb",
)

In [None]:
# Assemble the class-weighted trainer. `compute_metrics` is passed so each
# evaluation reports accuracy and balanced accuracy in addition to the loss.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

trainer.train()

Step,Training Loss,Validation Loss
