In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
from datetime import datetime
from typing import Optional

import datasets
import torch
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [4]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


In [5]:
from datasets import load_dataset
import evaluate

accuracy_metric = evaluate.load("accuracy")
raw_datasets = load_dataset("ag_news")

## Tokenizer

In [6]:
# model_name = "meta-llama/Meta-Llama-3-8B"
model_name = "albert-base-v2"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [8]:
MAX_LEN = 64
col_to_delete = ['text']

def llama_preprocessing_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=MAX_LEN,padding='max_length')

tokenized_datasets = raw_datasets.map(llama_preprocessing_function, batched=True,remove_columns=['text'])
tokenized_datasets.set_format("torch")

In [9]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

def create_dataloaders(train_batch_size=256, eval_batch_size=256):
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=train_batch_size, collate_fn=collate_fn, num_workers=4
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["test"], shuffle=False, batch_size=eval_batch_size, collate_fn = collate_fn, num_workers=4
    )
    return train_dataloader, eval_dataloader

In [10]:
train_dl, test_dl = create_dataloaders()

## Model

In [11]:


def create_model1(model_name):
    from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
    
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True, # enable 4-bit quantization
        bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
        bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
        bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
    )
    
    
    lora_config = LoraConfig(
        r = 16, # the dimension of the low-rank matrices
        lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout = 0.05, # dropout probability of the LoRA layers
        bias = 'none', # wether to train bias weights, set to 'none' for attention layers
        task_type = 'SEQ_CLS'
    )
        
    
    
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        num_labels=4
    )
    
    
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    return model


In [17]:
import lightning as L
from typing import Optional
import evaluate
import torchmetrics

In [27]:
class TextClassifier(L.LightningModule):
    def __init__(self, 
        model_name_or_path:str,
        num_labels:str,
                learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None
        ):
        super().__init__()
        self.save_hyperparameters()

        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels = num_labels)
        quantization_config = BitsAndBytesConfig(
            load_in_4bit = True, # enable 4-bit quantization
            bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
            bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
            bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
        )
    
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, 
                                                                        quantization_config=quantization_config,
                                                                        config=self.config)
        self.metric = evaluate.load(
            "accuracy")

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss, logits = outputs[:2]

        preds = torch.argmax(logits, axis=1)
        labels = batch['labels']
        
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)
        
        return loss

    def on_training_step_end(self):
        
    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        if self.hparams.num_labels > 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        # log step metric
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)

        return {"val_loss": val_loss, "preds": preds, "labels": labels}

    # def on_validation_epoch_end(self, outputs):
    #     preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
    #     labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
    #     loss = torch.stack([x["loss"] for x in outputs]).mean()
    #     self.log("val_loss", loss, prog_bar=True)
    #     self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

In [28]:
model = TextClassifier(
    model_name_or_path="albert-base-v2",
    num_labels = 4
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
trainer = L.Trainer(
    max_epochs=1,
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
    precision="bf16",
    log_every_n_steps=20
)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [30]:
trainer.fit(model, train_dataloaders=train_dl, val_dataloaders=test_dl)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type                            | Params | Mode
-----------------------------------------------------------------
0 | model | AlbertForSequenceClassification | 7.8 M  | eval
-----------------------------------------------------------------
3.9 M     Trainable params
3.9 M     Non-trainable params
7.8 M     Total params
31.215    Total estimated model params size (MB)


Sanity Checking: |                                                                    | 0/? [00:00<?, ?it/s]

Training: |                                                                           | 0/? [00:00<?, ?it/s]

/home/puneet/venv/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
