In [None]:
!pip install adapters datasets transformers
!pip install accelerate
!pip install wandb
!pip install -i https://pypi.org/simple/ bitsandbytes

Collecting adapters
  Downloading adapters-1.0.0-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.7/43.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import pandas as pd
import wandb
###Step1: Model Initialization
#1.1 Initialize a new wandb run
# Starts the run under project "QLoRA_New"; later wandb.log / wandb.finish calls refer to it.
wandb.init(project="QLoRA_New", entity="siyinggu-nyu")

#1.2 modelpath for distilbert
# The same identifier is reused further down to load the matching tokenizer.
modelpath = "distilbert-base-uncased"

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
#1.3 Load 4-bit quantized model
# Sequence-classification model with a 3-way output, loaded with the quantization config above.
model = AutoModelForSequenceClassification.from_pretrained(
    modelpath,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    num_labels=3,  # Update this if you have a different number of labels
    low_cpu_mem_usage=True
)
# NOTE(review): sets `use_cache` on the model config; whether this architecture
# reads that flag is not visible from this notebook -- confirm it is needed.
model.config.use_cache = False

#1.4 Initialize lora config
import adapters
from adapters import LoRAConfig

# Called on the already-loaded model before any adapter is added
# (presumably this is what enables the add_adapter / train_adapter methods used below -- confirm).
adapters.init(model)

# LoRA is enabled on the self-attention, intermediate and output layers;
# within attention it targets the q, k and v matrices. Rank 64, alpha 16, dropout 0.1.
config = LoRAConfig(
    selfattn_lora=True, intermediate_lora=True, output_lora=True,
    attn_matrices=["q", "k", "v"],
    alpha=16, r=64, dropout=0.1
)
# Register the adapter under the name "assistant_adapter", then select it for training.
model.add_adapter("assistant_adapter", config=config)
model.train_adapter("assistant_adapter")

# Prints a table of adapter names, parameter counts and active/train flags.
print(model.adapter_summary())

#1.5 Promote small parameters to full precision
# Every one-dimensional parameter (e.g. layernorm) is converted to fp32 for stability.
for small_param in (p for p in model.parameters() if p.ndim == 1):
    small_param.data = small_param.data.to(torch.float32)


class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose result is always converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as float32."""
        wrapped_output = super().forward(x)
        return wrapped_output.to(torch.float32)
# Wrap the classification head so its output is returned as float32.
model.classifier = CastOutputToFloat(model.classifier)

print(model)

# Verifying the datatypes: count parameter elements per dtype and print each
# dtype with its element count and its share of the total.
dtypes = {}
for _, p in model.named_parameters():
    dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
total = sum(dtypes.values())
for k, v in dtypes.items():
    print(k, v, v / total)

import os

###Step2: Data preprocessing and tokenize
from datasets import load_dataset
try:
    # `load_metric` is not used in the visible cells and was removed in
    # datasets 3.0; guard the import so a newer install does not abort this
    # cell, while keeping the name defined for any code that expects it.
    from datasets import load_metric
except ImportError:
    load_metric = None
# Download / load the sentiment dataset from the Hugging Face Hub.
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

##2.1: Clean dataset
#: Function to clean text
import re
def clean_text(text):
    """Normalize one piece of raw text.

    Applies, in order: URL removal, replacement of every non-word character
    with a space, collapsing of whitespace runs into a single space,
    lower-casing, and trimming of leading/trailing whitespace.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r'http\S+', ''),   # drop URLs
        (r'\W', ' '),       # special characters become spaces
        (r'\s+', ' '),      # squeeze repeated whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower().strip()

# Function to clean the entire dataset
def clean_dataset(dataset):
    """Return ``dataset`` with every row's ``'text'`` field passed through ``clean_text``.

    Args:
        dataset: Object exposing ``map``; each row must provide a ``'text'`` entry.

    Returns:
        Whatever ``dataset.map`` returns for the per-row cleaning function.
    """
    def _clean_row(row):
        # Only the 'text' column is rewritten
        return {'text': clean_text(row['text'])}

    return dataset.map(_clean_row)

cleaned_dataset = clean_dataset(dataset)
print(cleaned_dataset)


##2.2: Tokenize dataset
# Use the GPU when torch can see one, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
tokenizer = AutoTokenizer.from_pretrained(modelpath)
# Register a '[PAD]' token only if the loaded tokenizer does not define one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize function
def tokenize_func(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Inputs longer than 512 tokens are truncated. No padding is applied here:
    padding every row to 512 tokens forces each forward pass to process all
    512 positions and leaves nothing for a padding collator or length-grouped
    batching to do. Padding is left to the batch collator instead.

    Args:
        examples: Mapping with a ``'text'`` entry (a string or a list of strings).

    Returns:
        The tokenizer output for the given text, truncated to at most 512
        tokens and not padded.
    """
    return tokenizer(
        examples['text'],
        max_length=512,
        truncation=True
    )

# Tokenize the training, validation and test data with identical settings
train_dataset, val_dataset, test_dataset = [
    cleaned_dataset[split_name].map(tokenize_func, batched=True)
    for split_name in ('train', 'validation', 'test')
]

# Define the format for labels to ensure they match input size
def format_labels(examples):
    """Copy the ``'label'`` entry of ``examples`` to a ``'labels'`` entry.

    The mapping is modified in place and the same object is returned.

    Args:
        examples: Mutable mapping containing a ``'label'`` entry.

    Returns:
        ``examples`` itself, now also holding ``'labels'``.
    """
    label_values = examples['label']
    examples['labels'] = label_values
    return examples

# Add the 'labels' column to each split
train_dataset, val_dataset, test_dataset = [
    split.map(format_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]


###Step3: Model Training and Evaluation
# Training hyperparameters. One optimizer step accumulates 16 single-example
# micro-batches; evaluation runs every 187 steps and training stops after 1875 steps.
args = TrainingArguments(
    output_dir="output/distilbert_qlora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    logging_steps=10,
    save_steps=500,
    eval_steps=187,
    save_total_limit=3,
    gradient_accumulation_steps=16,
    max_steps=1875,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    bf16=True,
    # NOTE(review): with lr_scheduler_type="constant" this warmup setting is
    # presumably ignored ("constant_with_warmup" is the warm-up variant) -- confirm.
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    report_to="wandb"  # Enable logging to wandb
)

# Record the hyperparameters on the active wandb run.
# `wandb.config` must be updated, not reassigned: assigning a dict replaces the
# run's config object with a plain dict, so nothing is sent to wandb. The values
# are read from `args` so the logged configuration cannot drift from what is
# actually used for training.
wandb.config.update(
    {
        "learning_rate": args.learning_rate,
        "max_steps": args.max_steps,
        # examples contributing to one optimizer step on a single device
        "batch_size": args.per_device_train_batch_size * args.gradient_accumulation_steps,
        "model_name": modelpath,
        # plain dicts rather than config objects, so the values are serializable
        "quantization_config": bnb_config.to_dict(),
        "lora_config": config.to_dict(),
    },
    allow_val_change=True,  # lets the cell be re-run within the same wandb run
)

from adapters import AdapterTrainer
from transformers import DataCollatorWithPadding
from sklearn.metrics import roc_auc_score
import time
import psutil

# Collator that pads the examples of each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Custom function to log time and memory usage
def log_time_memory():
    """Log the current wall-clock time and used system memory to wandb.

    Logs two values: ``"time"`` (seconds since the epoch, from ``time.time()``)
    and ``"memory_usage"`` (used virtual memory as reported by ``psutil``,
    converted from bytes to MB).
    """
    now = time.time()
    bytes_per_mb = 1024 ** 2
    used_mb = psutil.virtual_memory().used / bytes_per_mb
    wandb.log({"time": now, "memory_usage": used_mb})

def metrics(eval_prediction):
    """Compute evaluation metrics from the trainer's predictions.

    The ROC AUC score used to be reported under the key ``"accuracy"``, which
    mislabelled it. Real accuracy is now reported under ``"accuracy"`` and the
    ROC AUC under its own ``"roc_auc"`` key.

    Args:
        eval_prediction: Pair ``(logits, labels)``. The softmax is taken over
            axis 1 of ``logits``, so one row per example and one column per
            class is expected.

    Returns:
        dict with:
            ``"accuracy"``: fraction of examples whose highest-probability
                class equals the label.
            ``"roc_auc"``: one-vs-rest ROC AUC computed from the softmax
                probabilities.
    """
    logits, labels = eval_prediction
    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()
    # Predicted class = most probable column of each row
    predicted = probs.argmax(axis=1)
    accuracy = float((predicted == labels).mean())
    auc_score = roc_auc_score(labels, probs, multi_class='ovr')
    return {"accuracy": accuracy, "roc_auc": auc_score}


# Build the trainer from the pieces prepared above: the adapter-enabled model,
# the tokenizer and padding collator, the train/validation splits, the training
# arguments and the metric function. The test split is not passed to the trainer here.
trainer = AdapterTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=args,
    compute_metrics=metrics
)

# One time/memory sample is logged before training starts; it is not called again afterwards.
log_time_memory()
# Order matters from here on: train, then save, then close the wandb run.
trainer.train()
trainer.save_model()
wandb.finish()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
assistant_adapter        lora              4,718,592      10.456       1       1
--------------------------------------------------------------------------------
Full model                                45,129,216     100.000               0
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinear4bit(
 

Downloading readme:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/601k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/586k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
187,0.7159,0.715433,0.857442
374,0.6637,0.678551,0.872955


Step,Training Loss,Validation Loss,Accuracy
187,0.7159,0.715433,0.857442
374,0.6637,0.678551,0.872955
561,0.6898,0.656256,0.879506
748,0.7074,0.649278,0.883264
935,0.6035,0.653124,0.885744
1122,0.6173,0.636946,0.886622
1309,0.6508,0.636745,0.888918
1496,0.6849,0.618031,0.892132
1683,0.6642,0.623056,0.893175
1870,0.6412,0.617276,0.893439


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,‚ñÅ‚ñÑ‚ñÖ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
eval/loss,‚ñà‚ñÖ‚ñÑ‚ñÉ‚ñÑ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñÇ‚ñÅ‚ñÑ‚ñÉ‚ñÇ‚ñÉ‚ñÅ‚ñÑ‚ñà‚ñÇ
eval/samples_per_second,‚ñá‚ñà‚ñÖ‚ñÜ‚ñá‚ñÜ‚ñà‚ñÖ‚ñÅ‚ñÜ
eval/steps_per_second,‚ñá‚ñà‚ñÖ‚ñÜ‚ñá‚ñÜ‚ñà‚ñÖ‚ñÅ‚ñÜ
memory_usage,‚ñÅ
time,‚ñÅ
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÜ‚ñá‚ñÖ‚ñÇ‚ñÖ‚ñá‚ñÖ‚ñÑ‚ñÜ‚ñÉ‚ñÑ‚ñÜ‚ñÖ‚ñá‚ñÜ‚ñá‚ñÑ‚ñÉ‚ñá‚ñÑ‚ñÉ‚ñÑ‚ñÜ‚ñÇ‚ñÑ‚ñà‚ñà‚ñá‚ñÑ‚ñÅ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÑ‚ñÑ‚ñÖ‚ñÑ

0,1
eval/accuracy,0.89344
eval/loss,0.61728
eval/runtime,200.8867
eval/samples_per_second,25.91
eval/steps_per_second,25.91
memory_usage,2216.18359
time,1723333328.5903
total_flos,4408958269440000.0
train/epoch,0.96055
train/global_step,1875.0


In [None]:
!pip install huggingface.huk > /dev/null 2>&1
from huggingface_hub import notebook_login
notebook_login()
trainer.push_to_hub("QLoRA")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

NameError: name 'trainer' is not defined