In [3]:
import os
import sys
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss, jaccard_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from peft import LoraConfig, get_peft_model

from transformers import AutoTokenizer, pipeline, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding, EvalPrediction

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from utils import utils


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/baran/.conda/envs/stackoverflow/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115.so
CUDA SETUP: CUDA runtime path found: /home/baran/.conda/envs/stackoverflow/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 115
CUDA SETUP: Loading binary /home/baran/.conda/envs/stackoverflow/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [9]:
# Configuration cell: paths, model choice, training hyper-parameters, LoRA
# settings and the per-GPU memory budget used when the model is loaded.
dataset_path = Path("../datasets")
dataset_path.mkdir(exist_ok=True)

model_name = "facebook/opt-6.7b"

MAX_LEN = 2048
train_batch_size = 8
eval_batch_size = 4
gradient_accumulation_steps = 4
learning_rate = 2e-4
epoch = 1
metric_name = "f1"

# LoRA adapters are attached to the attention query/value projections only.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    # The model in this notebook is loaded with AutoModelForSequenceClassification,
    # so the adapter has to be declared as a sequence-classification task.
    # "CAUSAL_LM" wrapped the classifier as a language model instead.
    # NOTE(review): per the PEFT docs SEQ_CLS also keeps the classification
    # head trainable - confirm against the installed peft version.
    task_type="SEQ_CLS",
)

# Free memory reported by torch for the current CUDA device, in whole GB.
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
# Leave 2 GB of headroom, but never request less than 1 GB: a "0GB" or
# negative budget string is not a usable limit.
per_gpu_budget = f"{max(free_in_GB - 2, 1)}GB"

# Same budget for every visible GPU; this dict is passed as `max_memory`
# when the model is loaded.
n_gpus = torch.cuda.device_count()
max_memory = {i: per_gpu_budget for i in range(n_gpus)}

In [10]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Iterates once over ``model.named_parameters()``, totals the element
    counts of all parameters and of those whose ``requires_grad`` flag is
    set, and prints both totals together with the trainable percentage.
    """
    # (element count, is trainable) for every parameter of the model
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
# Tokenizer matching the checkpoint named by `model_name`.
# `problem_type` is a model-config option (it is passed where the model itself
# is loaded); it has no effect on a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [11]:
# Load the cleaned dataset from `dataset_path`. Later cells read its
# "Tag", "BodyCleaned" and "TitleCleaned" columns.
df_full = pd.read_parquet(dataset_path/"cleaned_df.parquet")

In [12]:
# Turn the comma-separated "Tag" string of every row into a list of tag names.
tags = df_full["Tag"].map(lambda tag_string: tag_string.split(","))

# Fit a multi-label binarizer on the tag lists and keep one binary label
# vector per row in a new "labels" column.
binarizer = MultiLabelBinarizer()
labels = binarizer.fit_transform(tags)
df_full["labels"] = [label_row for label_row in labels]
df_full

Unnamed: 0,Tag,BodyCleaned,TitleCleaned,labels
0,"sql,asp.net",Has anyone got experience creating SQL-based A...,ASP.NET Site Maps,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"c#,.net",I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,c++,I am working on a collection of classes used f...,Should I use nested classes in this case?,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,.net,I've been writing a few web services for a .ne...,Homegrown consumption of web services,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,sql-server,I wonder how you guys manage deployment of a d...,Deploying SQL Server Databases from Test to Live,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
830491,javascript,"I'm trying to detect the ""flash out of date"" e...","YouTube iFrame API: no ready call, no error call","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
830492,python,I need to extend a shell script (bash). As I a...,How to execute multiline python code from a ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
830493,php,I am building a custom MVC project and I have ...,URL routing in PHP (MVC),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
830494,android,Under minifyEnabled I changed from false to tr...,Obfuscating code in android studio,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
# Split titles, bodies and labels in a single call so all three are
# partitioned by the same shuffle. The original ran two separate splits with
# the same seed and silently overwrote y_train / y_test with the second one.
(
    x_train_title, x_test_title,
    x_train_body, x_test_body,
    y_train, y_test,
) = train_test_split(
    df_full["TitleCleaned"],
    df_full["BodyCleaned"],
    df_full["labels"],
    test_size=0.1,
    random_state=0,
)

# Small evaluation subsample of the test bodies. The seed makes it
# reproducible across kernel restarts, and the size is capped so the cell
# also works when the test split has fewer than 1000 rows.
samples = x_test_body.sample(n=min(1000, len(x_test_body)), random_state=0)
# Explicit label-based lookup: `samples.index` holds df_full index labels.
samples_y = y_test.loc[samples.index]

In [14]:
def _as_dataset(texts, targets):
    """Build a ``utils.StackOverflowDS`` from a text Series and its label Series.

    Both Series get a fresh 0..n-1 index first; the dataset is created with
    the notebook's ``tokenizer`` and ``MAX_LEN``.
    """
    return utils.StackOverflowDS(
        texts.reset_index(drop=True),
        targets.reset_index(drop=True),
        tokenizer,
        MAX_LEN,
    )


# Train / test / small-sample datasets, all built from the question bodies.
dataset_train = _as_dataset(x_train_body, y_train)
dataset_test = _as_dataset(x_test_body, y_test)
dataset_sample = _as_dataset(samples, samples_y)

The model we are trying to use is very large and will not fit in my GPU memory. That is why we load it with 8-bit quantization and fine-tune it with LoRA, which keeps the memory footprint small and makes training faster.

In [1]:
# Label bookkeeping derived from the fitted binarizer: one output per tag.
num_labels = len(binarizer.classes_)
id2label = {idx: label for idx, label in enumerate(binarizer.classes_)}
label2id = {label: idx for idx, label in enumerate(binarizer.classes_)}

# Pads every batch to its longest sequence. `mlm` is an argument of
# DataCollatorForLanguageModeling, not of DataCollatorWithPadding; passing it
# here raised a TypeError, so it is removed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the checkpoint as a multi-label classifier in 8-bit, letting
# `device_map="auto"` place it within the per-GPU `max_memory` budget.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
    load_in_8bit=True,
    device_map='auto',
    max_memory=max_memory,
)

NameError: name 'binarizer' is not defined

In [2]:
### post-processing of the loaded model that is needed for the peft library
for weight in model.parameters():
    # freeze the base model - the adapters are trained later
    weight.requires_grad_(False)
    if weight.dim() == 1:
        # 1-D parameters (e.g. layernorm) are small; keep them in fp32 for stability
        weight.data = weight.data.float()

class CastOutputToFloat(nn.Sequential):
    """``nn.Sequential`` variant whose output is always returned as float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and cast the result to float32."""
        result = super().forward(x)
        return result.float()
# The model is loaded with AutoModelForSequenceClassification, whose output
# head is `score`; a sequence-classification OPT model has no `lm_head`, so
# the original assignment raised AttributeError. Wrap the classification head
# so its logits come out in fp32.
# The isinstance guard keeps the cell idempotent: re-running it does not wrap
# an already wrapped head a second time.
if not isinstance(model.score, CastOutputToFloat):
    model.score = CastOutputToFloat(model.score)

In [None]:
# Wrap the loaded model with the adapters described by `config` (the
# LoraConfig from the configuration cell) and rebind `model` to the result.
# NOTE(review): this rebinds `model` to its own wrapped version, so re-running
# the cell would wrap an already wrapped model - reload the base model first.
model = get_peft_model(model, config)
# Show how many parameters are trainable after wrapping, next to the total.
print_trainable_parameters(model)

In [None]:
# Checkpoint directory. `model_path` was referenced here but never defined in
# the notebook, so it is defined next to its only use. It lives outside the
# working directory so no local folder named like the hub id (`model_name`)
# is created.
model_path = Path("../models") / model_name

# The original call passed the output directory twice - positionally as
# f"{model_name}" and again as output_dir=... - which raises
# "TypeError: got multiple values for argument 'output_dir'".
args = TrainingArguments(
    output_dir=str(model_path),
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=epoch,
    weight_decay=0.01,
    warmup_steps=100,
    # NOTE(review): per the TrainingArguments docs a positive max_steps
    # overrides num_train_epochs - confirm the 200-step cap is intended.
    max_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    fp16=True,
    logging_steps=1,
)

# NOTE(review): `compute_metrics` is not defined in any cell shown above; it
# has to be defined (and return a value under the `metric_name` key) before
# this cell runs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    # eval_dataset=dataset_sample,  # smaller sample for quick evaluation runs
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)