This notebook was written by following the Hugging Face tutorial [hugging face - fine tune mistral](https://huggingface.co/blog/sirluk/multilabel-llm).

In [1]:
!pip install scikit-multilearn
!pip install datasets
!pip install peft
!pip install bitsandbytes
!pip install accelerate
!pip install wandb

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting m

In [2]:
# Ends the current kernel session.
# NOTE(review): presumably here so the runtime restarts and picks up the packages
# installed in the previous cell - confirm it is still needed.
exit()

In [5]:
import os
import random
import functools
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)



In [4]:
import wandb
# Interactive Weights & Biases login (prompts for an API key).
wandb.login()

# Uncomment to let W&B log every trained model:
# %env WANDB_LOG_MODEL=true

# NOTE(review): this name is not referenced in the rest of the notebook; the training
# function passes project="lp2" to wandb.init - confirm which project is intended.
wandb_project_name = "LLP2-test"

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [1]:

from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget.
# NOTE(review): presumably required for `trainer.push_to_hub()` in a later cell - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Method and Model Configuration
entity = "rstern"  # W&B entity passed to wandb.init
retrain_from_checkpoint = False  # True: load the model from a local checkpoint instead of taking it from model_dict
model_config = "frozen_model_1" # possible values: no_freezing_model, frozen_model_1, frozen_model_2
author_label_only = True  # True: single label (label_author); False: two labels [label_author, label_dataset]
augmented_data = True  # True: read balanced_train.csv / balanced_val.csv; False: df_train.csv / df_validation.csv
experiment_name = "full_aug_b"  # W&B run name passed to wandb.init
debugg = True  # True: build the dataset from the first 50 examples of each split only

## Load Dataset

In [7]:
# Detect Google Colab: the import only succeeds inside a Colab runtime.
# Catch ImportError specifically so unrelated failures are not silently swallowed.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  from google.colab import drive
  import sys
  drive.mount('/content/drive')
  # sys.path.append('/content/drive/MyDrive/ucph/LP Project') # If working in collab change this path
  path = '/content/drive/MyDrive/ucph/LP Project/'
else:
  # Outside Colab the CSV files are read from the working directory; previously
  # train_df / val_df were never defined in this case. Adjust to the local data folder.
  path = './'

# Augmented (balanced) data or the original splits, depending on the config flag.
if augmented_data:
  train_df = pd.read_csv(f'{path}balanced_train.csv')
  val_df = pd.read_csv(f'{path}balanced_val.csv')
else:
  train_df = pd.read_csv(f'{path}df_train.csv')
  val_df = pd.read_csv(f'{path}df_validation.csv')

# shuffle dataset (fixed seed so the order is reproducible)
train_df = train_df.sample(frac=1, random_state=42)
val_df = val_df.sample(frac=1, random_state=42)
print(train_df.sample(5))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
        Unnamed: 0                                         paragraph1  \
8176          8176  I agree. At this point there’s no way to bring...   
76779        76779  When death is the punishment for peaceful acti...   
7208          7208  "In an article for the Columbia Journalism Rev...   
123796      123796  Fun fact! The purchase, possession, and consum...   
93840        93840  My question is this: How is it possible to cas...   

                                               paragraph2  label_author  \
8176    r/politics is currently accepting new moderato...             1   
76779   Once they start imprisoning or executing the f...             1   
7208    This has dramatically increased American influ...             1   
123796  i am for legalization and i have enjoyed weed ...             0   
93840   That's the thing. If we were to just give up, ...

In [8]:
print(train_df.columns)

def create_sequences(row):
  """Return the model input for one row: paragraph1 and paragraph2 joined by the "[LP2]" marker."""
  first, second = str(row["paragraph1"]), str(row["paragraph2"])
  return "[LP2]".join((first, second))

# Add the concatenated "input" text to both splits ...
train_df["input"], val_df["input"] = (
  train_df.apply(create_sequences, axis=1),
  val_df.apply(create_sequences, axis=1),
)

# ... and expose the texts as plain arrays.
x_train, x_val = train_df["input"].values, val_df["input"].values

def create_multilabel(row):
  """Return np.array([label_author, label_dataset]) for one row, with label_dataset clamped into [0, 1]."""
  capped = min(1, row["label_dataset"])
  dataset_flag = max(capped, 0)
  return np.array([int(row["label_author"]), int(dataset_flag)])

def create_singlelabel(row):
  """Return a one-element array holding the row's label_author cast to int."""
  author_flag = int(row["label_author"])
  return np.array([author_flag])

# Build the per-row label arrays and the matching per-label loss weights.
if author_label_only:
  label_weights = [1]
  train_df["label"], val_df["label"] = (
    train_df.apply(create_singlelabel, axis=1),
    val_df.apply(create_singlelabel, axis=1),
  )
else:
  # weight author label heavier than topic change label
  label_weights = [2,1]
  train_df["label"], val_df["label"] = (
    train_df.apply(create_multilabel, axis=1),
    val_df.apply(create_multilabel, axis=1),
  )

# Stack the per-row arrays into 2-D label matrices of shape (n_rows, n_labels).
y_train = np.stack(train_df["label"].values)
y_val = np.stack(val_df["label"].values)


Index(['Unnamed: 0', 'paragraph1', 'paragraph2', 'label_author',
       'label_dataset', 'fileindex'],
      dtype='object')


In [9]:
# Sanity check: array shapes plus the first label and first input text of each split.
print(x_train.shape, y_train.shape, y_train[0])
print(x_train[0])
print(x_val.shape, y_val.shape, y_val[0])
print(x_val[0])

(125600,) (125600, 1) [1]
In 2020 the polling suggested that Trump and Biden would have a tight race (which they did, sort of) but also that the Democratic party would pick up something like five seats in the Senate and win a clear majority, instead we saw Republicans voting for Biden at the top of the ticket and voting for Republicans at the bottom of the ticket, and the Senate ended up with a 50/50 tie.[LP2]Hi! I actually just voted for Warnock about an hour ago. . My question: to what degree do you think the current statewide political landscape is directly attributable to Donald Trump turning off so many moderate Republican voters? I do not understand how the state went from having uniform Republican control statewide to having Senators like Warnock and Ossoff in such a short amount of time.
(28262,) (28262, 1) [0]
Well, I don't know much about accreditation to be fair, but federal grants come from a lot more places than just DOEd, and many of those sponsors have already included l

In [10]:
# First 50 examples of each split; these feed the dataset when `debugg` is set.
x_train_sub, y_train_sub = x_train[:50], y_train[:50]
x_val_sub, y_val_sub = x_val[:50], y_val[:50]


In [11]:
# In debug mode use the small subsets, otherwise the full splits.
ds = DatasetDict({
  'train': Dataset.from_dict({
    'text': x_train_sub if debugg else x_train,
    'labels': y_train_sub if debugg else y_train,
  }),
  'val': Dataset.from_dict({
    'text': x_val_sub if debugg else x_val,
    'labels': y_val_sub if debugg else y_val,
  }),
})


## Load the Model

In [12]:
# model name (Hugging Face Hub id of the base model)
model_name = 'mistralai/Mistral-7B-v0.1'

# quantization config (passed to from_pretrained when a model is loaded quantized)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights (double quantization)
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config (passed to get_peft_model)
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

#### FREEZING FUNCTIONS
def freeze_all_but_last(model):
  """Freeze every parameter of `model` except those of its classification head.

  The head is looked up as `model.classifier` first and, if that attribute does
  not exist, as `model.score` (the head name used by the Mistral
  sequence-classification model in transformers).

  Args:
    model: a torch module exposing `parameters()` and a head attribute.

  Raises:
    AttributeError: if the model has neither a `classifier` nor a `score` attribute.
  """
  for param in model.parameters():
    param.requires_grad = False  # Freeze all parameters

  # Not every architecture calls its head `classifier`; fall back to `score`.
  head = getattr(model, "classifier", None)
  if head is None:
    head = getattr(model, "score", None)
  if head is None:
    raise AttributeError(
      f"{type(model).__name__} has neither a 'classifier' nor a 'score' head to unfreeze"
    )

  for param in head.parameters():
    param.requires_grad = True   # Unfreeze only the last layer (classification head)

def freeze_specific_layers(model, layer_names_to_unfreeze):
  """Leave trainable only the parameters whose name contains one of the given substrings.

  Args:
    model: module exposing `named_parameters()`.
    layer_names_to_unfreeze: iterable of substrings matched against parameter names.
  """
  for param_name, param in model.named_parameters():
    keep_trainable = False
    for fragment in layer_names_to_unfreeze:
      if fragment in param_name:
        keep_trainable = True
        break
    # Matching parameters are unfrozen, everything else is frozen.
    param.requires_grad = keep_trainable

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    """Tokenize the 'text' entries of a batch and carry its 'labels' over unchanged."""
    texts = examples['text']
    encoded = tokenizer(texts)
    encoded['labels'] = examples['labels']
    return encoded


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

#####
# new tokens: the paragraph separator inserted by create_sequences
new_tokens = ["[LP2]"]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# use the EOS token for padding, then tokenize every split
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')


def _build_model(use_quantization, freeze_fn=None):
  """Load the base classifier, optionally freeze it, then wrap it for LoRA training.

  Args:
    use_quantization: if True, `quantization_config` is passed to from_pretrained.
    freeze_fn: optional callable applied to the freshly loaded model.

  Returns:
    The PEFT-wrapped model with pad token id set and embeddings resized to the tokenizer.
  """
  load_kwargs = {"num_labels": y_train.shape[1]}
  if use_quantization:
    load_kwargs["quantization_config"] = quantization_config
  built = AutoModelForSequenceClassification.from_pretrained(model_name, **load_kwargs)
  if freeze_fn is not None:
    # NOTE(review): the freeze helper runs BEFORE prepare_model_for_kbit_training and
    # get_peft_model (same order as before). Confirm those two calls do not reset
    # requires_grad, which would make the freezing ineffective.
    freeze_fn(built)
  built = prepare_model_for_kbit_training(built)
  built = get_peft_model(built, lora_config)
  built.config.pad_token_id = tokenizer.pad_token_id
  # add new, random embeddings for the new tokens
  built.resize_token_embeddings(len(tokenizer))
  return built


# One builder per experiment. Only the experiment selected by `model_config` is
# instantiated: loading three 7B models at once (two of them unquantized) wastes
# memory while only model_dict[model_config] is used afterwards.
_model_builders = {
  # Experiment 0: no freezing
  "no_freezing_model": lambda: _build_model(True),
  # Experiment 1: Freeze all except last layer
  # NOTE(review): experiments 1 and 2 are loaded without quantization_config - confirm this is intended.
  "frozen_model_1": lambda: _build_model(False, freeze_all_but_last),
  # Experiment 2: Freeze all except specific layers (replace with your desired layers)
  # NOTE(review): verify that "encoder.layer.12" matches parameter names of this model;
  # if nothing matches, every parameter is frozen.
  "frozen_model_2": lambda: _build_model(
    False, lambda m: freeze_specific_layers(m, ["encoder.layer.12"])  # Example: Unfreeze only layer 12
  ),
}

if model_config not in _model_builders:
  raise ValueError(
    f"Unknown model_config {model_config!r}; expected one of {sorted(_model_builders)}"
  )

# The three names stay defined so later cells keep working; the experiments that
# were not selected are None. Re-run this cell after changing `model_config`.
_built_models = {
  name: (build() if name == model_config else None)
  for name, build in _model_builders.items()
}
no_freezing_model = _built_models["no_freezing_model"]
frozen_model_1 = _built_models["frozen_model_1"]
frozen_model_2 = _built_models["frozen_model_2"]



In [None]:
# Lookup table from the `model_config` setting to the model prepared for it.
model_dict = dict(
  no_freezing_model=no_freezing_model,
  frozen_model_1=frozen_model_1,
  frozen_model_2=frozen_model_2,
)

In [None]:
# Choose the starting point: reload from a local checkpoint, or take the freshly prepared model.
if retrain_from_checkpoint:
  # NOTE(review): `model` is not assigned anywhere before this line in the visible notebook,
  # so on a fresh kernel this branch would raise NameError - confirm what should be loaded.
  # NOTE(review): `model_name` is the hub id defined in the model-configuration cell, whereas
  # train_and_evaluate is called with `model_config` and writes its checkpoints to
  # multilabel_classification/<model_config> - verify this path.
  model = model.from_pretrained(f'multilabel_classification/{model_name}')
else:
  model = model_dict[model_config]


In [None]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    """Pad a list of tokenized examples into one batch of tensors.

    'input_ids' are padded with the tokenizer's pad token id, 'attention_mask' with 0,
    and 'labels' are stacked. Any other keys of the examples are dropped.
    """
    fill_values = {'input_ids': tokenizer.pad_token_id, 'attention_mask': 0}
    collated = {}
    for key, fill in fill_values.items():
        sequences = [example[key] for example in batch]
        collated[key] = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=fill
        )
    collated['labels'] = torch.stack([example['labels'] for example in batch])
    return collated

# define which metrics to compute for evaluation
def compute_metrics(p):
    """Compute micro, macro and weighted F1 from a (predictions, labels) pair.

    Predictions are binarised with `> 0` before scoring.
    """
    scores, labels = p
    predicted = scores > 0
    return {
        f'f1_{average}': f1_score(labels, predicted, average = average)
        for average in ('micro', 'macro', 'weighted')
    }


# create custom trainer class to be able to pass label weights and calculate multilabel loss
class CustomTrainer(Trainer):
    """Trainer that optimises a binary-cross-entropy-with-logits loss with per-label weights.

    Args:
        label_weights: tensor with one entry per label; it is passed as `pos_weight`
            to `F.binary_cross_entropy_with_logits`.
        **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted multi-label loss.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: if True, return `(loss, outputs)` instead of just the loss.
            num_items_in_batch: accepted (and ignored) so the override also works with
                transformers releases whose Trainer passes this argument.
        """
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # pos_weight must be a floating-point tensor on the same device as the logits;
        # the configured weights may have been created from Python ints.
        pos_weight = self.label_weights.to(device=logits.device, dtype=torch.float32)

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=pos_weight)
        return (loss, outputs) if return_outputs else loss


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable, as counts and as a percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# Define training function with freezing strategies
def train_and_evaluate(model, model_name, frozen=False):
    """Fine-tune `model` with CustomTrainer, log to W&B and save model and tokenizer.

    Uses the notebook-level `entity`, `experiment_name`, `tokenized_ds`, `tokenizer`
    and `label_weights`.

    Args:
        model: the model to train.
        model_name: label used to build the output, logging and save directories.
        frozen: if True the learning rate is 1e-5, otherwise 1e-4.

    Returns:
        The CustomTrainer instance after training, so it can be used afterwards
        for evaluation, saving or pushing to the Hub.
    """
    # The W&B run name comes from `experiment_name`.
    wandb.init(entity=entity, project="lp2", name=experiment_name)

    # define training args with potentially different learning rates for frozen models
    training_args = TrainingArguments(
        output_dir=f'multilabel_classification/{model_name}',  # Separate output directory
        learning_rate=1e-5 if frozen else 1e-4,  # Adjust learning rate for frozen models (optional)
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=10,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        # load_best_model_at_end requires the save strategy to match the evaluation
        # strategy, so checkpoints are written once per epoch. The former step-based
        # setting also passed `save_steps` twice, which is a syntax error.
        save_strategy='epoch',
        load_best_model_at_end=True,
        logging_steps=25,              # Report the loss every 25 steps
        logging_dir=f"./logs/{model_name}",  # Separate log directory
        report_to=["wandb"],
    )

    # train
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds['train'],
        eval_dataset=tokenized_ds['val'],
        tokenizer=tokenizer,
        data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        # float weights: they are used as pos_weight in a floating-point loss
        label_weights=torch.tensor(label_weights, dtype=torch.float32, device=model.device)
    )

    trainer.train()

    # save model and tokenizer separately
    model_save_path = f"multilabel_mistral_{model_name}"
    trainer.model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    print(f"Model and tokenizer saved to {model_save_path}")

    return trainer



In [None]:

# Train the model selected via `model_config`, after reporting how many of its parameters are trainable.
print_trainable_parameters(model)
train_and_evaluate(model, model_config)  # model_config names the output, log and save directories


In [None]:
# Evaluate on the validation split, then save the model and trainer state locally
# and push to the Hub if training was successful.
# NOTE(review): `trainer` is only created as a local variable inside train_and_evaluate;
# no notebook-level `trainer` is assigned in the visible cells, so this cell would raise
# NameError as written - confirm where `trainer` is expected to come from.
test_results = trainer.evaluate(tokenized_ds['val'])
print(test_results)
trainer.save_model()
trainer.save_state()
trainer.push_to_hub()

In [None]:

# Train model with frozen last layer only
"""
frozen_model_1 = AutoModelForSequenceClassification.from_pretrained(
    frozen_model_1, num_labels=y_train.shape[1])
freeze_all_but_last(frozen_model_1)
print_trainable_parameters(frozen_model_1)
train_and_evaluate(frozen_model_1, model_config)
"""

In [None]:

# Train model with specific layer unfrozen (replace 12 with your desired layer)
"""frozen_model_2 = AutoModelForSequenceClassification.from_pretrained(
    frozen_model_2, num_labels=y_train.shape[1])
freeze_specific_layers(frozen_model_2, ["encoder.layer.12"])  # Example: Unfreeze only layer 12
print_trainable_parameters(frozen_model_2)
train_and_evaluate(frozen_model_2, model_config)
"""

In [None]:
# # define training args
# training_args = TrainingArguments(
#     output_dir = 'multilabel_classification',
#     learning_rate = 1e-4,
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     num_train_epochs = 10,
#     weight_decay = 0.01,
#     evaluation_strategy = 'epoch',
#     save_strategy = 'epoch',
#     load_best_model_at_end = True,
#     logging_steps=25,              # When to start reporting loss
#     logging_dir="./logs",        # Directory for storing logs
#     save_steps=25,                # Save checkpoints every 50 steps
#     report_to = ["wandb"],
# )

# # train
# trainer = CustomTrainer(
#     model = model,
#     args = training_args,
#     train_dataset = tokenized_ds['train'],
#     eval_dataset = tokenized_ds['val'],
#     tokenizer = tokenizer,
#     data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
#     compute_metrics = compute_metrics,
#     label_weights = torch.tensor(label_weights, device=model.device)
# )

# trainer.train()

# # save model
# peft_model_id = 'multilabel_mistral'
# trainer.model.save_pretrained(peft_model_id)
# tokenizer.save_pretrained(peft_model_id)

In [None]:
# load model
# peft_model_id = 'multilabel_mistral'
# model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)

In [None]:
wandb.finish()