# Training other models (bertic and similar)

In [1]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("huggingface")

In [2]:
pooling_method="cls"

In [3]:
%%capture
!pip install info-nce-pytorch
!pip install -q transformers 
!pip install -q peft
!pip install -q evaluate

In [4]:
import torch
from transformers import AdamW, get_scheduler, TrainingArguments
from transformers.modeling_outputs import ModelOutput
from tqdm import tqdm
from info_nce import InfoNCE
import torch.nn as nn


class ModelWrapper(nn.Module):
    def __init__(self, model, pooling_method="cls"):
        super().__init__()
        self.model = model
        self.pooling_method = pooling_method
        self.loss_fn = InfoNCE()

    def forward_multiple(self, query_input_ids, query_attention_mask, 
             passage_input_ids, passage_attention_mask, **kwargs):
        query_embeddings = self(
            input_ids=query_input_ids,
            attention_mask=query_attention_mask).embeddings
        passage_embeddings = self(
            input_ids=passage_input_ids,
            attention_mask=passage_attention_mask).embeddings
        loss = self.loss_fn(query_embeddings, passage_embeddings)
        return ModelOutput(
            query_embeddings=query_embeddings,
            passage_embeddings=passage_embeddings,
            loss=loss)
        
    
    def forward(self, input_ids, attention_mask, **kwargs) -> ModelOutput:
        kwargs["output_hidden_states"] = True
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = outputs.hidden_states[-1]
        if self.pooling_method == "cls":
            indices = attention_mask.sum(dim=-1)-1 # Last non zero element per sentence in batch
            indices = indices.unsqueeze(-1).expand(-1, hidden_states.size(-1))  # Shape: (batch_size, hidden_size)
            embeddings = hidden_states.gather(1, indices.unsqueeze(1)).squeeze(1)  # Shape: (batch_size, hidden_size)
        elif self.pooling_method == "mean":
            mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
            embeddings = torch.sum(hidden_states * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)
        return ModelOutput(embeddings=embeddings)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)

In [5]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import BertForSequenceClassification
from peft import LoraConfig, TaskType
from peft import get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=16, lora_alpha=16, lora_dropout=0
)

def get_model():
    # model_name = "classla/bcms-bertic"
    model_name = 'bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    base_model = BertForSequenceClassification.from_pretrained(
        'bert-base-cased', 
        num_labels=2
    )
    model = get_peft_model(base_model, lora_config)
    model = ModelWrapper(model, pooling_method=pooling_method)
    return model, tokenizer

In [6]:
# Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 2
max_seq_length = 512  # Replace with your sequence length
seed = 3407

# Set random seed for reproducibility
torch.manual_seed(seed);
print(device)

cuda


In [7]:
model, tokenizer = get_model()
model.to(device);

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load dataset

In [8]:
import pandas as pd
import datasets
from torch.utils.data import DataLoader

EOS_TOKEN = tokenizer.cls_token
passage_col = 'passage'
query_col = 'query'

def to_text(examples):
    queries = ["Query: " + x + EOS_TOKEN for x in examples[query_col]]
    passages = ["Passage: " + x + EOS_TOKEN for x in examples[passage_col]]
    return {"query_text": queries, "passage_text": passages}

def tokenize(examples):
    """Dodaje stupce input_ids i attention_mask tako da tokenizira stupac text"""
    data = {}
    for col in examples.keys():
        if "text" in col:
            new_data = tokenizer(examples[col], truncation=True, padding='max_length', max_length=max_seq_length)
            data[col.replace("text", "input_ids")] = new_data["input_ids"]
            data[col.replace("text", "attention_mask")] = new_data["attention_mask"]
    return data

def load_dataset(df):
    ds = datasets.arrow_dataset.Dataset.from_pandas(df)
    ds = ds.map(to_text, batched = True)
    ds = ds.map(tokenize, batched = True)
    ds.set_format(type='torch', 
                  columns=['query_input_ids',
                           'query_attention_mask',
                           'passage_input_ids',
                           'passage_attention_mask'])
    return ds

dataset_path = "/kaggle/input/wiki-and-ms-marco"
train_df = pd.read_csv(dataset_path + "/train.csv")
test_df = pd.read_csv(dataset_path + "/test.csv")

train_ds = load_dataset(train_df)
test_ds = load_dataset(test_df)

train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

print(len(train_dataloader), len(test_dataloader), batch_size)

Map:   0%|          | 0/15879 [00:00<?, ? examples/s]

Map:   0%|          | 0/15879 [00:00<?, ? examples/s]

Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

7940 993 2


In [9]:
def mean_reciprocal_rank(cosine_sim_matrix, ground_truth_indices):
    num_queries = cosine_sim_matrix.size(0)
    reciprocal_ranks = []
    for i in range(num_queries):
        sorted_indices = torch.argsort(cosine_sim_matrix[i], descending=True)
        rank = (sorted_indices == ground_truth_indices[i]).nonzero(as_tuple=True)[0].item() + 1
        reciprocal_ranks.append(1 / rank)
    return sum(reciprocal_ranks) / num_queries

def hit_rate_at_1(cosine_sim_matrix, ground_truth_indices):
    top_1_indices = torch.argmax(cosine_sim_matrix, dim=1)  # Shape: (num_samples,)
    hits = (top_1_indices == ground_truth_indices).sum().item()
    return hits / ground_truth_indices.size(0)


class Looper:
    def __init__(self, model, **kwargs):
        self.model = model

    def loop(self, dataloader, num_steps, call, train=False, **kwargs):
        if num_steps == -1:
            num_steps = len(dataloader)
        assert len(dataloader) >= num_steps, "Dataloader is smaller than number of steps!"
        step = 0
        with tqdm(range(num_steps), leave=True, position=0,
                  desc="Training" if train else "Testing", unit="step") as progress_bar:
            for batch in dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.amp.autocast(device.type, dtype=torch.float16):
                    if train:
                        outputs = self.model.forward_multiple(**batch, **kwargs)
                    else:
                        with torch.no_grad():
                            outputs = model.forward_multiple(**batch, **kwargs)
                call(outputs=outputs, step=step, progress_bar=progress_bar)
                progress_bar.update(1)
                step += 1
                if step >= num_steps:
                    break


In [10]:
# Step je broj batcheva
gradient_accumulation_steps = 8
learning_rate = 2e-4
num_training_steps = len(train_dataloader)
num_val_steps = len(test_dataloader)
print("valsteps", num_val_steps)
val_every_steps = int(2000/batch_size)
warmup_steps = 10

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)
scaler = torch.amp.GradScaler(device.type, enabled=True)
looper = Looper(model)

def test_loop(dataloader, num_test_steps):
    query_embeddings = []
    passage_embeddings = []
    print()
    def test_callback(outputs, step, progress_bar):
        query_embeddings.append(outputs.query_embeddings)
        passage_embeddings.append(outputs.passage_embeddings)
    looper.loop(dataloader, num_test_steps, test_callback)
    query_embeddings = torch.concatenate(query_embeddings, dim=0) # po batch dimenziji
    passage_embeddings = torch.concatenate(passage_embeddings, dim=0)

    cosine_sim_matrix = torch.matmul(query_embeddings, passage_embeddings.T)
    ground_truth_indices = torch.arange(query_embeddings.shape[0], device=query_embeddings.device)
    mrr_score = mean_reciprocal_rank(cosine_sim_matrix, ground_truth_indices)
    hr_1 = hit_rate_at_1(cosine_sim_matrix, ground_truth_indices)
    print(f"MRR: {mrr_score:.4f} | Hit Rate @ 1: {hr_1:.4f}")

def train_loop():
    test_loop(test_dataloader, num_val_steps)
    def train_callback(outputs, step, progress_bar):
        loss = outputs.loss / gradient_accumulation_steps
        scaler.scale(loss).backward()
        if (step + 1) == 100 or (step + 1) == 200 or (step + 1) % 1000 == 0:
            torch.save(model.model.state_dict(), f"model_{step}.pth")
        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        progress_bar.set_postfix({'loss': loss.item()})
        # progress_bar.set_description(f"Training [loss={loss.item()}]")
        if (step + 1) % val_every_steps == 0:
            test_loop(test_dataloader, num_val_steps)
    looper.loop(train_dataloader, num_training_steps, train_callback, train=True)

train_loop()

valsteps 993



Testing: 100%|██████████| 993/993 [00:40<00:00, 24.54step/s]


MRR: 0.0233 | Hit Rate @ 1: 0.0116


Training:  13%|█▎        | 999/7940 [01:37<11:23, 10.16step/s, loss=0.0523]




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.60step/s]
Training:  13%|█▎        | 1002/7940 [02:21<9:15:13,  4.80s/step, loss=0.0308] 

MRR: 0.2149 | Hit Rate @ 1: 0.1577


Training:  25%|██▌       | 1999/7940 [03:59<09:39, 10.25step/s, loss=0.0162] 




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.58step/s]
Training:  25%|██▌       | 2002/7940 [04:44<8:50:14,  5.36s/step, loss=0.00643] 

MRR: 0.4204 | Hit Rate @ 1: 0.3456


Training:  38%|███▊      | 2999/7940 [06:21<08:01, 10.26step/s, loss=0.0291] 




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.57step/s]
Training:  38%|███▊      | 3002/7940 [07:06<6:35:23,  4.80s/step, loss=0.0123]

MRR: 0.5451 | Hit Rate @ 1: 0.4715


Training:  50%|█████     | 3999/7940 [08:44<06:25, 10.22step/s, loss=0.00346]




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.62step/s]
Training:  50%|█████     | 4002/7940 [09:28<5:14:56,  4.80s/step, loss=0.00447]

MRR: 0.5658 | Hit Rate @ 1: 0.4912


Training:  63%|██████▎   | 4999/7940 [11:06<04:45, 10.31step/s, loss=0.00182]




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.59step/s]
Training:  63%|██████▎   | 5002/7940 [11:50<3:55:18,  4.81s/step, loss=0.0145]

MRR: 0.5869 | Hit Rate @ 1: 0.5179


Training:  76%|███████▌  | 5999/7940 [13:28<03:06, 10.38step/s, loss=0.00119]




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.68step/s]
Training:  76%|███████▌  | 6002/7940 [14:12<2:34:28,  4.78s/step, loss=0.00598]

MRR: 0.6056 | Hit Rate @ 1: 0.5345


Training:  88%|████████▊ | 6999/7940 [15:48<01:29, 10.49step/s, loss=0.000741]




Testing: 100%|██████████| 993/993 [00:43<00:00, 22.67step/s]
Training:  88%|████████▊ | 7002/7940 [16:32<1:14:45,  4.78s/step, loss=0.0426]

MRR: 0.6240 | Hit Rate @ 1: 0.5557


Training: 100%|██████████| 7940/7940 [18:03<00:00,  7.33step/s, loss=0]


In [11]:
# Save model and tokenizer
torch.save(model.model.state_dict(), 'model_final.pth')
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)
10. [**NEW**] We make Llama-3 8b, 70b **2x faster**! See our [Llama-3 8b notebook](https://colab.research.google.com/drive/135ced7oHytdxu3N2DNe1Z0kqjyYIkDXp?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>