In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
!pip install info-nce-pytorch
!pip -q uninstall transformers -y
!pip -q install transformers==4.47.1

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# hf_token = user_secrets.get_secret("huggingface")
import torch
from transformers import AdamW, get_scheduler, TrainingArguments
from transformers.modeling_outputs import ModelOutput
from tqdm import tqdm
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from info_nce import InfoNCE
import torch.nn as nn
from unsloth import FastLanguageModel
import torch
import pandas as pd

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
print(torch.cuda.device_count())
print(torch.cuda.current_device())
for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(i))

1
0
Tesla T4


In [4]:
class ModelWrapper(nn.Module):
    def __init__(self, model, pooling_method="cls"):
        super().__init__()
        self.model = model
        self.pooling_method = pooling_method
        self.loss_fn = InfoNCE()

    def forward_multiple(self, query_input_ids, query_attention_mask, 
             passage_input_ids, passage_attention_mask, **kwargs):
        query_embeddings = self(
            input_ids=query_input_ids,
            attention_mask=query_attention_mask).embeddings
        passage_embeddings = self(
            input_ids=passage_input_ids,
            attention_mask=passage_attention_mask).embeddings
        loss = self.loss_fn(query_embeddings, passage_embeddings)
        return ModelOutput(
            query_embeddings=query_embeddings,
            passage_embeddings=passage_embeddings,
            loss=loss)

    def forward(self, input_ids, attention_mask, **kwargs) -> ModelOutput:
        kwargs["output_hidden_states"] = True
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = outputs.hidden_states[-1]
        if self.pooling_method == "cls":
            indices = attention_mask.sum(dim=-1)-1 # Last non zero element per sentence in batch
            indices = indices.unsqueeze(-1).expand(-1, hidden_states.size(-1))  # Shape: (batch_size, hidden_size)
            embeddings = hidden_states.gather(1, indices.unsqueeze(1)).squeeze(1)  # Shape: (batch_size, hidden_size)
        elif self.pooling_method == "mean":
            mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
            embeddings = torch.sum(hidden_states * mask, dim=1) / torch.clamp(mask.sum(dim=1), min=1e-9)
        return ModelOutput(embeddings=embeddings)

    def __getattr__(self, name: str):
        """Forward missing attributes to the wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.model, name)

In [5]:
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
use_bfloat16 = is_bfloat16_supported()

def load_embeddings_model(model_name):
    !mkdir my_model
    !cp -r /kaggle/input/gemma-cls-embedding/pytorch/cls-pooling/1/tokenizer/* my_model
    !cp -r /kaggle/input/gemma-cls-embedding/pytorch/cls-pooling/1/{model_name}/* my_model
    _model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "my_model",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    # FastLanguageModel.for_inference(_model)
    model = ModelWrapper(_model, pooling_method="cls")
    return model, tokenizer

def load_language_model():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/gemma-7b-it-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer
    # google/gemma-1.1-7b-it

In [6]:
model_name = "model_3999"
emb_model, emb_tokenizer = load_embeddings_model(model_name)
lang_model, lang_tokenizer = load_language_model()

  pid, fd = os.forkpty()


==((====))==  Unsloth 2025.1.6: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unsloth 2025.1.6 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


==((====))==  Unsloth 2025.1.6: Fast Gemma patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [7]:
EOS_TOKEN = emb_tokenizer.eos_token
passage_col = 'passage'
query_col = 'query'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 2
max_seq_length = max_seq_length

def query_to_text(query):
    return "Query: " + query + EOS_TOKEN

def tokenize(text):
    """Dodaje stupce input_ids i attention_mask tako da tokenizira stupac text"""
    global emb_tokenizer
    tokenized = emb_tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_seq_length,
        return_tensors="pt")
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    return input_ids, attention_mask

def build_rag_prompt(query, passage):
    prompt = f"""<start_of_turn>user
Ti si koristan AI asistent. Tvoj zadatak je odgovarati na pitanja na hrvatskom jeziku. U nastavku ti navodim jedan odlomak:
{passage}
Moje pitanje je:
{query}<end_of_turn>
<start_of_turn>model"""
    return prompt

def build_normal_prompt(query):
    prompt = f"""<start_of_turn>user
Ti si koristan AI asistent. Tvoj zadatak je odgovarati na pitanja na hrvatskom jeziku.
Moje pitanje je:
{query}<end_of_turn>
<start_of_turn>model"""
    return prompt

In [8]:
dataset_path = "/kaggle/input/wiki-and-ms-marco"
test_df = pd.read_csv(dataset_path + "/test.csv")
passage_embeddings = torch.load(dataset_path + "/passage_embeddings.pt", weights_only=True, map_location="cpu")
test_df.head()

Unnamed: 0,query,passage
0,koliko je star dr.nowzaradan,Dr. Nowzaradan Plan prehrane.1 muškarci.2 treb...
1,Kako ukloniti anketu,Da biste izbrisali anketu: 1 Idite na stranicu...
2,Kako bih mogao razumjeti Dolića Draga?,"Dolića Draga je naselje u općini Lokvičići, u ..."
3,Što MFD znači za pisač,Značenje kanonskog pisača za upozorenje svjetl...
4,Koji su organi uključeni u sustav izlučivanja,Organi i njihove funkcije.1 Bubrezi otpadu iz ...


In [9]:
def retrieve_passage(text):
    global emb_model, passage_embeddings, test_df
    with torch.amp.autocast(device.type, dtype=torch.bfloat16 if use_bfloat16 else torch.float16):
        with torch.no_grad():
            query_prompt = query_to_text(text)
            query_input_ids, query_attention_mask = tokenize([query_prompt])
            query_embedding = emb_model(
                input_ids=query_input_ids,
                attention_mask=query_attention_mask).embeddings
    sim_scores = torch.matmul(passage_embeddings, query_embedding.T)
    passage_ind = int(torch.argmax(sim_scores))
    passage = test_df.iloc[passage_ind]["passage"]
    return passage

In [10]:
def get_rag_output(query):
    passage = retrieve_passage(query)
    prompt = build_rag_prompt(query, passage)
    inputs = lang_tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = lang_model.generate(**inputs, max_new_tokens = 300)
    return lang_tokenizer.batch_decode(outputs)[0]

def get_normal_output(query):
    prompt = build_normal_prompt(query)
    inputs = lang_tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = lang_model.generate(**inputs, max_new_tokens = 300)
    return lang_tokenizer.batch_decode(outputs)[0]

In [11]:
queries = [
    "Što je to Hallux?", # 1982
    "Kako izbrisati anketu?", # 1
    "Za što se koriste superračunala?", #
    "Koji je posao veznjaka u nogometu?",
    "Koliko dugo trebam kuhati krumpir?", # 1011
    "Primjer teme Lelemi jezici i njezina važnost.", # 1013
    "Kako pokrenuti uspješnu grupu za podršku dijabetesu?", # 1067
    "Koje je simboličko značenje bikova?", # 1065
    "Što je fiksacija dušika", # 1062
    "Navedi sve karakteristike morskog puža", # 1068
    "Koji je najveći planet u Sunčevom sustavu i koliki mu je promjer?",
    "Tko je napisao knjigu Na Drini ćuprija i koja je godina izdavanja?",
    "Koje je godine održana prva moderna Olimpijada i gdje se održala?",
    "Koji je glavni grad Brazila i kada je postao glavni grad?",
    "Objasni evoluciju homo sapiensa od ranijih hominida.",
    "Kako funkcionira fotosinteza u biljkama i koji su njezini osnovni proizvodi?",
    "Tko je bio Nikola Tesla i koje je najvažnije otkriće pripisano njemu?",
    "Objasni kako radi unutarnji mehanizam klasičnog sata s utegom.",
]

for query in queries:
    print("================ Pitanje ==============")
    print(query)
    print()
    print("================ Odgovor ===============")
    print(get_normal_output(query))
    print()
    print("================ Odgovor RAG ===========")
    print(get_rag_output(query))
    print()

Što je to Hallux?

<bos><start_of_turn>user
Ti si koristan AI asistent. Tvoj zadatak je odgovarati na pitanja na hrvatskom jeziku.
Moje pitanje je:
Što je to Hallux?<end_of_turn>
<start_of_turn>model**Odgovor:**

Hallux je prvo pralo na čovjekovom nogu. To je najčešći tip deformirane noge, koji se može pojaviti kod ljudi s širokim krugom ili obratno. Hallux se može opisati kao pretvorba noge u čavdar, što može biti bolno i funkcionalno nepovoljano.<eos>

<bos><start_of_turn>user
Ti si koristan AI asistent. Tvoj zadatak je odgovarati na pitanja na hrvatskom jeziku. U nastavku ti navodim jedan odlomak:
Bunions su progresivni poremećaj.Započinju naginjanjem velikog nožnog prsta, postupno mijenjajući kut kostiju tijekom godina i polako stvarajući karakteristični udarac, koji postaje sve istaknutiji.Simptomi se obično pojavljuju u kasnijim fazama, iako neki ljudi nikada nemaju simptome. Definicija Halluxa.plural.haluce.Igra \ ˈha-lə-ˌsēz, ˈhal-yə- \.: Unutarnja znamenka (kao veliki nožni pr