# Set Up

In [1]:
!pip install transformers
!pip install datasets
!pip install openai
!pip install peft
!pip install ast

Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m 

In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from datasets import load_dataset, concatenate_datasets, ClassLabel
import json
import pandas as pd
from openai import OpenAI
import ast
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [2]:
!huggingface-cli login --token nevergonna


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `seniorproject` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `seniorproject`


In [3]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    cache_dir="base_models_7b",
    device_map='auto'
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", cache_dir="base_models_7b", padding_side="left")

In [65]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 1024
tokenizer.truncation = True 
tokenizer.padding = "max_length"

In [5]:
def get_response(prompt, max_new_tokens=2000):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, repetition_penalty = 1.18, temperature= 0.15)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# RareBench Zero-Shot Top-10 Recall 

In [None]:
rb_test = pd.read_csv('rarebench_data_test.csv')
correct = 0
zero_shot_predictions = []

for _, example in rb_test.iterrows():
    prompt = '''<s>[INST] <<SYS>>
            {{ You are a specialist in the field of rare diseases. You will be provided and
            asked about a complicated clinical case; read it carefully and then provide a diverse and
            comprehensive differential diagnosis. }}
            <</SYS>>

            {{ This rare disease patient suffers from symptoms: '''+ example["symptoms"] +'''. Enumerate the top 10 most likely diagnoses. Be precise,
            listing one diagnosis per line, and try to cover many unique possibilities (at least 10). The
            top 10 diagnoses are: }} [/INST]'''
    result = get_response(prompt)
    
    zero_shot_predictions.append({
        "symptoms": example["symptoms"],
        "diseases": example["diseases"],
        "predicted": result
    })    

    zero_shot_predictions = pd.DataFrame(zero_shot_predictions)

In [9]:

list_is_correct = []
correct = 0
client = OpenAI(api_key="urmom", base_url="https://api.deepseek.com")
for _, example in zero_shot_predictions.iterrows():
    is_correct = client.chat.completions.create(
        model="deepseek-reasoner",
        messages=[
            {"role": "user", "content": "For each disease in another language model's output" + example["predicted"] + ", check if it's listed in " + example["diseases"] + ". Print \"Yes\" or \"No\" only."}
        ],
        stream=False
    )
    is_correct = is_correct.choices[0].message.content
    if is_correct == "Yes":
        correct += 1
    list_is_correct.append({
        "correct": is_correct == "Yes"
    })

list_is_correct = pd.DataFrame(list_is_correct)

In [15]:
display(zero_shot_predictions)
zero_shot_predictions.to_csv("zero_shot_predictions.csv", index = False)

Unnamed: 0,0,1,2,4
0,"['Intellectual disability', 'Hypotonia', 'Glob...",['TBCK-related intellectual disability syndrom...,[INST] <<SYS>>\n {{ You are a speci...,False
1,"['Death in infancy', 'Aminoaciduria', 'Abnorma...",['3-Methylcrotonyl-CoA carboxylase 1 deficienc...,[INST] <<SYS>>\n {{ You are a speci...,False
2,"['Weight loss', 'Arthralgia', 'Elevated circul...","['Scleroderma, familial progressive', 'Systemi...",[INST] <<SYS>>\n {{ You are a speci...,False
3,"['Seizure', 'Spastic diplegia', 'Encephalopath...",['Glutaric acidemia type I; GA-I/Glutaryl-CoA ...,[INST] <<SYS>>\n {{ You are a speci...,True
4,"['Wide mouth', 'Epicanthus', 'Carious teeth', ...","['Hyperinsulinemic hypoglycemia, familial, 2',...",[INST] <<SYS>>\n {{ You are a speci...,False
...,...,...,...,...
415,"['Vesicoureteral reflux', 'Microretrognathia',...","['Mental retardation, autosomal dominant 42']",[INST] <<SYS>>\n {{ You are a speci...,False
416,"['Obesity', 'Death in infancy', 'Neutropenia',...",['Glycogen storage disease Ib/Glycogen storage...,[INST] <<SYS>>\n {{ You are a speci...,False
417,"['Eosinophilia', 'Lymphopenia', 'Increased cir...","['Candidiasis, familial chronic mucocutaneous,...",[INST] <<SYS>>\n {{ You are a speci...,False
418,"['Microcephaly', 'Ptosis', 'Myopia', 'Autism',...",['Intellectual developmental disorder with dys...,[INST] <<SYS>>\n {{ You are a speci...,False


In [10]:
zero_shot_predictions = pd.concat([zero_shot_predictions, list_is_correct], axis = 1, ignore_index = True)
zero_shot_predictions.to_csv("zero_shot_predictions.csv", index = False)

In [20]:
print(correct)
print(correct/len(rb_test))

50
0.11904761904761904


# Fine-Tuning #1

In [77]:
dataset = load_dataset("csv", data_files="final_train_data.csv")

In [78]:
dataset = dataset.map(lambda samples: {
    "input_ids": tokenizer(
        [str(q) for q in samples["question"]],
        padding="max_length",      
        truncation=True,
        max_length=1024,
        return_tensors="np"
    )["input_ids"],
    "labels": tokenizer(
        [str(a) for a in samples["answer"]],
        padding="max_length",      
        truncation=True,
        max_length=1024,
        return_tensors="np"
    )["input_ids"]
}, batched=True)

Map:   0%|          | 0/2061 [00:00<?, ? examples/s]

In [79]:
dataset = dataset.remove_columns(["question", "answer"])

In [80]:
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

print(train_dataset)
print(val_dataset)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1854
})
Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 207
})


In [81]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [82]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [83]:
config = LoraConfig(
    r=8, 
    lora_alpha=16, 
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 19988480 || all params: 6758404096 || trainable%: 0.2957573965106688


In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        warmup_steps=100,
        max_steps=2000,
        learning_rate=1e-4,
        fp16= False,
        bf16 = True,
        logging_steps=20,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_steps = 500,  
        eval_strategy = "steps",  
        eval_steps = 500,  
        save_total_limit = 2,  
        load_best_model_at_end = True,  
        metric_for_best_model = "loss", 
        greater_is_better = False 
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
