In [1]:
pip install -U transformers datasets accelerate peft trl bitsandbytes wandb -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install bitsandbytes -q

In [3]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import transformers
import datasets
import torch
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments

print("Transformers version:", transformers.__version__)
print("Datasets version:", datasets.__version__)
print("PyTorch version:", torch.__version__)
print("WandB version:", wandb.__version__)

Transformers version: 4.44.2
Datasets version: 3.0.0
PyTorch version: 2.4.0
WandB version: 0.18.1


In [5]:
import os

#Store the Hugging Face token in environment
os.environ["HF_TOKEN"] = "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"

In [6]:
from huggingface_hub import login
login(token=os.getenv("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
#Preparing workspace
os.makedirs('models', exist_ok=True)
os.makedirs('output', exist_ok=True)
os.makedirs('logs', exist_ok=True)
print("Workspace directories are set up.")

Workspace directories are set up.


In [8]:
#Testing Configuration by loading dummy data point in W&B

# Initialize a WandB run
wandb.init(project='llama3-finetuning', entity='rafayahmad-addo-ai')

# Log a test data point
wandb.log({'test': 1})

# Finish the run
wandb.finish()
print("W&B setup test completed successfully.")

[34m[1mwandb[0m: Currently logged in as: [33mrafayahmad[0m ([33mrafayahmad-addo-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test,▁

0,1
test,1


W&B setup test completed successfully.


In [9]:
#Loading and splitting dataset 
from datasets import load_dataset
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")
train_test_split = dataset['train'].train_test_split(test_size=0.1, shuffle=True, seed = 42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

In [10]:
#Inspect the dataset
print("Dataset structure:", dataset)
print("Sample entry:", dataset['train'][0])

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 16407
    })
})
Sample entry: {'qtype': 'susceptibility', 'Question': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?', 'Answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.'}


In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [12]:
#Add a custom padding token
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

In [13]:
#Data Preprocessing (cleaning text, tokenizing, and converting entries to a format suitable for model training)

def preprocess_data(example):
    # Tokenize the conversations
    return tokenizer(example['qtype'], truncation=True, padding="max_length", max_length=512)

# Apply the preprocessing function to all entries in the dataset
dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

# Model and Tokenizer Configuration

In [14]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset_name = "keivalya/MedQuad-MedicalQnADataset"

In [15]:
#We will use the transformers library to load the pre-trained LLaMA 3 model and its tokenizer.
#This is a critical step to ensure that the model understands the format of the input data

# from transformers import AutoModelForCausalLM, AutoTokenizer

# # Specify the model ID from Hugging Face Model Hub
# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_name)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)
model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=quant_config,torch_dtype="float16", device_map="auto")
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

**Configure Tokenizer for Special Tokens:**

Special tokens may be necessary for specific tasks, such as separating dialog turns or indicating the start of a response:

In [16]:
# Add or modify special tokens
special_tokens_dict = {'additional_special_tokens': ['[USR]', '[SYS]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))  # Important: resize model token embeddings

Embedding(128258, 4096)

**Check Model and Tokenizer Compatibility:**

Ensure that the model and tokenizer are aligned in terms of vocabulary size and token types, and test them with a simple encoding and decoding task:

In [17]:
# Test encoding and decoding to ensure alignment
sample_text = "Hello, how can I assist you today?"
encoded_input = tokenizer(sample_text, return_tensors='pt')
decoded_output = tokenizer.decode(encoded_input['input_ids'][0])

print("Encoded:", encoded_input)
print("Decoded:", decoded_output)

Encoded: {'input_ids': tensor([[128000,   9906,     11,   1268,    649,    358,   7945,    499,   3432,
             30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded: <|begin_of_text|>Hello, how can I assist you today?


# ORPO Configuration and Training

In [18]:
#Define ORPO 
#The ORPO configuration involves setting parameters that control the training process, 
#including learning rates, batch sizes, and the preference alignment ratio
from trl import ORPOConfig, ORPOTrainer

orpo_config = ORPOConfig(
    output_dir='./output',  # Specify the output directory for saving models and checkpoints
    learning_rate=8e-6,
    beta=0.1,  # The lambda parameter for preference optimization
    max_length=512,  # Maximum sequence length
    per_device_train_batch_size=2,
    num_train_epochs=1,  # For demonstration, a small number of epochs
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10
)


In [19]:
print(train_dataset[0])


{'qtype': 'treatment', 'Question': 'What are the treatments for Lennox-Gastaut syndrome ?', 'Answer': 'These resources address the diagnosis or management of Lennox-Gastaut syndrome:  - Cleveland Clinic  - Genetic Testing Registry: Epileptic encephalopathy Lennox-Gastaut type  - National Institute of Neurological Disorders and Stroke: Diagnosis and Treatment of Epilepsy  - News Release: FDA Approves New Drug to Treat Severe Form of Epilepsy (U.S. Food and Drug Administration, November 20, 2008)   These resources from MedlinePlus offer information about the diagnosis and management of various health conditions:  - Diagnostic Tests  - Drug Therapy  - Surgery and Rehabilitation  - Genetic Counseling   - Palliative Care'}


In [20]:
# Initialize ORPO Trainer:

# The ORPO trainer handles the fine-tuning process by applying the ORPO algorithm to update the model’s weights based on the dataset
# and the defined configuration:

In [21]:
# trainer = ORPOTrainer(
#     model=model,
#     args=orpo_config,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
# )


In [22]:
llama_prompt = """You are a medical assistance. Your task is to answer medical related queries.

### Query:
{}

### Answer:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

In [23]:
def formatting_prompts_func(examples):
    questions = examples["Question"]
    answers = examples["Answer"]
    texts = []
    for question, answer in zip(questions, answers):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = llama_prompt.format(question, answer) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

train_dataset = train_dataset.map(formatting_prompts_func, batched = True)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/14766 [00:00<?, ? examples/s]

Map:   0%|          | 0/1641 [00:00<?, ? examples/s]

In [24]:
# Initialize a WandB run
wandb.init(project='llama3-finetuning', entity='rafayahmad-addo-ai')

## PEFT Fine tuning Configuration

In [25]:
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer,SFTConfig
output_dir="llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['down_proj', 'gate_proj', 'o_proj', 'v_proj', 'up_proj', 'q_proj', 'k_proj'],
)

training_arguments = SFTConfig(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=2,            # batch size per device during training
    gradient_accumulation_steps=4,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="adamw_8bit",
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper                        
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    logging_steps = 1,
    max_steps=50,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
)



trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/14766 [00:00<?, ? examples/s]

Map:   0%|          | 0/1641 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [26]:
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

In [27]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.5882
2,1.8958
3,1.6478
4,1.8228
5,1.5649
6,1.272
7,1.3398
8,1.503
9,1.2279
10,1.3323




TrainOutput(global_step=50, training_loss=1.1542865872383117, metrics={'train_runtime': 1371.8686, 'train_samples_per_second': 0.292, 'train_steps_per_second': 0.036, 'total_flos': 7367680759971840.0, 'train_loss': 1.1542865872383117, 'epoch': 0.027089259108763374})

In [None]:
#stop reporting to wandb
wandb.finish()
# save model
trainer.save_model("model")
tokenizer.save_pretrained("model")

### evaluate the model

In [29]:
from transformers import EvalPrediction
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='weighted')
    }

In [None]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

### Visualize the results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example: Plotting accuracy
plt.figure(figsize=(10, 5))
sns.lineplot(data=evaluation_results['eval_accuracy'], label='Test Accuracy')
plt.title('Model Accuracy over Epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# Analyze and Interpret results
sample_outputs = tokenizer.batch_decode(trainer.predict(test_dataset.sample(5)).predictions, skip_special_tokens=True)
for output in sample_outputs:
    print("Model Output:", output)

### Pushing model into Hugging Face Hub

In [None]:
from transformers import Trainer

# Login to Hugging Face
from huggingface_hub import notebook_login
notebook_login()

# Push to hub
trainer.push_to_hub("YourFineTunedModel", use_auth_token=True)
tokenizer.push_to_hub("YourFineTunedModel", use_auth_token=True)

#### Integrate the Model into an Application:

Provide guidance on how to integrate the model into a live environment or application. This might include API usage examples or embedding the model within a web service:

In [None]:
from transformers import pipeline

# Load the model from the hub
model_pipeline = pipeline("text-generation", model="YourUserName/YourFineTunedModel")

# Example usage
user_input = "I feel stressed and overwhelmed."
response = model_pipeline(user_input)[0]['generated_text']
print("Model Response:", response)