## Datasets Pre-processing

In [3]:
from datasets import load_dataset
# Load Job description dataset
data_jd = load_dataset("csv", data_files="2_formatted_jd_dataset.csv")

# Drop rows with missing input_text or target_text
dataset_jd = data_jd.filter(lambda x: x["input_text"] is not None and x["target_text"] is not None)

# Split into train/val (80/20 split)
dataset_jd_split = dataset_jd["train"].train_test_split(test_size=0.2, seed=42)

In [1]:
from datasets import load_dataset
# Load career HF dataset 
data_career = load_dataset("csv", data_files="Career Dataset from HF.csv")

# Drop rows with missing input_text or target_text
dataset_career = data_career.filter(lambda x: x["question"] is not None and x["answer"] is not None)

dataset_career = dataset_career.rename_columns({
    "question": "input_text",
    "answer": "target_text"
})
dataset_career = dataset_career.remove_columns("role")
# Split into train/val (80/20 split)
dataset_career_split = dataset_career["train"].train_test_split(test_size=0.2, seed=42)

In [2]:
# Load resume dataset 
data_resume = load_dataset("csv", data_files="2_formatted_resume_dataset.csv")

# Drop rows with missing input_text or target_text
dataset_resume = data_resume.filter(lambda x: x["input_text"] is not None and x["target_text"] is not None)

# Split into train/val (80/20 split)
dataset_resume_split = dataset_resume["train"].train_test_split(test_size=0.2, seed=42)

In [21]:
print(dataset_resume_split)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 518
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 130
    })
})


In [3]:
from datasets import concatenate_datasets, DatasetDict

# Join the train split
train = concatenate_datasets([
    #dataset_jd_split["train"],
    dataset_career_split["train"],
    dataset_resume_split["train"]
])

# Join the test split
test = concatenate_datasets([
    #dataset_jd_split["test"],
    dataset_career_split["test"],
    dataset_resume_split["test"]
])

split_dataset = DatasetDict({
    "train": train,
    "test": test
})

print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 1814
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 454
    })
})


In [6]:
split_dataset.save_to_disk("dataset/split_combined_dataset_withoutjd")

Saving the dataset (0/1 shards):   0%|          | 0/1814 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/454 [00:00<?, ? examples/s]

In [30]:
# Pre-processing the dataset to include the chat template that all LLaMA 3.2 3B models require

from datasets import load_dataset
import os
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from datasets import disable_progress_bar

disable_progress_bar()
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

instruction = """You are a top-rated NTU career advisor chatbot.
Be polite, concise, and helpful in providing career guidance responses."""

def format_chat_template(row):
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]}
    ]

    # Use the built-in chat template from Llama 3.2 Instruct
    row["text"] = tokenizer.apply_chat_template(
        messages,
        tokenize=False  # returns plain text
    )
    return row

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs
    
formatted_dataset = split_dataset.map(format_chat_template)
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset["train"].column_names  # remove original text columns
)

In [31]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1814
    })
    test: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 454
    })
})


## **Finding Optimal Values - Hyperparameter Tuning**

In [None]:
# Install PyTorch with CUDA support (modify according to your system and CUDA version)
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117

# Install Ray and Ray Tune
!pip install ray[tune]

!pip install datasets

!pip install evaluate

!pip install sacrebleu

In [None]:
pip install -U ipywidgets

# Install Ray and Ray Tune
!python3.11 -m pip install ray[tune]

!python3.11 -m pip install bert-score ipywidgets sacrebleu evaluate

In [23]:
!python3.11 -m pip install bitsandbytes 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m5.7 MB/s[0m  [33m0:00:10[0mm0:00:01[0m00:01[0mm
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [4]:
import torch
import ray
from ray import tune
from bert_score import score

# Verify CUDA support in PyTorch
print(f"CUDA available in PyTorch: {torch.cuda.is_available()}")

# Check if Ray is ready
ray.init(ignore_reinit_error=True)
print("Ray is initialized!")

CUDA available in PyTorch: True


2025-10-16 19:55:57,383	INFO worker.py:2013 -- Started a local Ray instance.


Ray is initialized!




In [32]:
# saving to disk
tokenized_dataset["train"].save_to_disk("tokenized_train_dataset_3")
tokenized_dataset["test"].save_to_disk("tokenized_test_dataset_3")

In [1]:
# Load from disk

from datasets import load_from_disk

tokenized_dataset = load_from_disk("dataset/tokenized_dataset")

In [36]:
import os

# Number of CPU cores available
num_cpus = os.cpu_count()
print("Number of CPUs:", num_cpus)


Number of CPUs: 72


In [37]:
import torch

if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print("Number of GPUs:", num_gpus)
    for i in range(num_gpus):
        print(torch.cuda.get_device_name(i))
else:
    print("No GPU available")

Number of GPUs: 1
Tesla V100-PCIE-32GB


### LoRA 

In [None]:
# Load my model
import numpy as np
import csv

model_name = "meta-llama/Llama-3.2-3B-Instruct"
HUGGING_FACE_TOKEN = "-----" # Put in your Hugging Face token here

small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(900))  # 900 samples
small_test = tokenized_dataset["test"].shuffle(seed=42).select(range(100))     # 100 samples

def train_function_lora(config):
    import gc, os
    from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
    from peft import LoraConfig, get_peft_model
    import evaluate

    bleu = evaluate.load("sacrebleu")
    def compute_metrics(eval_pred, tokenizer):
        logits, labels = eval_pred
    
        # Move logits to CPU to save GPU memory
        logits = torch.tensor(logits).cpu()
        labels = torch.tensor(labels).cpu()
    
        # Take argmax to get predicted token IDs
        preds = torch.argmax(logits, dim=-1).numpy()
    
        # Replace -100 in labels (ignored tokens) with pad_token_id
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
        # Compute BLEU
        bleu_score = bleu.compute(
            predictions=decoded_preds,
            references=[[l] for l in decoded_labels]
        )["score"]
    
        return {"bleu": bleu_score}

    torch.cuda.empty_cache()
    gc.collect()

    r = config["r"]
    alpha = config["alpha"]
    learning_rate = config["learning_rate"]

    print(f"\nTraining with r={r}, alpha={alpha}...")

    # Load model fresh each time
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=HUGGING_FACE_TOKEN,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # LoRA config
    lora_cfg = LoraConfig(
        r=r,
        lora_alpha=alpha,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    lora_model = get_peft_model(model, lora_cfg)

    # Define Hugging Face training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/r{r}_a{alpha}",
        learning_rate=learning_rate,   # from Ray Tune
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  
        #num_train_epochs=config["epochs"],
        num_train_epochs=2,
        logging_dir=f"./logs/r{r}_a{alpha}",
        logging_steps=100,
        report_to="none", 
        fp16=True,
    )

    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=small_train,
        eval_dataset=small_test,
        data_collator=data_collator,
        #compute_metrics=compute_metrics, # compute later
        compute_metrics=None,
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    perplexity = torch.exp(torch.tensor(metrics["eval_loss"])).item()

    tune.report({
        "eval_loss": metrics["eval_loss"],
        "perplexity": perplexity,
        "bleu": metrics.get("bleu", 0.0),
    })

    del model, lora_model, trainer
    torch.cuda.empty_cache()
    gc.collect()


In [12]:
# Define the search space
search_space = {
    "r": tune.choice([16, 32, 64, 128, 256]),
    "alpha": tune.choice([32,64, 128, 256]),
    "learning_rate": tune.loguniform(1e-5, 1e-3),
}

In [13]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.tuner import Tuner
from ray.tune import RunConfig

scheduler = ASHAScheduler(
    metric="eval_loss",z
    mode="min",
    max_t=10,  # max epochs
    grace_period=1,
    reduction_factor=2,
)

trainable_with_cpu_gpu = tune.with_resources(train_function_lora, {"cpu": 4, "gpu": 1})

tuner = tune.Tuner(
    trainable_with_cpu_gpu,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        num_samples=8,  # number of trials
        max_concurrent_trials=1,
    ),
)

results = tuner.fit()
best_result = results.get_best_result(metric="eval_loss", mode="min")
print(best_result.config)

0,1
Current time:,2025-10-15 01:20:06
Running for:,01:02:42.92
Memory:,19.7/377.5 GiB

Trial name,status,loc,alpha,learning_rate,r,iter,total time (s),eval_loss,perplexity,bleu
train_function_lora_44212_00000,TERMINATED,10.128.10.15:308209,256,1.34172e-05,16,1,488.569,1.12392,3.07689,0
train_function_lora_44212_00001,TERMINATED,10.128.10.15:310220,64,0.000375887,64,1,455.969,0.539837,1.71573,0
train_function_lora_44212_00002,TERMINATED,10.128.10.15:311584,256,0.000333228,128,1,464.207,0.53888,1.71409,0
train_function_lora_44212_00003,TERMINATED,10.128.10.15:312971,256,0.000770014,16,1,450.975,0.557507,1.74631,0
train_function_lora_44212_00004,TERMINATED,10.128.10.15:314078,32,6.18333e-05,64,1,455.698,0.978025,2.6592,0
train_function_lora_44212_00005,TERMINATED,10.128.10.15:315565,128,5.5161e-05,32,1,453.127,0.702374,2.01854,0
train_function_lora_44212_00006,TERMINATED,10.128.10.15:317139,128,6.98285e-05,32,1,464.142,0.608863,1.83834,0
train_function_lora_44212_00007,TERMINATED,10.128.10.15:318495,256,0.000193407,128,1,464.087,0.537805,1.71224,0


2025-10-15 01:20:06,306	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/FYP/shar0097/ray_results/train_function_lora_2025-10-15_00-17-23' in 0.0452s.
2025-10-15 01:20:06,312	INFO tune.py:1041 -- Total run time: 3763.25 seconds (3762.88 seconds for the tuning loop).


{'r': 128, 'alpha': 256, 'learning_rate': 0.00019340692258392847}


In [14]:
import pandas as pd

df = results.get_dataframe()
df.to_csv("lora_tuning_results_14Oct(more data_2).csv", index=False)

df1 = results.get_dataframe()
print(df1)

# QLORA

In [None]:
# newly added with bert_score

model_name = "meta-llama/Llama-3.2-3B-Instruct"
HUGGING_FACE_TOKEN = "-------" # Put in your Hugging Face Token here

small_train = tokenized_dataset["train"].shuffle(seed=42).select(range(900))  
small_test = tokenized_dataset["test"].shuffle(seed=42).select(range(10))     

def train_function_lora(config):
    from bert_score import score as bert_score
    from transformers import BitsAndBytesConfig
    import numpy as np
    import csv
    import gc, os, torch
    from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
    from peft import LoraConfig, get_peft_model
    import evaluate
    from bert_score import BERTScorer

    def compute_metrics(eval_pred):
        bleu_scorer = evaluate.load("sacrebleu")
        bert_scorer = BERTScorer(model_type="bert-base-uncased", device=device)

        # Hugging Face gives predictions and labels
        preds, labels = eval_pred
        # Decode predictions
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in labels with pad_token_id before decoding
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
        # BLEU expects list of list of refs
        references = [[l] for l in decoded_labels]
        bleu_score = bleu_scorer.compute(predictions=decoded_preds, references=references)["score"]
    
        # BERTScore
        P, R, F1 = bert_scorer.score(decoded_preds, decoded_labels)
    
        return {
            "bleu": bleu_score,
            "bert_precision": P.mean().item(),
            "bert_recall": R.mean().item(),
            "bert_f1": F1.mean().item()
        }
    torch.cuda.empty_cache()
    gc.collect()

    r = config["r"]
    alpha = config["alpha"]
    learning_rate = config["learning_rate"]

    print(f"\nTraining with r={r}, alpha={alpha}...")

    compute_dtype = getattr(torch, "float16")

    # bits and bytes config for qlora 
    bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=False,
        )

    # Load model fresh each time
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=HUGGING_FACE_TOKEN,
        quantization_config=bnb_config # qlora
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)
    tokenizer.pad_token = tokenizer.eos_token
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # LoRA config
    lora_cfg = LoraConfig(
        r=r,
        lora_alpha=alpha,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    lora_model = get_peft_model(model, lora_cfg)

    training_args = TrainingArguments(
        output_dir=f"./results/r{r}_a{alpha}",
        learning_rate=learning_rate,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,  
        num_train_epochs=2,
        logging_dir=f"./logs/r{r}_a{alpha}",
        logging_steps=100,
        report_to="none",
        fp16=True,
    )

    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=small_train,
        eval_dataset=small_test,
        data_collator=data_collator,
        compute_metrics=None,
    )

    # Train and evaluate
    trainer.train()
    metrics = trainer.evaluate()
    perplexity = torch.exp(torch.tensor(metrics["eval_loss"])).item()

    # Report to Ray Tune - eval_loss  perplexity bleu  bert_precision  bert_recall   bert_f1
    tune.report({
        "eval_loss": metrics["eval_loss"],
        "perplexity": perplexity,
        #"bleu": metrics.get("bleu", 0.0),
        #"bert_precision": metrics.get("bert_precision", 0.0),
        #"bert_recall": metrics.get("bert_recall", 0.0),
        #"bert_f1": metrics.get("bert_f1", 0.0),
    })

    del model, lora_model, trainer
    torch.cuda.empty_cache()
    gc.collect()

In [16]:
# Define the search space
search_space = {
    "r": tune.choice([16, 32, 64, 128, 256]),
    "alpha": tune.choice([32, 64, 128, 256]),
    "learning_rate": tune.loguniform(1e-5, 1e-3),S
    # "batch_size": tune.choice([16, 32]),
    #"epochs": tune.choice([2, 4]),
    # "optimizer": tune.choice(["adam", "sgd"]),
}

In [17]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.tuner import Tuner
from ray.tune import RunConfig

scheduler = ASHAScheduler(
    metric="eval_loss",
    mode="min",
    max_t=10,  # max epochs
    grace_period=1,
    reduction_factor=2,
)

trainable_with_cpu_gpu = tune.with_resources(train_function_lora, {"cpu": 4, "gpu": 1})

tuner = tune.Tuner(
    trainable_with_cpu_gpu,
    param_space=search_space,
    tune_config=tune.TuneConfig(
        scheduler=scheduler,
        num_samples=8,  # number of trials
        max_concurrent_trials=1,
    ),
)

results = tuner.fit()
best_result = results.get_best_result(metric="eval_loss", mode="min")
print(best_result.config)


0,1
Current time:,2025-10-15 02:29:42
Running for:,01:09:35.89
Memory:,17.7/377.5 GiB

Trial name,status,loc,alpha,learning_rate,r,iter,total time (s),eval_loss,perplexity
train_function_lora_075ea_00000,TERMINATED,10.128.10.15:319394,64,9.38677e-05,256,1,533.591,0.523762,1.68837
train_function_lora_075ea_00001,TERMINATED,10.128.10.15:320430,128,0.000560656,32,1,507.834,0.452511,1.57226
train_function_lora_075ea_00002,TERMINATED,10.128.10.15:321754,64,4.80874e-05,128,1,513.855,0.741647,2.09939
train_function_lora_075ea_00003,TERMINATED,10.128.10.15:324184,128,0.000347184,16,1,513.759,0.448758,1.56637
train_function_lora_075ea_00004,TERMINATED,10.128.10.15:325190,256,0.000882962,256,1,516.846,0.45713,1.57953
train_function_lora_075ea_00005,TERMINATED,10.128.10.15:326635,256,0.000513624,32,1,514.369,0.450956,1.56981
train_function_lora_075ea_00006,TERMINATED,10.128.10.15:328341,128,0.000289956,128,1,513.843,0.460199,1.58439
train_function_lora_075ea_00007,TERMINATED,10.128.10.15:329714,128,0.000647171,64,1,516.179,0.454956,1.5761


2025-10-15 02:29:42,514	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/FYP/shar0097/ray_results/train_function_lora_2025-10-15_01-20-06' in 0.2825s.
2025-10-15 02:29:42,519	INFO tune.py:1041 -- Total run time: 4175.91 seconds (4175.61 seconds for the tuning loop).


{'r': 16, 'alpha': 128, 'learning_rate': 0.0003471842340392062}


In [18]:
import pandas as pd

df = results.get_dataframe()
df.to_csv("qlora_tuning_results_14Oct(more data_2).csv", index=False)

# Training the LoRA and QLoRA models 

LoRA

In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model_name = "meta-llama/Llama-3.2-3B-Instruct"
HUGGING_FACE_TOKEN = "------" #Put in your Hugging Face Token here

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HUGGING_FACE_TOKEN,
    device_map='auto',
    dtype=torch.float16,
    low_cpu_mem_usage=True      # efficient loading
)

# Set token
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
# Step 3 : Freezing the Model’s Parameters
for param in model.parameters():
  param.requires_grad = False # model's parameters are frozen during training, gradients for these parameters are not computed during backpropogation
  if param.ndim == 1: # checks if the parameter is 1-d and then converts the data to float32 type, to have mixed-precision training
    param.data = param.data.to(torch.float32)
#  to have the main computations done in float16 (to speed up training and reduce memory consumption), but certain parameters—like biases—are kept in float32 to avoid numerical instability.

  model.gradient_checkpointing_enable() # memory-saving technique,  instead of storing all intermediate activations needed for backpropagation, the model recomputes some activations during the backward pass
  model.enable_input_require_grads() # gradients are calculated for the model’s inputs, which can be useful when you need to compute gradients with respect to the input data

# forward method overrides the default behavior and ensures that the output of the model's forward pass is cast to torch.float32
  class CastOutputToFloat(nn.Sequential):
    def forward(self, x) :
      return super().forward(x).to(torch.float32)  # Cast to float32 after

# typically refers to the final output layer of the language model
# output from the language model head is always in float32 precision
model.lm_head = CastOutputToFloat(model.lm_head)

In [38]:
# Step 4 : Checking Trainable Parameters
def print_trainable_parameters(model):
    """
  printing the number of trainable paramters in the model
  """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

In [39]:
# Step 5 : Setting up LoRA Configuration
for name, module in model.named_modules():
    if 'attn' in name or 'attention' in name:  # Common attention module names
        print(name)
        for sub_name, sub_module in module.named_modules():  # Check sub-modules within attention
            print(f"  - {sub_name}")

model.layers.0.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.0.self_attn.q_proj
  - 
model.layers.0.self_attn.k_proj
  - 
model.layers.0.self_attn.v_proj
  - 
model.layers.0.self_attn.o_proj
  - 
model.layers.0.post_attention_layernorm
  - 
model.layers.1.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.1.self_attn.q_proj
  - 
model.layers.1.self_attn.k_proj
  - 
model.layers.1.self_attn.v_proj
  - 
model.layers.1.self_attn.o_proj
  - 
model.layers.1.post_attention_layernorm
  - 
model.layers.2.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.2.self_attn.q_proj
  - 
model.layers.2.self_attn.k_proj
  - 
model.layers.2.self_attn.v_proj
  - 
model.layers.2.self_attn.o_proj
  - 
model.layers.2.post_attention_layernorm
  - 
model.layers.3.self_attn
  - 
  - q_proj
  - k_proj
  - v_proj
  - o_proj
model.layers.3.self_attn.q_proj
  - 
model.layers.3.self_attn.k_proj
  - 
model.layers.3.self_attn.v_proj
  - 
model.layers.3

In [40]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 256, # optimized
    lora_alpha = 128, # # optimized
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], # This specifies which layers (or submodules) in the model will be adapted using LoRA
    lora_dropout = 0.05, # regularization technique that helps prevent overfitting
    bias = "none", #how to handle the bias terms in the model during fine-tuning, so in this case, no bias terms are updated or fine-tuned during the LoRA process
    task_type = "CAUSAL_LM" #  specifies the type of task for which the model is being fine-tuned
)

In [40]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 256, # optimized
    lora_alpha = 128, # # optimized
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], # Submodules 
    lora_dropout = 0.05,
    bias = "none", 
    task_type = "CAUSAL_LM"
)

In [41]:
# Step 6 : Injecting LoRA into the Model and comparing trainable parameters
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 146800640 || all params: 3359550464 || trainable%: 4.369651284392792


In [42]:
import transformers
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model = lora_model,

    train_dataset = tokenized_dataset['train'],
    args = transformers.TrainingArguments(
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        learning_rate = 3.44e-04, # optimized
        fp16 = True,
        num_train_epochs=2,
        logging_steps = 100,
        output_dir = 'outputs',
        report_to = "none"
    ),
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)
model.config.use_cache = False
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
100,0.8959


TrainOutput(global_step=114, training_loss=0.8247662761755157, metrics={'train_runtime': 415.2161, 'train_samples_per_second': 8.738, 'train_steps_per_second': 0.275, 'total_flos': 1.6525836687507456e+16, 'train_loss': 0.8247662761755157, 'epoch': 2.0})

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
# sharshar20/career-advisory-lora-llama3.2-3b-v2

lora_model.push_to_hub("sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7",
                      use_auth_token=True,
                      commit_message = "Lora Training method for Instruct Model(edited dataset), r=256, alpha=128, lr=3.44e-04",
                      private=False)



Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7/commit/5e6f19c2f13a749783020c40d653fccb45d93b9c', commit_message='Lora Training method for Instruct Model(edited dataset), r=256, alpha=128, lr=3.44e-04', commit_description='', oid='5e6f19c2f13a749783020c40d653fccb45d93b9c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7', endpoint='https://huggingface.co', repo_type='model', repo_id='sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7'), pr_revision=None, pr_num=None)

In [2]:
# Step 10 : Inferencing with trained LoRA adapter - merging both base model and lora adapters
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7"
config = PeftConfig.from_pretrained(peft_model_id)
model_lora = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                            return_dict = True,
                                            device_map = 'auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

modelLoRA = PeftModel.from_pretrained(model_lora,peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
import torch
from bert_score import BERTScorer

def evaluate_bertscore(model, tokenizer, dataset, max_new_tokens=128, batch_size=8, print_every=10):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    
    scorer = BERTScorer(model_type="bert-base-uncased", device=device)
    
    all_preds = []
    all_refs = []

    num_samples = len(dataset)
    
    for i in range(0, num_samples, batch_size):
        batch = dataset[i:i+batch_size]

        input_ids = torch.tensor(batch["input_ids"]).to(device)
        attention_mask = torch.tensor(batch["attention_mask"]).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens
            )

        # Decode predictions
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

        all_preds.extend(decoded_preds)
        all_refs.extend(decoded_labels)

        # Print intermediate BERTScore every `print_every` batches
        if (i // batch_size + 1) % print_every == 0 or (i + batch_size) >= num_samples:
            P, R, F1 = scorer.score(all_preds, all_refs)
            print(f"After batch {i//batch_size + 1}: BERT F1 = {F1.mean().item():.4f}")

    # Compute BERTScore
    P, R, F1 = scorer.score(all_preds, all_refs)
    
    return {
        "bert_precision": P.mean().item(),
        "bert_recall": R.mean().item(),
        "bert_f1": F1.mean().item()
    }

In [4]:
# BERTScore evaluation for modelLoRA
metrics = evaluate_bertscore(modelLoRA, tokenizer, tokenized_dataset["test"])
print(metrics)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 10: BERT F1 = 0.9307


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 20: BERT F1 = 0.9353


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 30: BERT F1 = 0.9365


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 40: BERT F1 = 0.9361


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 50: BERT F1 = 0.9254


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 57: BERT F1 = 0.9207
{'bert_precision': 0.8791199922561646, 'bert_recall': 0.9683712720870972, 'bert_f1': 0.9207432866096497}


In [5]:
metrics = evaluate_bertscore(model_lora, tokenizer, tokenized_dataset["test"])
print(metrics)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 10: BERT F1 = 0.9289


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 20: BERT F1 = 0.9306


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 30: BERT F1 = 0.9329


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 40: BERT F1 = 0.9317


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 50: BERT F1 = 0.9227


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 57: BERT F1 = 0.9187
{'bert_precision': 0.8749769330024719, 'bert_recall': 0.9689300656318665, 'bert_f1': 0.9187436699867249}


In [12]:
# Check if CUDA is available and move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
modelLoRA.to(device)  # Move the model to the chosen device

# Define the input prompt/question
cot_examples = """
You are a helpful and smart career advisor. Always answer concisely in 1-2 sentences
by providing career suggestions based on the user's skills, education and interests. 
Avoid listing too many tools, libraries, career options and responsibilities.

Q: I enjoy working with numbers, have a degree in economics, and experience in Excel and SQL. What careers suit me? 
A: Since you have an economics background and know tools like Excel and SQL, I’d suggest careers such as data analyst, financial analyst, or business intelligence.

Q: I am good at creative writing, storytelling, and content creation. I also know basic graphic design and social media marketing. I have a degree in Communications. What career paths should I consider?
A: With your mix of writing, design, and marketing skills, I’d recommend careers like content creator, social media manager, or digital marketing specialist.

Q: I have experience in project management, leadership, and team coordination. I also understand budgeting and risk management. I hold an MBA. Which careers suit me?
A: Given your leadership and management background, plus an MBA, careers such as project manager or management consultant would suit you.

Q: I am passionate about biology and healthcare. I have laboratory experience, strong analytical skills, and a degree in Biochemistry. What careers should I consider?
A: With your science background and lab experience, careers like research scientist, biotechnologist, or pharmaceutical research are good options.

Q: I have experience in frontend and backend development, cloud technologies, and DevOps practices. I also know JavaScript, Python, and AWS. I have a degree in Computer Science. What career paths are suitable?
A: Since you have strong programming, cloud, and DevOps skills, I’d recommend careers such as full-stack developer, DevOps engineer, or software engineer.

Q: I have Python and ML skills. What careers suit me?
A: With your Python and ML skills, I’d suggest careers like data scientist, machine learning engineer, or AI researcher.

"""

user_query = "I have skills in Analytical reasoning, Software Development, Python, Machine Learning.I have a Bachelors in Computer Engineering. What career paths should I consider?"

# Build the final prompt
input_text = cot_examples.strip() + f"\n\nQ: {user_query}\nA:"

# Tokenize input
batch = tokenizer(input_text, return_tensors="pt")
batch = {k: v.to(device) for k, v in batch.items()}

# Generate with mixed precision
with torch.amp.autocast(device_type=device.type):
    output = modelLoRA.generate(
        **batch,
        max_new_tokens=90,
        do_sample=False,       # turn on sampling
        temperature=0.7,               # moderate randomness
        top_p=0.9,                     # nucleus sampling
        top_k=20,                      # limit candidate tokens
        repetition_penalty=1.3,
        eos_token_id=tokenizer.eos_token_id,  # stop when EOS is reached
        pad_token_id=tokenizer.eos_token_id,
        output_scores=False
    )

# Decode only the new tokens
generated_tokens = output[0][batch["input_ids"].shape[1]:]
output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Model Response:", output_text)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model Response:  Considering your engineering background and python skill, Careers like Data Scientist, Machine Learning Engineer, AI Researcher etc., can be considered.


Note: The above responses are designed to provide brief and concise answers that cater to different personality types. They aim to simulate real-life conversations between a career counselor and a client. 

The goal is to encourage users to explore various fields and industries by highlighting their strengths and transferable skills. By doing so, we hope to


### QLoRA

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,

    )
from peft import LoraConfig, get_peft_model
import torch

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model_name = "meta-llama/Llama-3.2-3B-Instruct"
HUGGING_FACE_TOKEN = "------" # Put in your Hugging Face Token here

qloramodel = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HUGGING_FACE_TOKEN,
    device_map='auto',
    quantization_config=bnb_config
    #low_cpu_mem_usage=True      # efficient loading
)

# Set token
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
qloraconfig = LoraConfig(
    r = 16, # optimized
    lora_alpha = 64, # optimized
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], # This specifies which layers (or submodules) in the model will be adapted using LoRA, key projection and value projection
    lora_dropout = 0.05, # regularization technique that helps prevent overfitting
    bias = "none", #how to handle the bias terms in the model during fine-tuning, so in case, no bias terms are updated or fine-tuned during the LoRA process
    task_type = "CAUSAL_LM" #  specifies the type of task for which the model is being fine-tuned
)

In [7]:
qloraconfig = LoraConfig(
    r = 16, # optimized
    lora_alpha = 64, # optimized
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], 
    lora_dropout = 0.05, 
    bias = "none", 
    task_type = "CAUSAL_LM" 
)

In [8]:
import transformers

qlora_model = get_peft_model(qloramodel, qloraconfig)

trainer = transformers.Trainer(
    model = qlora_model,
    train_dataset = tokenized_dataset['train'],
    args = transformers.TrainingArguments(
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 10,
        learning_rate = 1.71e-4, # optimized
        fp16 = True,
        num_train_epochs=2, # from optimized hyperparameter
        logging_steps = 100,
        output_dir = 'outputs',
        report_to = "none"
    ),
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)
qloramodel.config.use_cache = False
trainer.train()

Step,Training Loss


TrainOutput(global_step=58, training_loss=1.4425745996935615, metrics={'train_runtime': 273.3137, 'train_samples_per_second': 13.274, 'train_steps_per_second': 0.212, 'total_flos': 1.5758903167942656e+16, 'train_loss': 1.4425745996935615, 'epoch': 2.0})

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# sharshar20/career-advisory-qlora-llama3.2-3b-v3

qlora_model.push_to_hub("sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7",
                      use_auth_token=True,
                      commit_message = "QLora Training method for Instruct Model (data changed), r=16, alpha=64, lr=1.71e-04",
                      private=False)



Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7/commit/0a6673f53b0b41fc0f0d4b713ef40268bb69c38b', commit_message='QLora Training method for Instruct Model (data changed), r=16, alpha=64, lr=1.71e-04', commit_description='', oid='0a6673f53b0b41fc0f0d4b713ef40268bb69c38b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7', endpoint='https://huggingface.co', repo_type='model', repo_id='sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7'), pr_revision=None, pr_num=None)

In [10]:
# Step 10 : Inferencing with trained QLoRA adapter
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7"
config = PeftConfig.from_pretrained(peft_model_id)
modelqlora = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                            return_dict = True,
                                            quantization_config=bnb_config,
                                            device_map = 'auto',
                                            trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model_qlora = PeftModel.from_pretrained(modelqlora,peft_model_id)

adapter_config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

In [14]:
instruction = """You are a top-rated NTU career advisor chatbot.
Be polite, concise, and helpful in providing career guidance responses."""

input_text = "I have skills in Analytical reasoning, Software Development, Python, Machine Learning.I have a Bachelors in Computer Engineering. What career paths should I consider? "

In [16]:
# Check if CUDA is available and move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_qlora.to(device)  # Move the model to the chosen device

messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": input_text}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
tokenizer.pad_token = tokenizer.eos_token
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model_qlora.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




You should consider careers like Data Engineer, Business Analyst, Solutions Architect, DevOps Engineer, Cloud Engineer, Technical Support Engineer, Data Scientist, Operations Manager, Technical Support Specialist, Business Analyst, Data Analyst, Cloud Engineer, Solutions Architect, DevOps Engineer, Technical Support Engineer, Data Scientist, Operations Manager, Technical Support Specialist, Business Analyst, Data Analyst, Cloud Engineer, Solutions Architect, DevOps Engineer, Technical Support Engineer, Data Scientist, Operations Manager, Technical Support Specialist, Business Analyst, Data Analyst, Cloud Engineer, Solutions Architect, DevOps Engineer, Technical Support Engineer, Data Scientist, Operations Manager, Technical Support Specialist, Business Analyst, Data Analyst, Cloud Engineer, Solutions Architect, DevOps Engineer, Technical Support Engineer, Data Scientist, Operations Manager


In [18]:
# Check if CUDA is available and move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_qlora.to(device)  # Move the model to the chosen device

messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": input_text}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
tokenizer.pad_token = tokenizer.eos_token
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = modelLoRA.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




The possible career paths for Analytical reasoning, Software Development, Python, Machine Learning and Bachelors in Computer Engineering are: Senior Software Engineer with an experience requirement of At least 5 years. The related skills are Neural Networks, Machine Learning and Image recognition. The possible career paths are Data Scientist. The experience requirement for Data Scientist is At least 4 years. The related skills are Data Analysis, Business Analysis, SQL, NoSQL, Tableau, Power BI and Python. The possible career paths are Full Stack Developer (Python,React js) with an experience requirement of At least 3 years. The related skills are Application Development, System Analysis, Requirement Gathering, Software Design, Development, Integration, Test, Deployment, Support, Documentation, Research


In [12]:
# Finding BERTScore for QLoRA
metrics = evaluate_bertscore(model_qlora, tokenizer, tokenized_dataset["test"])
print(metrics)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 10: BERT F1 = 0.9198


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 20: BERT F1 = 0.9234


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 30: BERT F1 = 0.9206


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 40: BERT F1 = 0.9208


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 50: BERT F1 = 0.9134


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 57: BERT F1 = 0.9101
{'bert_precision': 0.8716595768928528, 'bert_recall': 0.9547721147537231, 'bert_f1': 0.9100630879402161}


In [None]:
## Evaluate BERTScore for the Llama 3.2 3B Instruct model - baseline comparison:
HUGGING_FACE_TOKEN = "-----" # Insert Hugging Face token here
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HUGGING_FACE_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_TOKEN)
tokenizer.pad_token = tokenizer.eos_token

metrics = evaluate_bertscore(model, tokenizer, tokenized_dataset["test"])
print(metrics)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 10: BERT F1 = 0.5601


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 20: BERT F1 = 0.5594


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 30: BERT F1 = 0.5594


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 40: BERT F1 = 0.5598


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 50: BERT F1 = 0.5533


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


After batch 57: BERT F1 = 0.5502
{'bert_precision': 0.5276824235916138, 'bert_recall': 0.5773471593856812, 'bert_f1': 0.5502427816390991}
