<a href="https://colab.research.google.com/github/shinnew9/Apziva_practice_code/blob/main/OLMo_LLMFineTuning%2BLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

OpenAI provides fine-tuning APIs, but GPT-4 does not support fine-tuning yet. I could fine-tune GPT-3.5-turbo instead, but since that model is quite outdated, I will fine-tune one of the latest open LLMs downloaded from Hugging Face.

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


### Install Libraries for LLM finetuning

In [None]:
# !pip install deepspeed

In [None]:
!pip install transformers peft accelerate datasets bitsandbytes

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.1

In [None]:
#!pip install -U bitsandbytes

### Open CSV

In [None]:
import pandas as pd
import json

# Read the candidate table from Drive; `df` stays untouched and all later
# steps operate on the copy.
DATA_CSV = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/data.csv"

df = pd.read_csv(DATA_CSV)
df_copy = df.copy()
df_copy

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [None]:
# Convert the CSV rows to JSONL: one {"prompt", "completion"} record per line.

# Rows without a `fit` value end up with the literal completion "nan"
# (str(float("nan"))), which is not a usable training label. Surface that
# loudly instead of silently writing it into the training file.
missing_fit = int(df_copy["fit"].isna().sum())
if missing_fit:
    print(
        f"WARNING: {missing_fit} of {len(df_copy)} rows have no 'fit' value; "
        "their completion will be the string 'nan'."
    )

jsonl_data = df_copy.apply(lambda x: json.dumps({
    "prompt": f"Job Title: {x['job_title']}\nLocation: {x['location']}\nConnections: {x['connection']}",
    "completion": str(x['fit'])
}), axis=1)

# Save as JSONL file (explicit encoding; every record, including the last,
# is terminated by a newline as the JSONL convention expects).
with open("/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/train_data.jsonl", "w", encoding="utf-8") as f:
    for record in jsonl_data:
        f.write(record + "\n")

print("Training data saved as train_data.jsonl")

print(jsonl_data)

Training data saves as train_data.jsonl
0      {"prompt": "Job Title: 2019 C.T. Bauer College...
1      {"prompt": "Job Title: Native English Teacher ...
2      {"prompt": "Job Title: Aspiring Human Resource...
3      {"prompt": "Job Title: People Development Coor...
4      {"prompt": "Job Title: Advisory Board Member a...
                             ...                        
99     {"prompt": "Job Title: Aspiring Human Resource...
100    {"prompt": "Job Title: Human Resources General...
101    {"prompt": "Job Title: Business Intelligence a...
102    {"prompt": "Job Title: Always set them up for ...
103    {"prompt": "Job Title: Director Of Administrat...
Length: 104, dtype: object


In [None]:
# In case I might use more

### Fine-Tuning Open-Source Models (Hugging Face + LoRA)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


# Load model & tokenizer.
# The OLMoE checkpoint is loaded with 4-bit NF4 quantization (fp16 compute)
# to save GPU memory.
model_name = "allenai/OLMoE-1B-7B-0125-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load weights in 4-bit
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Prepare the quantized base model BEFORE attaching the adapters.
# prepare_model_for_kbit_training() freezes every parameter of the model it is
# given; calling it after get_peft_model() (as the original cell did) also
# freezes the freshly created LoRA weights, so nothing would be trained.
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=8,                                   # LoRA rank
    lora_alpha=16,                         # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],   # attention projections to adapt
    lora_dropout=0.05,                     # dropout probability on the LoRA path
    bias="none",
    task_type="CAUSAL_LM",                 # wrap as a causal-LM PEFT model
)

# Attach the LoRA adapters; only these remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # sanity check: must report > 0 trainable params

print("OLMo with LoRA is ready!")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OLMo with LoRA is ready!


In [None]:
from datasets import load_dataset

# Load the JSONL file written earlier; the records land in the "train" split.
TRAIN_JSONL = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/train_data.jsonl"
dataset = load_dataset("json", data_files=TRAIN_JSONL)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def tokenize(example):
    """Tokenize one prompt/completion record for causal-LM fine-tuning.

    The prompt and completion are joined into a single text, tokenized, and
    padded/truncated to 128 tokens using the module-level ``tokenizer``.

    Args:
        example: mapping with string fields ``"prompt"`` and ``"completion"``.

    Returns:
        The tokenizer output with an added ``"labels"`` list. Labels equal the
        input ids on real tokens and -100 on padding positions, so padding
        does not contribute to the language-modelling loss.
    """
    # Handle prompt + completion as one training sequence.
    full_prompt = example["prompt"] + "\nFit Score: " + example["completion"]
    tokenized = tokenizer(
        full_prompt,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    input_ids = tokenized["input_ids"]
    # Fall back to "everything is a real token" if the tokenizer returns no mask.
    attention_mask = tokenized.get("attention_mask") or [1] * len(input_ids)

    # Causal-LM labels mirror the inputs, but padded positions are set to -100
    # (the ignore index of the loss). Copying input_ids verbatim would train
    # the model to predict pad tokens.
    tokenized["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(input_ids, attention_mask)
    ]

    return tokenized


# Tokenize every training record; the raw text columns are dropped so that
# only the tokenizer outputs (plus labels) remain.
tokenized_dataset = dataset["train"].map(
    function=tokenize,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

### Training

In [None]:
from transformers import Trainer, TrainingArguments
import json

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/OLMoE-1B-FineTuned/results",
    num_train_epochs=3,                 # number of training epochs
    per_device_train_batch_size=2,      # adjust to the available GPU memory
    save_strategy="epoch",              # write one checkpoint per epoch
    logging_dir="/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/OLMoE-1B-FineTuned/logs",
    # The default logging interval (500 steps) is longer than this whole run
    # (156 steps), which left the "Step / Training Loss" table empty.
    logging_steps=10,
    remove_unused_columns=False,
    # PeftModel hides the base model's signature, so Trainer cannot infer the
    # label column on its own; name it explicitly.
    label_names=["labels"],
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated in favour of `processing_class=` (the original
    # call triggered a FutureWarning); the tokenizer is still saved with each
    # checkpoint.
    processing_class=tokenizer,
)

# Start training
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myoojinshin9918[0m ([33mApziva-Project3[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=156, training_loss=15.568924341446314, metrics={'train_runtime': 1294.1098, 'train_samples_per_second': 0.241, 'train_steps_per_second': 0.121, 'total_flos': 1633507281469440.0, 'train_loss': 15.568924341446314, 'epoch': 3.0})

### Inference

In [None]:
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer

# Final checkpoint written by the training run (global_step=156).
# The directory name must match the training `output_dir` exactly, including
# the capital "T" in "FineTuned" -- the original path used "Finetuned", which
# is a different directory on a case-sensitive filesystem.
model_path = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/OLMoE-1B-FineTuned/results/checkpoint-156"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,  # important: keeps host-RAM usage low while loading
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Helper that returns only the numeric fit score
def get_fit_score(job_title, search_term):
    """Ask the model for a fit score between a job title and a search term.

    Builds an instruction prompt, generates up to 20 new tokens greedily with
    the module-level ``model``/``tokenizer``, and parses the first decimal
    number from the generated continuation via ``extract_fit_score``.

    Args:
        job_title: candidate job title inserted into the prompt.
        search_term: search phrase the title is compared against.

    Returns:
        float: the parsed score, or 0.0 when no score is found (as returned
        by ``extract_fit_score``).
    """
    prompt = f"""
    Given the job title and search term, assign a numerical fit score between 0 and 1 based on their similarity.
    The fit score should be a single number with no explanation.

    Job Title: {job_title}
    Search Term: {search_term}

    Fit Score:
    """

    inputs = tokenizer(prompt, return_tensors="pt")
    # Follow the model's device instead of hard-coding "cuda".
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,     # greedy, deterministic output (temperature is unused here)
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. The full sequence also contains
    # the prompt, so a decimal inside the job title or search term could
    # otherwise be mistaken for the model's answer.
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return extract_fit_score(completion)

In [None]:
import re

def extract_fit_score(text):
    """Extract the first decimal score from ``text`` and clamp it to [0, 1].

    Only numbers written with a decimal point and a leading 0 or 1
    (e.g. ``0.9``, ``1.0``) are recognised.

    Args:
        text: model output to scan; may be empty or None.

    Returns:
        float: the first matching number limited to the range [0.0, 1.0],
        or 0.0 when ``text`` is empty/None or contains no such number.
    """
    if not text:
        return 0.0

    matches = re.findall(r"\b([0-1]\.\d+)\b", text)
    if not matches:
        return 0.0

    score = float(matches[0])
    # Values such as "1.7" match the pattern; clamp them into the valid range.
    return max(0.0, min(1.0, score))

In [None]:
# Extracting only number
# def extract_fit_score(text):
#     match = re.search(r"([0-1]\.\d+)", text)
#     return float(match.group(1)) if match else 0.0

# # Testing
# job_title = "HR Manager"
# search_term = "Aspiring Human Resources"
# score = get_fit_score(job_title, search_term)
# print(f"Predicted Fit Score: {score}")



Predicted Fit Score: 0.9


In [None]:
from tqdm.notebook import tqdm  # for Jupyter/Colab
tqdm.pandas()

search_term = "Aspiring Human Resources"  # example, you may change it

# Remove a score column left over from an earlier run of this cell.
if "fit_score" in df_copy.columns:
    del df_copy["fit_score"]


def _score_row(row):
    """Score one candidate row against the current search term."""
    return get_fit_score(row["job_title"], search_term)


# Score every candidate (with a progress bar).
df_copy["fit_score"] = df_copy.progress_apply(_score_row, axis=1)

# Rank by descending fit_score; ties share the lowest rank number.
df_copy["rank"] = (
    df_copy["fit_score"]
    .rank(method="min", ascending=False)
    .astype(int)
)
df_sorted = (
    df_copy
    .sort_values("fit_score", ascending=False)
    .reset_index(drop=True)
)

# Show the ten best-scoring candidates.
display_columns = ["job_title", "location", "connection", "fit_score", "rank"]
df_sorted[display_columns].head(10)

  0%|          | 0/104 [00:00<?, ?it/s]



Unnamed: 0,job_title,location,connection,fit_score,rank
0,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.9,1
1,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.9,1
2,People Development Coordinator at Ryan,"Denton, Texas",500+,0.9,1
3,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.9,1
4,Student at Humber College and Aspiring Human R...,Kanada,61,0.9,1
5,Aspiring Human Resources Specialist,Greater New York City Area,1,0.9,1
6,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.9,1
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.9,1
8,Student at Humber College and Aspiring Human R...,Kanada,61,0.9,1
9,Student at Humber College and Aspiring Human R...,Kanada,61,0.9,1


In [None]:
# print(get_fit_score("Software Engineer", "Aspiring Human Resources"))  # 0.5
# print(get_fit_score("Recruiter", "Aspiring Human Resources"))   # 0.9
# print(get_fit_score("AI Researcher", "Aspiring Human Resources"))  # 0.9

# print(get_fit_score("Barista", "Aspiring Human Resources"))   # expected: 0.1, 0.5
# print(get_fit_score("Talent Acquisition Lead", "Aspiring Human Resources"))  # expected: 0.9, 0.9
# print(get_fit_score("Senior Engineer", "Aspiring Human Resources"))  # expected: 0.2~0.4, 0.9
# print(get_fit_score("UX Designer", "Aspiring Human Resources"))  # expected: 0.5
print(get_fit_score("Marketing Manager", "Aspiring Human Resources"))  # expected: 0.5



0.5
0.5
