<a href="https://colab.research.google.com/github/shinnew9/Apziva_practice_code/blob/main/Project3-PotentialTalents/Qwen_LLMFineTuning%2BLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

While OpenAI provides fine-tuning APIs, but GPT-4 does not support fine-tuning yet. I can fine-tune GPT-3.5-turbo instead but since the model is quite outdated, I will choose to finetune the latest LLM downloaded from HuggingFace.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install Libraries for LLM finetuning

In [25]:
!pip install deepspeed



In [26]:
!pip install transformers peft accelerate datasets bitsandbytes



### Open CSV

In [27]:
import pandas as pd
import json

df = pd.read_csv("/content/drive/MyDrive/Apziva/3rd_PotentialTalents/data.csv")
df_copy = df.copy()
df_copy

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [28]:
# convert csv to jsonl

jsonl_data = df_copy.apply(lambda x: json.dumps({
    "prompt": f"Job Title: {x['job_title']}\nLocation: {x['location']}\nConnections: {x['connection']}",
    "completion": str(x['fit'])
}), axis=1)

# Save as JSONL file
with open("train_data.jsonl", "w") as f:
  f.write("\n".join(jsonl_data))

print("Training data saves as train_data.jsonl")

Training data saves as train_data.jsonl


In [4]:
# In case I might use more

### Fine-Tuning Open-Source Models (HuggingFace+Lora)

In [29]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Model & Tokenizer
# I applied Qwen/Qwen2.5-Math-1.5b was large enough to run in my environment, so didn't even think of trying a larger model, like Qwen/Qwen2.5-Math-7B
model_name = "Qwen/Qwen2.5-Math-1.5B"  # Qwen/Qwen2.5-Math-7B
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

print("Base model loaded successfully!")

Base model loaded successfully!


In [30]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_config = LoraConfig(
    r = 8, # Lora Rank
    lora_alpha = 16,  # LoRA scaling factor
    target_modules = ["q_proj", "v_proj"],  # which layers to fine-tune
    lora_dropout = 0.05,  # Dropout probability
    bias = "none"
)

# Prepare model for LoRA fine-tuning
model = get_peft_model(model, lora_config)
model = prepare_model_for_kbit_training(model)

print("LoRA applied successfully!")

LoRA applied successfully!


In [31]:
from datasets import load_dataset

# Load dataset from JSONL file
dataset = load_dataset("json", data_files={"train":"train_data.jsonl"})
train_dataset = dataset["train"]
print(train_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 104
})


In [33]:
from transformers import Trainer, TrainingArguments
import json

# Load DeepSpeed configuration from JSON file
deepspeed_config = None  # Initialize to None
try:
  config_file_path = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/Qwen2.5-Finetuned/ds_config_zero2.json"
  # Check if the file exists
  if os.path.exists(config_file_path):
    with open(config_file_path, "r") as f:
      # If keys in 'ds_config_zero2.json' file are enclosed with single quotes, replace them with double quotes
      deepspeed_config = json.loads(f.read().replace("'", "\""))
  else:
    print(f"Error: DeepSpeed config file not found at: {config_file_path}")
except json.JSONDecodeError as e:
  print(f"Invalid JSON in DeepSpeed config file: {e}")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


# Fine-tuning arguments
training_args = TrainingArguments(
      output_dir = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/Qwen2.5-Finetuned/results",
      num_train_epochs = 3,  # Number of training epochs
      per_device_train_batch_size = 1,  # Adjust based on my GPU memory
      gradient_accumulation_steps = 2,  # Adjust due to OOM
      gradient_checkpointing = True,    # Adjust due to OOM
      save_strategy = "epoch",
      logging_dir = "/content/drive/MyDrive/Apziva/3rd_PotentialTalents/latestmodels/Qwen2.5-Finetuned/logs",
      remove_unused_columns = False,       # prevents the trainer from removing columns
      fp16 = True,
      deepspeed = deepspeed_config
)


def preprocess_function(examples):
  input_texts = [example['prompt'] for example in examples]
  target_texts = [str(example['completion']) for example in examples]
  tokenized_output = tokenizer(input_texts, text_target=target_texts, padding="max_length", truncation=True)
  # The tokenizer outputs are lists, need to convert to tensors
  return {k: torch.tensor(v) for k, v in tokenized_output.items()} # Convert to tensors


# Trainer setup
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    data_collator = preprocess_function
)


# Start training
trainer.train()

An unexpected error occurred: name 'os' is not defined
[2025-03-19 10:20:58,260] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 GiB. GPU 0 has a total capacity of 39.56 GiB of which 26.54 GiB is free. Process 4564 has 13.01 GiB memory in use. Of the allocated memory 12.28 GiB is allocated by PyTorch, and 252.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)