## Install Dependencies

In [4]:
%pip install kagglehub pandas
%pip install -q transformers peft datasets accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
%pip install optuna

#%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
#%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128


Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.20.1+cu121 requires torch==2.5.1, but you have torch 2.7.0 which is incompatible.
torchaudio 2.5.1+cu121 requires torch==2.5.1, but you have torch 2.7.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 KB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sqlalchemy>=1.4.2
  Downloading sqlalchemy-2.0.41-cp310-cp310-man

## Configurations  

In [1]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    JSON_OUTPUT_DIR = "json_outputs_all_data"
    JSON_OUTPUT_NORMALIZED_DIR = "json_outputs_all_data/normalized"
    JSON_OUTPUT_NORMALIZED_JD = "json_outputs_all_data/normalized/jd"
    JSON_OUTPUT_NORMALIZED_RESUME = "json_outputs_all_data/normalized/resume"
    JSON_OUTPUT_SCORING_DIR = "json_outputs_all_data/scoring"
    JSON_OUTPUT_SCORING_SPLIT_DIR = "json_outputs_all_data/scoring/split"
    JSON_OUTPUT_SCORING_FT_DATA = "json_outputs_all_data/scoring/FT_data"
    JSON_OUTPUT_FINE_TUNE_SCORE = "json_outputs_all_data/fine-tune/scored"
    JSON_OUTPUT_FINE_TUNE_RECORD = "json_outputs_all_data/fine-tune/record"
    JSON_OUTPUT_FINE_TUNE_TEST_DATA = "json_outputs_all_data/fine-tune/test-data"
    JSON_OUTPUT_FINE_TUNE_OUTPUT = "json_outputs_all_data/fine-tune/optuna_output"
    JSON_OUTPUT_FINE_TUNE_MODEL = "json_outputs_all_data/fine-tune/model"

## Login to huggingface

In [2]:
from huggingface_hub import login
import os

# Set your token here securely or prompt for it in Colab
# Recommended: store in Colab secrets or environment variable
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")


if not HF_TOKEN:
    # Prompt for token if not set in environment
    print("🔑 Please enter your Hugging Face token:")
    # For Colab or local prompt input
    HF_TOKEN = input("🔑 Enter your Hugging Face token: ").strip()

login(token=HF_TOKEN)


  from .autonotebook import tqdm as notebook_tqdm


# Full Fine-Tuning on Lambda with Optuna, LR Scheduler, Early Stopping

### Imports & Configuration

In [17]:
import optuna
import os
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
import torch
from transformers import BitsAndBytesConfig



### Paths & Basic Config

In [18]:
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"


# ✅ Paths
train_path =os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "train.jsonl") 
eval_path = os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, "eval.jsonl") 



### Load Tokenizer & Dataset

In [6]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


data = load_dataset("json", data_files={"train": train_path, "validation": eval_path})


Generating train split: 23995 examples [00:00, 40014.30 examples/s]
Generating validation split: 5999 examples [00:00, 39194.83 examples/s]


### Tokenization Function

In [7]:
def tokenize(example):
    prompt = f"<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n{example['output']}<|im_end|>"
    tokens = tokenizer(prompt, padding="max_length", truncation=True, max_length=1024)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


tokenized_data = data.map(tokenize, remove_columns=data["train"].column_names)

Map: 100%|██████████| 23995/23995 [02:58<00:00, 134.23 examples/s]
Map: 100%|██████████| 5999/5999 [00:43<00:00, 138.10 examples/s]


### Define Optuna Objective Function

In [19]:
def objective(trial):
    from peft import prepare_model_for_kbit_training

    # Hyperparameter suggestions
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 4)
    lora_r = trial.suggest_categorical("lora_r", [4, 8, 16])
    lora_alpha = trial.suggest_categorical("lora_alpha", [16, 32, 64])

    # Load base model with bitsandbytes quantization (no meta tensor error)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    base_model = prepare_model_for_kbit_training(base_model)

    # Apply LoRA
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(base_model, lora_config)

    # Output directories for current trial
    output_dir = os.path.join(Config.JSON_OUTPUT_FINE_TUNE_OUTPUT, f"optuna_trial_{trial.number}")
    logging_dir = os.path.join(output_dir, "logs")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        bf16=True,
        load_best_model_at_end=True,
        report_to="none",
        save_total_limit=1,
        logging_dir=logging_dir,
        logging_steps=10,
    )

    # Prepare trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Training and evaluation
    trainer.train()
    trial.set_user_attr("best_model_path", trainer.state.best_model_checkpoint or output_dir)
    eval_metrics = trainer.evaluate()
    return eval_metrics["eval_loss"]


### Launch Optuna Tuning

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

print("✅ Best hyperparameters:")
print(study.best_params)


[I 2025-06-24 00:34:42,238] A new study created in memory with name: no-name-335da27f-3fb5-4e1a-91d8-b549ede1f15b
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.18s/it]
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


###  Save Final Best Model 

In [None]:
best_trial_number = study.best_trial.number
#best_model_path = f"optuna_output/{best_trial_number}"
best_model_path = study.best_trial.user_attrs.get("best_model_path")
print("✅ Best model path:", best_model_path)

model = AutoModelForCausalLM.from_pretrained(best_model_path)
model.save_pretrained(Config.JSON_OUTPUT_FINE_TUNE_MODEL)
tokenizer.save_pretrained(Config.JSON_OUTPUT_FINE_TUNE_MODEL)



## Save to hugging face

In [None]:
from huggingface_hub import HfApi, HfFolder
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Optional: Login (only needed once per environment)
# from huggingface_hub import login
# login("hf_your_access_token")

# Set model path and repo name
model_path = Config.JSON_OUTPUT_FINE_TUNE_MODEL
repo_name = "rubsj/Qwen2-Resume-ATS"  # customize this

# Push model and tokenizer to HF hub
model.push_to_hub(repo_name, , private=True)
tokenizer.push_to_hub(repo_name, , private=True)


## Create ZIP for Lambda Download

In [None]:
import shutil

zip_path = f"{Config.JSON_OUTPUT_FINE_TUNE_MODEL}.zip"
shutil.make_archive(base_name=Config.JSON_OUTPUT_FINE_TUNE_MODEL, format='zip', root_dir=Config.JSON_OUTPUT_FINE_TUNE_MODEL)
print(f"✅ Model zipped at: {zip_path}")


In [None]:
print(f"✅ Hugging Face Model URL: https://huggingface.co/{repo_name}")
print(f"✅ Local zip ready for download: {zip_path}")
