## Install Dependencies

In [None]:
# Dataset download helper and tabular data tooling.
# NOTE(review): none of the packages in this cell are version-pinned, so a fresh
# install may resolve to different releases than the ones this notebook was written against.
%pip install kagglehub pandas
# Hugging Face fine-tuning stack; -q suppresses pip's progress output.
%pip install -q transformers peft datasets accelerate bitsandbytes sentencepiece pydantic huggingface_hub xformers
# Hyperparameter search.
%pip install optuna

# PyTorch from the CUDA 12.1 wheel index (active line below). The two commented-out
# lines are alternative builds: a pinned 2.2.2 release and a cu128 nightly.
#%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
#%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128


## Configuration

In [None]:
# ==============================
# 🛠 CONFIGURATION
# ==============================

class Config:
    """Output directory layout shared by the notebook's cells.

    All values are relative paths (forward-slash separated) rooted at
    ``JSON_OUTPUT_DIR``; each constant is derived from its parent directory.
    """

    # Root folder and the normalized JD / resume outputs beneath it.
    JSON_OUTPUT_DIR = "json_outputs_all_data"
    JSON_OUTPUT_NORMALIZED_DIR = f"{JSON_OUTPUT_DIR}/normalized"
    JSON_OUTPUT_NORMALIZED_JD = f"{JSON_OUTPUT_NORMALIZED_DIR}/jd"
    JSON_OUTPUT_NORMALIZED_RESUME = f"{JSON_OUTPUT_NORMALIZED_DIR}/resume"

    # Scoring outputs.
    JSON_OUTPUT_SCORING_DIR = f"{JSON_OUTPUT_DIR}/scoring"
    JSON_OUTPUT_SCORING_SPLIT_DIR = f"{JSON_OUTPUT_SCORING_DIR}/split"
    JSON_OUTPUT_SCORING_FT_DATA = f"{JSON_OUTPUT_SCORING_DIR}/FT_data"

    # Fine-tuning inputs, Optuna trial outputs and the exported model.
    JSON_OUTPUT_FINE_TUNE_SCORE = f"{JSON_OUTPUT_DIR}/fine-tune/scored"
    JSON_OUTPUT_FINE_TUNE_RECORD = f"{JSON_OUTPUT_DIR}/fine-tune/record"
    JSON_OUTPUT_FINE_TUNE_TEST_DATA = f"{JSON_OUTPUT_DIR}/fine-tune/test-data"
    JSON_OUTPUT_FINE_TUNE_OUTPUT = f"{JSON_OUTPUT_DIR}/fine-tune/optuna_output"
    JSON_OUTPUT_FINE_TUNE_MODEL = f"{JSON_OUTPUT_DIR}/fine-tune/model"

## Log in to Hugging Face

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Prefer a token supplied through the environment (e.g. Colab secrets or an
# exported HUGGINGFACE_TOKEN variable) so it never appears in the notebook.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

if not HF_TOKEN:
    # getpass hides the typed value, so the secret is not echoed into the
    # cell output that gets saved with the notebook.
    HF_TOKEN = getpass("🔑 Enter your Hugging Face token: ").strip()

if not HF_TOKEN:
    # Fail here with an actionable message instead of sending an empty token to the Hub.
    raise ValueError(
        "No Hugging Face token provided: set HUGGINGFACE_TOKEN or enter one when prompted."
    )

login(token=HF_TOKEN)


# LoRA Fine-Tuning on Lambda with Optuna, LR Scheduler, Early Stopping

### Imports & Configuration

In [None]:
import os

import optuna
import torch
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
# TrainerCallback is provided by transformers; optuna.integration does not export
# a name called TrainerCallback, so importing it from there fails on a fresh kernel.
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)


### Paths & Basic Config

In [None]:
# Base checkpoint used for tokenization and fine-tuning.
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"

# The train/eval JSONL files sit side by side in the fine-tune data directory.
train_path, eval_path = (
    os.path.join(Config.JSON_OUTPUT_FINE_TUNE_TEST_DATA, file_name)
    for file_name in ("train.jsonl", "eval.jsonl")
)



### Load Tokenizer & Dataset

In [None]:

# Tokenizer matching the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Fall back to EOS for padding only when the tokenizer ships without a pad token.
# DataCollatorForLanguageModeling replaces every label equal to pad_token_id with
# -100, so aliasing pad to EOS would also drop the end-of-turn token from the loss
# and the model would never be trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Two splits are loaded from the JSONL files: "train" and "validation".
data = load_dataset("json", data_files={"train": train_path, "validation": eval_path})


### Tokenization Function

In [None]:
def tokenize(example):
    """Format one record as a user/assistant chat exchange and tokenize it.

    Args:
        example: Mapping with an ``input`` entry (user turn) and an ``output``
            entry (assistant turn).

    Returns:
        The encoding produced by the module-level ``tokenizer``, padded and
        truncated to 1024 tokens, with a ``labels`` entry that is a copy of
        ``input_ids``.
    """
    user_turn = f"<|im_start|>user\n{example['input']}<|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{example['output']}<|im_end|>"

    encoded = tokenizer(
        user_turn + assistant_turn,
        padding="max_length",
        truncation=True,
        max_length=1024,
    )
    # Causal-LM targets mirror the inputs; copy so the two lists stay independent.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded


# Tokenize every split; the original columns (taken from the train split) are
# removed so only the tokenizer's outputs remain.
source_columns = data["train"].column_names
tokenized_data = data.map(tokenize, remove_columns=source_columns)

### Define Optuna Objective Function

In [None]:
def objective(trial):
    """Optuna objective: LoRA fine-tune the base model once and return its validation loss.

    Hyperparameters sampled from ``trial``:
        learning_rate: float, log-uniform in [5e-5, 5e-4].
        num_train_epochs: int in [2, 4].
        lora_r: one of 4, 8, 16.
        lora_alpha: one of 16, 32, 64.

    Side effects:
        Writes checkpoints and logs under
        ``Config.JSON_OUTPUT_FINE_TUNE_OUTPUT/optuna_trial_<number>`` and stores
        the best checkpoint path in the trial's ``best_model_path`` user attribute.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        The ``eval_loss`` reported by ``trainer.evaluate()`` on the validation split.
    """
    import gc
    import inspect

    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 4)
    lora_r = trial.suggest_categorical("lora_r", [4, 8, 16])
    lora_alpha = trial.suggest_categorical("lora_alpha", [16, 32, 64])

    output_dir = os.path.join(Config.JSON_OUTPUT_FINE_TUNE_OUTPUT, f"optuna_trial_{trial.number}")
    logging_dir = os.path.join(output_dir, "logs")

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
    # and Trainer's `tokenizer` to `processing_class`. The installs are unpinned,
    # so use whichever keyword the installed version actually accepts.
    eval_strategy_key = (
        "eval_strategy"
        if "eval_strategy" in TrainingArguments.__dataclass_fields__
        else "evaluation_strategy"
    )
    tokenizer_key = (
        "processing_class"
        if "processing_class" in inspect.signature(Trainer.__init__).parameters
        else "tokenizer"
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    model = None
    trainer = None

    try:
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
        )
        model = get_peft_model(base_model, lora_config)

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=8,
            save_strategy="epoch",
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            bf16=True,
            load_best_model_at_end=True,
            report_to="none",
            save_total_limit=1,
            logging_dir=logging_dir,
            logging_steps=10,
            **{eval_strategy_key: "epoch"},
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_data["train"],
            # The dataset is loaded with the splits "train" and "validation".
            eval_dataset=tokenized_data["validation"],
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
            **{tokenizer_key: tokenizer},
        )

        trainer.train()
        trial.set_user_attr("best_model_path", trainer.state.best_model_checkpoint)
        eval_metrics = trainer.evaluate()
        return eval_metrics["eval_loss"]
    finally:
        # Each trial loads a fresh copy of the base model; drop the references and
        # clear the CUDA cache so memory does not accumulate across trials.
        del trainer, model, base_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


### Launch Optuna Tuning

In [None]:
# Search budget and sampler seed, named so they are easy to find and tune.
N_TRIALS = 10
OPTUNA_SEED = 42

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs (TPESampler is the sampler create_study uses by default).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
)
# gc_after_trial runs a garbage collection pass after every trial.
study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

print("✅ Best hyperparameters:")
print(study.best_params)


###  Save Final Best Model 

In [None]:
best_trial_number = study.best_trial.number

# The objective records each trial's best checkpoint path as a user attribute.
best_model_path = study.best_trial.user_attrs.get("best_model_path")
if not best_model_path:
    # Without a recorded checkpoint there is nothing to load; stop with a clear message.
    raise RuntimeError(
        f"Trial {best_trial_number} has no recorded 'best_model_path'; cannot export a model."
    )
print("✅ Best model path:", best_model_path)

# Reload the winning checkpoint and write it, together with the tokenizer, to the export directory.
model = AutoModelForCausalLM.from_pretrained(best_model_path)
model.save_pretrained(Config.JSON_OUTPUT_FINE_TUNE_MODEL)
tokenizer.save_pretrained(Config.JSON_OUTPUT_FINE_TUNE_MODEL)



## Save to Hugging Face

In [None]:
from huggingface_hub import HfApi, HfFolder
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Optional: Login (only needed once per environment)
# from huggingface_hub import login
# login("hf_your_access_token")

# Set model path and repo name
# Local export directory (not used by the upload below) and the target Hub repository.
model_path = Config.JSON_OUTPUT_FINE_TUNE_MODEL
repo_name = "rubsj/Qwen2-Resume-ATS"  # customize this

# Upload the model first, then the tokenizer, to the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(repo_name)


## Create ZIP for Lambda Download

In [None]:
import shutil

# Archive the contents of the export directory into a sibling "<export dir>.zip".
export_dir = Config.JSON_OUTPUT_FINE_TUNE_MODEL
zip_path = export_dir + ".zip"
shutil.make_archive(export_dir, "zip", root_dir=export_dir)
print(f"✅ Model zipped at: {zip_path}")
