### 1. Environment Validation

In [None]:
import torch

# 1. Environment Detection
# A successful import of google.colab is treated as "running inside Colab".
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# 2. Hardware Validation
# Fall back to the literal string "None" when no CUDA device is visible.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "None"
is_t4 = "T4" in gpu_name

# 3. Status Report
print(f"üåç Is Google Colab: {IN_COLAB}")
print(f"üöÄ GPU Detected: {gpu_name}")

# Both conditions must hold: Colab runtime AND a T4 device name.
status_message = (
    "‚úÖ Success: Environment meets T4 GPU requirements."
    if IN_COLAB and is_t4
    else "‚ùå Check Failed: Ensure T4 GPU runtime is enabled in Notebook Settings."
)
print(status_message)

In [None]:
# Print the current working directory; the relative "MiniProject01/..." paths
# used in later cells resolve against it.
!pwd

### 2. Dependency Installation

In [None]:
import os

# Install Unsloth & Core Ecosystem
# The pinned install below only runs when `import unsloth` fails, so re-running
# this cell in a prepared runtime skips it.
try:
    import unsloth
    print("‚úÖ Unsloth already installed.")
except ImportError:
    # Optimized install for T4 GPU Runtimes
    # Unsloth is installed from its GitHub repository with the "colab-new" extra.
    !pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    # --no-deps: install exactly these packages without resolving their own dependencies.
    !pip install -q --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
    print("‚úÖ Dependencies installed successfully.")

# Force update to ensure latest patches for Llama-3-8b support
# NOTE(review): this runs unconditionally, unpinned and without --no-deps, so pip
# may replace the versions pinned above — confirm this is intended.
# NOTE(review): if `import unsloth` already succeeded in the try block, the running
# kernel presumably keeps the previously loaded module until it is restarted.
!pip install -q --upgrade --no-cache-dir unsloth unsloth_zoo

### 3. Workspace Synchronization

In [None]:
import gdown
import os

# Project Artifacts Folder ID
folder_id = '1lBF3jieW1m4dapjzokIH1nTnHnBhkUQL'
drive_url = f'https://drive.google.com/drive/folders/{folder_id}'

# Download the shared project folder from Google Drive into the current
# working directory.
# remaining_ok=True tells gdown to continue when the folder holds more items
# than it can fetch in one call instead of aborting; in that case some files
# may be missing locally, so check the gdown log above.
# Per the gdown documentation the call returns the list of downloaded files,
# or None when the download failed.
downloaded_files = gdown.download_folder(drive_url, quiet=False, remaining_ok=True)

# Validation of required directory structure.
# Require both a successful download and the expected directory, so a stale
# folder left over from an earlier run is not reported as a fresh sync.
if downloaded_files and os.path.isdir("MiniProject01"):
    print("‚úÖ Workspace synchronized. Ready for model loading.")
else:
    print("‚ö†Ô∏è Warning: Folder name mismatch. Check gdown output.")

### 4. Configuration and Environment Initialization

In [None]:
import yaml
import torch
import gc
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

def _read_yaml(path):
    """Parse the YAML file at `path` with the safe loader and return the result."""
    with open(path, "r") as handle:
        return yaml.safe_load(handle)


# 1. Centralized project settings
config = _read_yaml("MiniProject01/src/config/config.yaml")

# 2. Instruction persona and prompt templates
prompts = _read_yaml("MiniProject01/src/config/prompts.yaml")

# 3. Status Report: echo the two settings that drive the fine-tuning run
print("‚úÖ Configuration and Prompt Library Loaded.")
finetune_settings = config['finetuning']
print(f"   Target Model: {finetune_settings['model_name']}")
print(f"   Training Limit: {finetune_settings['training']['max_steps']} steps")

### 5. Model Loading & LoRA Configuration

In [None]:
# 1. Memory Management: release Python garbage and cached CUDA blocks first
gc.collect()
torch.cuda.empty_cache()

ft_conf = config['finetuning']

# 2. Load the base model and tokenizer; quantization is driven by the config
print("‚è≥ Loading Model via Unsloth (4-bit NF4)...")
base_model_kwargs = {
    "model_name": ft_conf['model_name'],
    "max_seq_length": ft_conf['max_seq_length'],
    "dtype": None,
    "load_in_4bit": ft_conf['load_in_4bit'],  # Mandatory for T4 fit
    "device_map": {"": 0},  # Place the whole model on GPU index 0
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# 3. Wrap the model with LoRA adapters using the settings under finetuning.lora
print("‚öôÔ∏è Injecting LoRA Adapters into q, k, v, o_proj...")
lora_settings = ft_conf['lora']
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_settings['r'],
    target_modules=lora_settings['target_modules'],  # q_proj, k_proj, v_proj, o_proj
    lora_alpha=lora_settings['alpha'],
    lora_dropout=lora_settings['dropout'],
    bias=lora_settings['bias'],
    use_gradient_checkpointing="unsloth",  # Memory-efficient gradient handling
)

print("‚úÖ Model Ready for Financial Intelligence Training.")

### 6. Dataset Preparation & Formatting

In [None]:
# 1. Read the synthetic training examples from the JSON Lines file
train_data_path = "/content/MiniProject01/artifacts/data/train.jsonl"
dataset = load_dataset(
    "json",
    split="train",
    data_files=train_data_path,
)

# 2. Define Chat Template Mapping
def format_chat_template(row):
    """Attach a chat-formatted "text" field to one dataset row.

    Builds a three-turn conversation (system persona, user question plus
    context, assistant answer) from the row's 'instruction', 'input' and
    'output' fields, and renders it to a string with the tokenizer's chat
    template. The row is updated in place and returned.

    Relies on the notebook-level `prompts` and `tokenizer` objects.
    """
    persona = prompts['intern_persona'].strip()

    # Question first, then the supporting passage under a "Context:" heading
    question_with_context = "{}\n\nContext:\n{}".format(row['instruction'], row['input'])

    conversation = [
        dict(role=speaker, content=message)
        for speaker, message in (
            ("system", persona),
            ("user", question_with_context),
            ("assistant", row['output']),
        )
    ]

    # Render to plain text only; tokenization is left to the trainer
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

print("üìù Formatting Dataset into Instruction Blocks...")
# Process in parallel for speed
dataset = dataset.map(format_chat_template, num_proc=2)

print(f"‚úÖ Processed {len(dataset)} instruction-tuning examples.")

### 7. Training Execution

In [None]:
tr_conf = config['finetuning']['training']

# 1. Configure Training Arguments
# Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()
training_arg_values = {
    "output_dir": tr_conf['output_dir'],
    "per_device_train_batch_size": tr_conf['batch_size'],
    "gradient_accumulation_steps": tr_conf['grad_accum_steps'],
    "warmup_steps": 10,
    "max_steps": tr_conf['max_steps'],
    # Cast explicitly: the YAML value may have been parsed as a string
    "learning_rate": float(tr_conf['learning_rate']),
    "fp16": not bf16_available,
    "bf16": bf16_available,
    "logging_steps": 10,
    "optim": tr_conf['optim'],  # adamw_8bit for T4 VRAM efficiency
    "weight_decay": 0.01,
    "lr_scheduler_type": "linear",
    "seed": tr_conf['seed'],
    "report_to": "none",
}
training_args = TrainingArguments(**training_arg_values)

# 2. Initialize SFTTrainer on the pre-rendered "text" column
print("üöÄ Initializing SFTTrainer...")
sequence_limit = config['finetuning']['max_seq_length']
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=sequence_limit,
    args=training_args,
    packing=False,  # One example per sequence, no packing
)

# 3. Execute Training Loop
print("üî• Starting The Intern Fine-Tuning...")
trainer_stats = trainer.train()

print("‚úÖ Training Complete! Loss converged.")

### 8. Inference Pipeline & Test Generation

In [None]:
import json
import time
from tqdm.auto import tqdm

# 1. Optimize for Inference
# Hand the trained model to Unsloth's inference switch before any generate() call.
# NOTE(review): the return value is not used, so this presumably modifies `model`
# in place — confirm against the Unsloth documentation.
FastLanguageModel.for_inference(model)

# 2. Define Lead Architect Inference Function
def query_intern(instruction, context=""):
    """Generate the fine-tuned 'Intern' model's answer for one question.

    Builds a system/user conversation (persona as the system message, the
    instruction followed by a "Context:" section as the user message),
    renders it through the tokenizer's chat template with a generation
    prompt, runs generation, and decodes only the newly generated tokens.

    Args:
        instruction: The question text.
        context: Supporting passage placed under "Context:"; may be empty.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.

    Relies on the notebook-level `prompts`, `config`, `tokenizer` and `model`.
    """
    system_msg = prompts['intern_persona'].strip()
    # Same layout as format_chat_template, so inference prompts match the
    # training text (including the "Context:" heading when context is empty).
    user_content = f"{instruction}\n\nContext:\n{context}"

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_content}
    ]

    # Apply the tokenizer's chat template and append the assistant header
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    inference_conf = config['finetuning']['inference']
    temperature = inference_conf['temperature']

    # Temperature only has an effect when sampling. Set the decoding mode
    # explicitly so the configured value is honored regardless of the
    # checkpoint's generation defaults: a positive temperature samples,
    # while 0/None falls back to greedy decoding (a temperature of 0 is
    # rejected by sampling-based generation).
    do_sample = temperature is not None and temperature > 0
    generation_kwargs = {
        "max_new_tokens": inference_conf['max_new_tokens'],
        "use_cache": inference_conf['use_cache'],
        "do_sample": do_sample,
    }
    if do_sample:
        generation_kwargs["temperature"] = temperature

    outputs = model.generate(inputs, **generation_kwargs)

    # Drop the prompt tokens so only the generated continuation is decoded
    response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
    return response.strip()

# 3. Load Golden Test Set for Evaluation
test_data_path = "/content/MiniProject01/artifacts/data/golden_test_set.jsonl"
with open(test_data_path, "r", encoding="utf-8") as f:
    # Skip blank lines so a stray or trailing empty line does not crash json.loads
    test_data = [json.loads(line) for line in f if line.strip()]

# 4. Execute Prediction Loop with Latency Tracking
results = []
print(f"‚ö° Generating predictions for {len(test_data)} items...")

for item in tqdm(test_data):
    q = item['instruction']
    # Resolve the context outside the try block so a record without an
    # 'input' field is not misreported as an inference error.
    context = item.get('input', "")

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
    start_time = time.perf_counter()
    try:
        pred = query_intern(q, context)
    except Exception as e:
        # Best effort: record the failure text as the answer and keep going
        pred = f"Inference Error: {str(e)}"
    end_time = time.perf_counter()

    results.append({
        "question": q,
        "ground_truth": item['output'],
        "intern_answer": pred,
        "intern_latency_ms": (end_time - start_time) * 1000
    })

# 5. Save Artifacts for The Showdown (one JSON object per line)
intern_preds_path = "/content/MiniProject01/artifacts/data/intern_predictions.jsonl"
with open(intern_preds_path, "w", encoding="utf-8") as f:
    for entry in results:
        f.write(json.dumps(entry) + "\n")

print(f"‚úÖ Success: Predictions saved to {intern_preds_path}")

### 9. Adapter Preservation

In [None]:
# 1. Target directory for the saved artifacts
adapter_path = "/content/MiniProject01/artifacts/outputs/llama-3-financial-intern"

# 2./3. Write the model first, then the tokenizer, into the same directory so
# the two are stored together.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the LoRA
# adapter weights rather than the full base model — confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_path)

print(f"‚úÖ Adapters and tokenizer successfully saved to: {adapter_path}")

### 10. Artifacts Export & Backup

In [None]:
# Mount Google Drive under /content/drive so artifacts can be copied to
# storage that outlives the Colab runtime.
# NOTE(review): drive.mount presumably asks for authorization interactively —
# confirm before running this notebook unattended.
from google.colab import drive
drive.mount('/content/drive')

# This message is printed unconditionally; it does not itself verify the mount.
print("‚úÖ Google Drive Mounted at: /content/drive/MyDrive")

In [None]:
import os

import shutil

# 1. Define Persistent Storage Paths
source_folder = "/content/MiniProject01"

# Destination: Google Drive directory for the final artifacts
drive_root = "/content/drive/MyDrive"
destination_folder = "/content/drive/MyDrive/MiniProject01"

# Refuse to proceed when Drive is not mounted: creating the destination anyway
# would silently "back up" onto the ephemeral local disk.
if not os.path.isdir(drive_root):
    print(f"Error: Google Drive is not mounted at '{drive_root}'. Run the mount cell first.")
else:
    # 2. Create the destination directory structure
    print(f"üìÇ Creating persistent directory: {destination_folder}")
    os.makedirs(destination_folder, exist_ok=True)

    # 3. Synchronize Artifacts
    if os.path.isdir(source_folder):
        print("üöÄ Archiving adapters and predictions to Drive...")
        # Recursive copy that merges into the existing destination. Unlike a
        # shell `cp src/*`, this also copies hidden entries and raises on
        # failure, so the success message below is only reached after a
        # completed copy.
        shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

        # 4. Final Audit of Backed-up Files
        print("\n‚úÖ Backup Complete. Files in Drive:")
        for entry_name in sorted(os.listdir(destination_folder)):
            print(f"  {entry_name}")
    else:
        print(f"‚ùå Error: Source folder '{source_folder}' not found. Check local paths.")