In [1]:
# Core ML libraries
# torch>=2.0.0
!pip install transformers>=4.30.0
!pip install datasets>=2.10.0
!pip install accelerate>=0.20.0

# Data processing
!pip install pandas>=1.5.0
!pip install numpy>=1.24.0
!pip install scikit-learn>=1.3.0
!pip install openpyxl>=3.1.0

# Utilities
!pip install tqdm>=4.65.0
!pip install wandb>=0.15.0  # Optional: for experiment tracking
!pip install tensorboard>=2.13.0  # Optional: for logging

# Additional useful packages
!pip install peft>=0.4.0  # For parameter-efficient fine-tuning (LoRA)
!pip install bitsandbytes>=0.39.0  # For 8-bit training
!pip install sentencepiece>=0.1.99  # For some tokenizers

In [2]:
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers.trainer_utils import get_last_checkpoint

from transformers import (
    DataCollatorWithPadding,
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig # For potential quantization
)
from sklearn.model_selection import train_test_split
import re
from typing import List, Dict, Any, Optional
import warnings
import os
from accelerate import Accelerator # For distributed training / mixed precision / device handling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training # For LoRA

In [16]:
warnings.filterwarnings("ignore")

# --- Configuration ---
# Ensure this path points to your Excel file
EXCEL_FILE_PATH = "./Resource-Skills-Experience-Data2.xlsx"
BASE_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1" # Or any other suitable model
OUTPUT_DIR = "./output/finetuned_resource_model_cuda" # Changed output dir name
LOGGING_DIR = "./logs/finetuning_logs_cuda"
CHECKPOINT_DIR = "./checkpoint/finetuning_checkpoints_cuda"
MAX_LENGTH = 512
TEST_SIZE = 0.1
RANDOM_STATE = 42

In [4]:
if torch.cuda.is_available():
    print(f"CUDA is available. Found {torch.cuda.device_count()} GPU(s).")
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    # Set default device for tensors if not using Accelerator's context
    # torch.set_default_device("cuda") # Generally handled by Accelerator/Trainer
else:
    print("CUDA is not available. Training will run on CPU (which will be very slow).")
    print("Ensure you have an NVIDIA GPU, appropriate drivers, and PyTorch with CUDA support installed.")


CUDA is available. Found 1 GPU(s).
Current CUDA device: 0
Device name: Tesla T4


In [5]:
accelerator = Accelerator()
print(f"Accelerator initialized. Using device: {accelerator.device}")

Accelerator initialized. Using device: cuda


In [17]:
TRAINING_ARGS_CONFIG = {
    "output_dir": CHECKPOINT_DIR,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2, # Adjust based on your GPU memory
    "per_device_eval_batch_size": 2,  # Adjust based on your GPU memory
    "gradient_accumulation_steps": 8, # Effective batch size = batch_size * grad_accum * num_gpus
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": LOGGING_DIR,
    "logging_steps": 10,
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "warmup_steps": 200,
    "lr_scheduler_type": "cosine",
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    # --- Mixed Precision ---
    # Set fp16=True for mixed-precision training on most NVIDIA GPUs.
    # Set bf16=True for Ampere GPUs (e.g., A100, RTX 30xx/40xx) or newer for potentially better stability.
    # Accelerator/Trainer will handle the backend.
    "fp16": torch.cuda.is_available(), # Enable fp16 only if CUDA is available
    "bf16": False, # Set to True if you have Ampere/Hopper GPU and want to use bfloat16
    # -----------------------
    "report_to": ["tensorboard"], # Add "wandb" if you have it configured
    "gradient_checkpointing": True, # Saves memory during training, might slow down slightly
    "optim": "adamw_torch", # Recommended optimizer
    # If using multiple GPUs, Trainer handles distribution automatically via Accelerator.
    # "fsdp": "full_shard auto_wrap", # Example for Fully Sharded Data Parallel (requires accelerate config)
    # "fsdp_transformer_layer_cls_to_wrap": ["MistralDecoderLayer"], # Specify layer class for FSDP wrapping (model specific)
}


In [18]:
USE_LORA = True # Set to False to disable LoRA (full fine-tuning)
LORA_CONFIG = LoraConfig(
    r=8, # Rank of the update matrices (higher rank = more parameters, potentially better fit but more memory)
    lora_alpha=16, # LoRA scaling factor
    # Target modules depend on the model architecture.
    # For Mistral: q_proj, k_proj, v_proj, o_proj are common targets in attention layers.
    # You might need to inspect the model architecture (`print(model)`) to confirm.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05, # Dropout probability for LoRA layers
    bias="none", # Usually set to "none" for LoRA
    task_type="CAUSAL_LM" # Specifies the task type for PEFT
)

In [19]:
USE_QUANTIZATION = True # Set to False to disable quantization
# Ensure `bitsandbytes` is installed: pip install bitsandbytes
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True, # Load model in 4-bit precision
    bnb_4bit_quant_type="nf4", # Use NF4 (NormalFloat4) quantization type
    # Compute dtype: Use bfloat16 for Ampere/newer GPUs, float16 for older GPUs.
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    bnb_4bit_use_double_quant=True, # Use double quantization for extra memory savings
) if USE_QUANTIZATION else None # Set to None if not using quantization


In [20]:
class ResourceSkillsDataset(Dataset):
    """Custom PyTorch Dataset for resource skills fine-tuning."""

    def __init__(self, data: List[Dict], tokenizer, max_length: int = MAX_LENGTH):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        print(f"Initialized dataset with {len(data)} examples.")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Expecting pre-formatted prompt string in the 'prompt' key
        text = item["prompt"]

        # Tokenize the formatted prompt
        # Padding is handled by the DataCollator, so we don't pad here initially.
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=512,
            padding=True, # Let DataCollator handle padding
            return_tensors=None, # Return lists of IDs, not tensors yet
        )

        # For Causal LM, labels are usually the same as input_ids.
        # The model learns to predict the next token.
        # The loss calculation typically ignores padding tokens and potentially input prompt tokens.
        # DataCollatorForLanguageModeling handles this masking if mlm=False.
        labels = encoding["input_ids"].copy()

        # Return dictionary compatible with Trainer
        # Tensors will be created and moved to the correct device by the Trainer/DataCollator.
        return {
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": labels,
        }


In [21]:
class ResourceSkillsFineTuner:
    """Encapsulates the fine-tuning pipeline steps."""

    def __init__(self, model_name: str = BASE_MODEL_NAME, use_lora: bool = USE_LORA, quantization_config: Optional[BitsAndBytesConfig] = QUANTIZATION_CONFIG):
        self.model_name = model_name
        self.use_lora = use_lora
        self.quantization_config = quantization_config
        self.tokenizer = None
        self.model = None
        # Use the global accelerator instance
        self.accelerator = accelerator

    def load_excel_data(self, file_path: str) -> pd.DataFrame:
        """Loads data from the specified Excel file."""
        print(f"Loading Excel file from: {file_path}")
        if not os.path.exists(file_path):
             raise FileNotFoundError(f"Error: Excel file not found at {file_path}")
        try:
            df = pd.read_excel(file_path)
            print(f"Loaded {len(df)} records. Columns: {df.columns.tolist()}")
            # Clean column names (remove leading/trailing spaces)
            df.columns = df.columns.str.strip()
            return df
        except Exception as e:
            print(f"Error loading or processing Excel file: {e}")
            raise

    def clean_text(self, text: Any) -> str:
        """Cleans and normalizes text fields from the DataFrame."""
        if pd.isna(text) or text == "":
            return "N/A"
        text = str(text).strip()
        text = re.sub(r"\s+", " ", text) # Consolidate whitespace
        # Keep alphanumeric, common punctuation, and symbols relevant to data
        text = re.sub(r"[^\w\s,.!?\-:/()$%+]", "", text)
        text = re.sub(r"(N/A\s*)+|,\s*N/A", "N/A", text, flags=re.IGNORECASE) # Consolidate N/A
        text = text.replace(";", ",") # Standardize list separators
        return text.strip()

    def format_instruction_prompt(self, instruction: str, response: str) -> str:
        """Formats a prompt according to the Mistral Instruct template."""
        # Format: <s>[INST] Instruction [/INST] Response</s>
        # Ensure EOS token `</s>` is correctly handled by tokenizer/training.
        return f"<s>[INST] {instruction.strip()} [/INST] {response.strip()}</s>"

    def create_training_prompts(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Converts DataFrame rows into structured instruction prompts for fine-tuning."""
        print("Creating training prompts from DataFrame...")
        training_data = []
        expected_columns = [
            'Resource ID', 'Name', 'Job Title/Role', 'Job Title/Role Group',
            'Org Unit L3', 'Org Unit L4', 'Country', 'Worker Type', 'Manager Name',
            'Industry experience (years)', 'Total Experience (years)', 'Degree',
            'Languages', 'Therapeutic Area', 'Speciality', 'Functional Expertise',
            'Technical Skills', 'Certifications', 'Key Strengths',
            'Current Allocation (%)', 'Availability (%)', 'On Bench'
        ]

        # Check for missing columns
        missing_cols = [col for col in expected_columns if col not in df.columns]
        if missing_cols:
            print(f"Warning: Missing expected columns in Excel file: {missing_cols}")

        for idx, row in df.iterrows():
            # Use .get(col, default) for robustness against missing columns
            resource_id = self.clean_text(row.get('Resource ID', f'unknown_{idx}'))
            name = self.clean_text(row.get('Name', 'Unknown'))

            # --- 1. Resource Profile Prompt ---
            try:
                instruction = f"Generate a profile summary for resource {name} (ID: {resource_id})."
                response_parts = [
                    f"Name: {name}",
                    f"Resource ID: {resource_id}",
                    f"Role: {self.clean_text(row.get('Job Title/Role', 'N/A'))}",
                    f"Role Group: {self.clean_text(row.get('Job Title/Role Group', 'N/A'))}",
                    f"Department: {self.clean_text(row.get('Org Unit L3', 'N/A'))}",
                    f"Team: {self.clean_text(row.get('Org Unit L4', 'N/A'))}",
                    f"Location: {self.clean_text(row.get('Country', 'N/A'))}",
                    f"Employment Type: {self.clean_text(row.get('Worker Type', 'N/A'))}",
                    f"Reports to: {self.clean_text(row.get('Manager Name', 'N/A'))}",
                    f"Industry Experience: {self.clean_text(row.get('Industry experience (years)', 0))} years",
                    f"Total Experience: {self.clean_text(row.get('Total Experience (years)', 0))} years",
                    f"Education: {self.clean_text(row.get('Degree', 'N/A'))}",
                ]
                # Filter out N/A or zero experience entries for a cleaner response
                response = "\n".join(part for part in response_parts if "N/A" not in part and "0 years" not in part)
                if response: # Only add if there's meaningful content
                    training_data.append({
                        'prompt': self.format_instruction_prompt(instruction, response),
                        'type': 'profile',
                        'resource_id': resource_id
                    })
            except Exception as e:
                print(f"Warning: Error creating profile prompt for row {idx} (ID: {resource_id}): {e}")

            # --- 2. Skills & Expertise Prompt ---
            try:
                instruction = f"Detail the skills and expertise of resource {name} (ID: {resource_id})."
                response_parts = [
                    f"Languages Spoken: {self.clean_text(row.get('Languages', 'N/A'))}",
                    f"Therapeutic Areas of Focus: {self.clean_text(row.get('Therapeutic Area', 'N/A'))}",
                    f"Specialized Areas: {self.clean_text(row.get('Speciality', 'N/A'))}",
                    f"Functional Skills: {self.clean_text(row.get('Functional Expertise', 'N/A'))}",
                    f"Technical Proficiencies: {self.clean_text(row.get('Technical Skills', 'N/A'))}",
                    f"Relevant Certifications: {self.clean_text(row.get('Certifications', 'N/A'))}",
                    f"Identified Key Strengths: {self.clean_text(row.get('Key Strengths', 'N/A'))}",
                ]
                response = "\n".join(part for part in response_parts if "N/A" not in part)
                if response:
                    training_data.append({
                        'prompt': self.format_instruction_prompt(instruction, response),
                        'type': 'skills',
                        'resource_id': resource_id
                    })
            except Exception as e:
                print(f"Warning: Error creating skills prompt for row {idx} (ID: {resource_id}): {e}")

            # --- 3. Availability & Allocation Prompt ---
            try:
                instruction = f"Report the current allocation and availability status for {name} (ID: {resource_id})."
                response_parts = [
                    f"Current Project Allocation: {self.clean_text(row.get('Current Allocation (%)', 'N/A'))}%",
                    f"Current Availability: {self.clean_text(row.get('Availability (%)', 'N/A'))}%",
                    f"Currently On Bench: {self.clean_text(row.get('On Bench', 'N/A'))}",
                    # Add future allocation if data exists, e.g.:
                    # f"Next Quarter Allocation Forecast: {self.clean_text(row.get('Next 3M Allocation (%)', 'N/A'))}%"
                ]
                response = "\n".join(part for part in response_parts if "N/A" not in part and "N/A%" not in part)
                if response:
                    training_data.append({
                        'prompt': self.format_instruction_prompt(instruction, response),
                        'type': 'availability',
                        'resource_id': resource_id
                    })
            except Exception as e:
                print(f"Warning: Error creating availability prompt for row {idx} (ID: {resource_id}): {e}")

            # --- 4. Project Matching Query Prompt (Example) ---
            # Creates a hypothetical query based on the resource's own data for training.
            try:
                # Use first listed skill/role/exp if available
                skills = self.clean_text(row.get('Technical Skills', '')).split(',')
                required_skill = skills[0].strip() if skills and skills[0].strip() else None
                required_role = self.clean_text(row.get('Job Title/Role', ''))
                required_exp_str = self.clean_text(row.get('Industry experience (years)', '0'))
                required_exp = float(required_exp_str) if required_exp_str.replace('.', '', 1).isdigit() else 0

                if required_skill and required_role and required_exp > 0:
                    instruction = f"Evaluate if resource {name} (ID: {resource_id}) is a potential match for a project needing a '{required_role}' with expertise in '{required_skill}' and over {int(required_exp)-1} years of industry experience."

                    # Simple positive affirmation based on the source data
                    response = f"Based on available data, {name} appears suitable. Key matching attributes: Role ({required_role}), Skill ({required_skill}), Industry Experience ({required_exp:.1f} years)."

                    training_data.append({
                        'prompt': self.format_instruction_prompt(instruction, response),
                        'type': 'matching_positive',
                        'resource_id': resource_id
                    })
                # Optional: Add negative examples here if needed for robustness

            except Exception as e:
                print(f"Warning: Error creating matching prompt for row {idx} (ID: {resource_id}): {e}")

        print(f"Generated {len(training_data)} raw training examples.")
        # Deduplicate based on the exact prompt string
        unique_prompts = {item['prompt']: item for item in training_data}.values()
        final_data = list(unique_prompts)
        print(f"Returning {len(final_data)} unique training examples after deduplication.")
        return final_data

    def initialize_model_and_tokenizer(self):
        """Loads the tokenizer and model, applying quantization and LoRA if configured."""
        print(f"Loading tokenizer: {self.model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)

        # Set padding token if missing (common for Llama/Mistral based models)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            print(f"Tokenizer pad_token set to eos_token ({self.tokenizer.eos_token})")
        # Ensure padding side is right for Causal LMs during generation/training
        self.tokenizer.padding_side = "right"

        print(f"Loading model: {self.model_name}")
        model_load_kwargs = {
            'trust_remote_code': True,
            # Use Accelerator's device map for potentially multi-GPU setups or quantization
            # `device_map="auto"` lets Accelerate handle placement.
            # For quantization with single GPU, `device_map={'': accelerator.local_process_index}` is often used.
            'device_map': "auto",
        }

        if self.quantization_config:
            print("Applying quantization config...")
            model_load_kwargs['quantization_config'] = self.quantization_config
            # Note: `torch_dtype` might be needed depending on model/quantization interaction
            # model_load_kwargs['torch_dtype'] = torch.bfloat16 # or torch.float16
        else:
             # If not quantizing, explicitly set dtype for potential mixed precision
             compute_dtype = torch.bfloat16 if TRAINING_ARGS_CONFIG.get("bf16", False) else torch.float16
             model_load_kwargs['torch_dtype'] = torch.float16 if torch.cuda.is_available() else torch.float32

        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                **model_load_kwargs
            )
        except Exception as e:
             print(f"Error loading model: {e}")
             print("Ensure you have enough GPU memory and necessary libraries (transformers, accelerate, bitsandbytes if quantizing).")
             raise

        # Prepare model for k-bit training if quantization is enabled
        if self.quantization_config:
             print("Preparing model for k-bit training (post-quantization setup)...")
             # This adapts the quantized model for training, often involving gradient checkpointing setup
             self.model = prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=TRAINING_ARGS_CONFIG.get("gradient_checkpointing", False))

        # Apply LoRA if enabled
        if self.use_lora:
            print("Applying LoRA configuration...")
            self.model = get_peft_model(self.model, LORA_CONFIG)
            print("LoRA enabled. Trainable parameters:")
            self.model.print_trainable_parameters()

        # Ensure model is on the correct device (Accelerator usually handles this with device_map="auto")
        # print(f"Model loaded onto device(s): {self.model.device}") # Can be complex with device_map

        print("Model and tokenizer initialized successfully.")

    def prepare_datasets(self, training_data: List[Dict]):
        """Splits data into train/validation sets and creates Dataset objects."""
        if not training_data:
             raise ValueError("Cannot prepare datasets: No training data provided.")

        print(f"Splitting {len(training_data)} examples into train/validation sets (Test size: {TEST_SIZE})...")
        train_data, val_data = train_test_split(
            training_data,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            # Stratify if prompts have meaningful 'type' distribution, otherwise not needed
            # stratify=[item['type'] for item in training_data]
        )
        print(f"Train set size: {len(train_data)}, Validation set size: {len(val_data)}")

        train_dataset = ResourceSkillsDataset(train_data, self.tokenizer, max_length=MAX_LENGTH)
        val_dataset = ResourceSkillsDataset(val_data, self.tokenizer, max_length=MAX_LENGTH)

        return train_dataset, val_dataset

    def train(self, train_dataset: Dataset, val_dataset: Dataset):
        """Configures and executes the training loop using the Hugging Face Trainer."""
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Model and tokenizer must be initialized before training.")

        print("Configuring Training Arguments...")
        training_args = TrainingArguments(**TRAINING_ARGS_CONFIG)

        # Data collator dynamically pads batches to the longest sequence in the batch.
        # mlm=False indicates standard causal language modeling (not masked language modeling).
        data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=False)

        print("Initializing Trainer...")
        # Trainer integrates with Accelerator automatically for device placement and distributed training.
        trainer = Trainer(
            model=self.model, # The potentially PEFT-modified and quantized model
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            # You can add compute_metrics function here for custom evaluation metrics
            # compute_metrics=compute_metrics_function,
        )

        # --- Start Training ---
        print("Starting training process...")
        print(f"Checkpoints will be saved to: {training_args.output_dir}")
        print(f"Logs will be saved to: {training_args.logging_dir}")

        # Resume from checkpoint if exists
        last_checkpoint = None
        if os.path.isdir(training_args.output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            if last_checkpoint:
                print(f"Resuming training from checkpoint: {last_checkpoint}")

        train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
        print("Training finished.")

        # --- Save Results ---
        # Log & save training metrics
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state() # Saves optimizer state, scheduler state, etc.

        # Save the final model (handles PEFT adapters correctly)
        print(f"Saving final model to {OUTPUT_DIR}...")
        # Ensure the final output directory exists
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        trainer.save_model(OUTPUT_DIR)
        # Tokenizer is usually saved by trainer.save_model, but save explicitly if needed
        if not os.path.exists(os.path.join(OUTPUT_DIR, 'tokenizer_config.json')):
            self.tokenizer.save_pretrained(OUTPUT_DIR)
            print(f"Tokenizer explicitly saved to {OUTPUT_DIR}")

        print(f"Best model checkpoint was: {trainer.state.best_model_checkpoint}")
        print(f"Final model artifacts (adapter/full model) saved to: {OUTPUT_DIR}")

        return trainer

    def run_pipeline(self):
        """Executes the complete fine-tuning workflow."""
        try:
            # 1. Load Data
            df = self.load_excel_data(EXCEL_FILE_PATH)

            # 2. Create Prompts
            training_prompts = self.create_training_prompts(df)
            if not training_prompts:
                print("Error: No training prompts generated. Check data or prompt creation logic. Aborting.")
                return

            # 3. Initialize Model and Tokenizer (with CUDA/Quantization/LoRA handling)
            self.initialize_model_and_tokenizer()

            # 4. Prepare Datasets
            train_dataset, val_dataset = self.prepare_datasets(training_prompts)

            # 5. Train Model
            self.train(train_dataset, val_dataset)

            print("\n--- Fine-tuning pipeline completed successfully! ---")

        except FileNotFoundError as e:
            print(f"\n--- Pipeline failed: {e} ---")
        except ValueError as e:
            print(f"\n--- Pipeline failed due to value error: {e} ---")
        except ImportError as e:
             print(f"\n--- Pipeline failed due to missing library: {e} ---")
             print("Please ensure all required libraries (torch, transformers, pandas, accelerate, peft, bitsandbytes, etc.) are installed.")
        except torch.cuda.OutOfMemoryError:
            print("\n--- Pipeline failed: CUDA Out of Memory! ---")
            print("Try reducing `per_device_train_batch_size`, increasing `gradient_accumulation_steps`, enabling LoRA/Quantization, or using a smaller model.")
        except Exception as e:
            print(f"\n--- An unexpected error occurred: {e} ---")
            import traceback
            traceback.print_exc()

# --- Main Execution Block ---

In [11]:
!pip install huggingface_hub



In [12]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
if __name__ == "__main__":
    print("=====================================================")
    print(" Starting Resource Skills Fine-Tuning Pipeline (CUDA Optimized) ")
    print("=====================================================")

    # Ensure necessary directories exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(LOGGING_DIR, exist_ok=True)
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Logging directory: {LOGGING_DIR}")
    print(f"Checkpoint directory: {CHECKPOINT_DIR}")
    print(f"Using base model: {BASE_MODEL_NAME}")
    print(f"Using LoRA: {USE_LORA}")
    print(f"Using Quantization: {USE_QUANTIZATION}")
    print("-----------------------------------------------------")

    # Create and run the pipeline
    finetuner = ResourceSkillsFineTuner(
        model_name=BASE_MODEL_NAME,
        use_lora=USE_LORA,
        quantization_config=QUANTIZATION_CONFIG
    )
    finetuner.run_pipeline()

    print("=====================================================")
    print(" Pipeline execution finished. ")
    print("=====================================================")



 Starting Resource Skills Fine-Tuning Pipeline (CUDA Optimized) 
Output directory: ./output/finetuned_resource_model_cuda
Logging directory: ./logs/finetuning_logs_cuda
Checkpoint directory: ./checkpoint/finetuning_checkpoints_cuda
Using base model: mistralai/Mistral-7B-Instruct-v0.1
Using LoRA: True
Using Quantization: True
-----------------------------------------------------
Loading Excel file from: ./Resource-Skills-Experience-Data2.xlsx
Loaded 688 records. Columns: ['Name', 'Business Unit', 'Department', 'Role', 'Job Title', 'Job Title/Role', 'Country Code', 'Resource Worker Type (employee/contractor)', 'Line Manager', 'Languages', 'Resource ID', 'Degree(s)', 'Therapeutic Indication', 'Therapeutic Area', 'Company experience (years)', 'Speciality', 'Industry experience (years)', 'Pediatric Experience (yes/no)', 'Regulatory/Technological Area', 'Drug Modality and Technological Area', 'Study Area', 'Service Area', 'Technical Writing Experience', 'Other Experience', 'Rate per hour', '

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Tokenizer pad_token set to eos_token (</s>)
Loading model: mistralai/Mistral-7B-Instruct-v0.1
Applying quantization config...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Preparing model for k-bit training (post-quantization setup)...
Applying LoRA configuration...
LoRA enabled. Trainable parameters:
trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879
Model and tokenizer initialized successfully.
Splitting 2064 examples into train/validation sets (Test size: 0.1)...
Train set size: 1857, Validation set size: 207
Initialized dataset with 1857 examples.
Initialized dataset with 207 examples.
Configuring Training Arguments...
Initializing Trainer...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training process...
Checkpoints will be saved to: ./checkpoint/finetuning_checkpoints_cuda
Logs will be saved to: ./logs/finetuning_logs_cuda


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss



--- Pipeline failed due to value error: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected). ---
 Pipeline execution finished. 
