In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv
/kaggle/input/bnb-to-load-transformers-models/peft-0.11.1-py3-none-any.whl
/kaggle/input/bnb-to-load-transformers-models/transformers-4.41.2-py3-none-any.whl
/kaggle/input/bnb-to-load-transformers-models/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
/kaggle/input/bnb-to-load-transformers-models/accelerate-0.30.1-py3-none-any.whl
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/spm.model
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/config.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README (1).md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README.md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/tokenizer_config.json
/kaggle/input/hugging


# Install the necessary libraries for Parameter Efficient Fine-Tuning (PEFT) and model loading


In [13]:

!pip install -q /kaggle/input/bnb-to-load-transformers-models/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
!pip install -q /kaggle/input/bnb-to-load-transformers-models/accelerate-0.30.1-py3-none-any.whl
!pip install /kaggle/input/bnb-to-load-transformers-models/peft-0.11.1-py3-none-any.whl
#!pip install -q /kaggle/input/bnb-to-load-transformers-models/transformers-4.41.2-py3-none-any.whl


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Processing /kaggle/input/bnb-to-load-transformers-models/peft-0.11.1-py3-none-any.whl
peft is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# Import Libraries

In [4]:
# Importing necessary libraries
import os
import random
import warnings
warnings.simplefilter('ignore')  # Ignore warnings to keep the output clean

# Importing essential libraries for computation and machine learning
import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import mean_absolute_percentage_error  # Metric for regression
from sklearn.model_selection import StratifiedKFold  # For performing cross-validation


# Libraries and Configuration for Model Training and Fine-Tuning (PEFT)

In [5]:
# HuggingFace libraries for model loading, training, and tokenization
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)


In [6]:
# Libraries for Parameter-Efficient Fine-Tuning (PEFT)
from peft import (
    LoftQConfig,
    LoraConfig,
    TaskType,
    get_peft_model,
    PeftModel,
    PeftConfig
)

# Configuration class containing all hyperparameters and setup details
class CFG:
    # Number of labels to predict. For regression, we have only one output value (the score).
    n_labels = 1  
    
    # Set device to GPU if available, otherwise use CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
    
    # Random seed for reproducibility of results.
    seed = 1  
    
    # Path to the pre-trained model checkpoint (DeBERTa in this case).
    model_ckpt = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'
    
    # Hyperparameters for training:
    max_input_length = 2000  # Maximum number of tokens the model can handle for each input
    use_peft = False  # Flag to decide whether to use Parameter Efficient Fine-Tuning (PEFT)
    n_freeze = None  # Number of transformer layers to freeze. Set to None to not freeze any layers
    n_folds = 2  # Number of folds for Stratified KFold Cross-Validation
    learning_rate = 5.0e-5  # Learning rate for model optimization
    warmup_ratio = 0.1  # Warm-up ratio for learning rate scheduler
    n_epochs = 1  # Number of epochs for training
    train_batch_size = 4  # Batch size for training
    eval_batch_size = 1  # Batch size for evaluation
    grad_accum_steps = 4  # Number of steps to accumulate gradients before performing a backward pass
    steps = 200  # Number of steps between logging or saving the model
    fp16 = True  # Use 16-bit precision to reduce memory usage and speed up training



# Load the dataset

In [7]:

DATA_DIR = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'  # Dataset location
df = pd.read_csv(DATA_DIR + 'train.csv')  # Read the training data from a CSV file

# Convert the score to a continuous variable for regression
df['label'] = df['score'] * 10  # Scaling score to continuous range
df['label'] = df['label'].astype('float32')  # Ensuring the label is float for regression task

# Check the distribution of the labels
df['label'].value_counts()


# Initialize the tokenizer from the pre-trained model checkpoint
tokenizer = AutoTokenizer.from_pretrained(CFG.model_ckpt)

# Create a data collator for padding sequences to the same length during batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [8]:
# Function to tokenize the input text
def tokenize(batch):
    """
    This function takes a batch of text data and tokenizes it using the tokenizer 
    from the pre-trained DeBERTa model.
    """
    tokenized_inputs = tokenizer(
        batch['full_text'],  # The input text column in the DataFrame
        padding=False,  # Padding is handled by the data collator during training
        truncation=True,  # Ensure that the text doesn't exceed the max input length
        max_length=CFG.max_input_length,  # Truncate input to the maximum length
    )
    return tokenized_inputs  # Return the tokenized inputs


# Model Initialization with Optional Fine-Tuning (PEFT)

In [9]:
# Function to initialize the model with optional fine-tuning
def model_init():
    """
    This function initializes the DeBERTa model and configures it for the regression task.
    It also handles freezing layers if specified and applies PEFT (if enabled).
    """
    # Load the model from the pre-trained checkpoint with the specified number of output labels
    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.model_ckpt, 
        num_labels=CFG.n_labels,
    ).to(CFG.device)  # Move the model to the specified device (GPU or CPU)
    
    # Freeze the embedding and transformer layers if specified (for transfer learning)
    if CFG.n_freeze is not None:
        for param in model.base_model.embeddings.parameters():
            param.requires_grad = False  # Freeze embedding layer
        for i in range(CFG.n_freeze):  # Freeze the specified number of transformer layers
            for param in model.base_model.encoder.layer[i].parameters():
                param.requires_grad = False

    # Apply Parameter Efficient Fine-Tuning (PEFT) if enabled
    if CFG.use_peft:
        loftq_config = LoftQConfig(loftq_bits=4)  # PEFT configuration using LoftQ quantization
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,  # Sequence classification task (used for regression)
            inference_mode=False,  # Training mode (inference would be true during actual inference)
            init_lora_weights='loftq',  # Use LoftQ initialization for PEFT
            loftq_config=loftq_config,  # LoftQ configuration
            use_rslora=True,  # Use RSLora for low-rank adaptation
            r=16,  # Rank for low-rank adaptation
            lora_alpha=8,  # Alpha scaling factor for LoRA layers
            lora_dropout=0,  # No dropout in LoRA layers
        )
        model = get_peft_model(model, peft_config)  # Apply PEFT to the model
    
    return model  # Return the initialized model


# Evaluation Metric, Seed Setting, and Trainable Parameter Calculation Functions

In [10]:
# Compute the Mean Absolute Percentage Error (MAPE) metric for evaluation
def compute_metrics(pred):
    """
    This function calculates the Mean Absolute Percentage Error (MAPE) 
    between the true labels and the model predictions.
    """
    labels = pred.label_ids  # True labels from the evaluation set
    predictions = pred.predictions  # Predictions from the model
    mape = mean_absolute_percentage_error(labels, predictions)  # Calculate MAPE
    return {'mape': mape}  # Return the MAPE value


# Function to set the random seed for reproducibility
def seed_everything(seed: int):
    """
    This function sets the seed for all random number generators (Python, NumPy, PyTorch, CUDA) 
    to ensure reproducibility of results.
    """
    random.seed(seed)  # Set seed for Python's random module
    os.environ["PYTHONHASHSEED"] = str(seed)  # Set seed for Python's hash function
    np.random.seed(seed)  # Set seed for NumPy
    torch.manual_seed(seed)  # Set seed for PyTorch
    torch.cuda.manual_seed(seed)  # Set seed for CUDA (if using GPU)
    torch.backends.cudnn.deterministic = True  # Ensure deterministic results
    torch.backends.cudnn.benchmark = False  # Disable non-deterministic algorithms


# Function to print the number of trainable parameters in the model
def print_trainable_params(model):
    """
    This function prints the number of trainable parameters and the total number of parameters 
    in the model, as well as the ratio between them.
    """
    trainable_params = 0  # Initialize count for trainable parameters
    all_params = 0  # Initialize count for all parameters in the model
    
    # Iterate over the model parameters and count the trainable ones
    for _, param in model.named_parameters():
        all_params += param.numel()  # Add total number of parameters
        if param.requires_grad == True:  # Check if the parameter is trainable
            trainable_params += param.numel()  # Add to the trainable parameter count
    
    # Print the counts and the ratio
    print(f"trainable parameters: {trainable_params}, all parameters: {all_params}, ratio: {100 * trainable_params / all_params}%")



<a name="model-initialization-with-optional-fine-tuning-peft"></a>
## Model Initialization, Cross-Validation, and Training Process

In [11]:
# Initialize the model
model = model_init()

# Print the number of trainable parameters
print_trainable_params(model)

# Set random seed for reproducibility
seed_everything(CFG.seed)


# Cross-validation using Stratified KFold
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)  # Stratified KFold setup for splitting data
for fold, (tr_idx, va_idx) in enumerate(skf.split(df, df['label'])):
    # Split the dataset into training and validation sets
    df_train = df.loc[tr_idx, ['full_text', 'label']].copy()  # Training data
    df_valid = df.loc[va_idx, ['full_text', 'label']].copy()  # Validation data
    print('#'*25, f"Fold {fold}", '#'*25)  # Print fold information
    
    # Prepare datasets using HuggingFace Datasets library
    ds_train = datasets.Dataset.from_pandas(df_train)  # Convert to HuggingFace dataset format
    ds_valid = datasets.Dataset.from_pandas(df_valid)  # Convert to HuggingFace dataset format
    
    # Tokenize the datasets
    tokenized_ds_train = ds_train.map(tokenize, batched=True, batch_size=None)
    tokenized_ds_valid = ds_valid.map(tokenize, batched=True, batch_size=None)
    
    # Convert datasets to PyTorch format
    tokenized_ds_train.set_format('torch')
    tokenized_ds_valid.set_format('torch')
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir='/kaggle/temp/',  # Directory to save model checkpoints
        overwrite_output_dir=True,  # Allow overwriting the output directory
        learning_rate=CFG.learning_rate,  # Learning rate
        warmup_ratio=CFG.warmup_ratio,  # Warm-up ratio for learning rate
        num_train_epochs=CFG.n_epochs,  # Number of epochs to train the model
        per_device_train_batch_size=CFG.train_batch_size,  # Batch size for training
        per_device_eval_batch_size=CFG.eval_batch_size,  # Batch size for evaluation
        gradient_accumulation_steps=CFG.grad_accum_steps,  # Gradient accumulation steps
        gradient_checkpointing=True,  # Use gradient checkpointing for memory efficiency
        fp16=CFG.fp16,  # Enable FP16 precision for faster computation
        logging_strategy='steps',  # Log every few steps
        logging_steps=CFG.steps,  # Number of steps between logging
        evaluation_strategy='steps',  # Evaluate model every few steps
        eval_steps=CFG.steps,  # Number of steps between evaluations
        save_strategy='steps',  # Save model every few steps
        save_steps=CFG.steps,  # Number of steps between saving
        save_total_limit=1,  # Keep only the most recent checkpoint
        load_best_model_at_end=True,  # Load the best model at the end of training
        report_to='none',  # Disable reporting (use 'tensorboard' for logging)
        seed=CFG.seed,  # Set the random seed for training
    )
    
    # Initialize the trainer
    trainer = Trainer(
        model=model,  # Model to train
        args=training_args,  # Training arguments
        train_dataset=tokenized_ds_train,  # Training dataset
        eval_dataset=tokenized_ds_valid,  # Validation dataset
        tokenizer=tokenizer,  # Tokenizer for text preprocessing
        data_collator=data_collator,  # Data collator for batching
        compute_metrics=compute_metrics,  # Metrics for evaluation
    )
    
    # Start the training process
    trainer.train()
    
    # Only perform one fold to save time. Uncomment 'break' to perform cross-validation with multiple folds.
    # break

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 184422913, all parameters: 184422913, ratio: 100.0%
######################### Fold 0 #########################


Map:   0%|          | 0/8653 [00:00<?, ? examples/s]

Map:   0%|          | 0/8654 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Mape
200,386.5261,105.185226,0.416124
400,56.921,68.126259,0.310461


######################### Fold 1 #########################


Map:   0%|          | 0/8654 [00:00<?, ? examples/s]

Map:   0%|          | 0/8653 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Mape
200,47.4831,40.639767,0.221849
400,41.8864,43.680431,0.218837
