# Regression task version 2

### connect to google drive

In [None]:
# Mount Google Drive at /content/drive; the config, dataset and output paths
# used by the rest of this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

### install dependencies

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" regardless of the environment's locale settings.

    The ``do_setlocale`` parameter is accepted only so the signature stays
    call-compatible with ``locale.getpreferredencoding``; it is ignored.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Monkey-patch the stdlib lookup so every later caller is told UTF-8.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
!pip install -q -U bitsandbytes
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install datasets
!pip install huggingface_hub
!pip install datasets
!pip install peft



In [None]:
# Import libraries
import torch
import yaml
import json
from transformers import LlamaTokenizer, LlamaForCausalLM, Trainer, TrainingArguments, PreTrainedTokenizerFast, BitsAndBytesConfig
from datasets import Dataset, load_dataset
import os
import sentencepiece
from transformers import AutoModelForSequenceClassification, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import numpy as np
import torch.nn.functional as F

### load arguments from config file

In [None]:
# Load the experiment configuration (model name, Hugging Face token and the
# `training_args_regre` section) from the YAML file on Drive.
# The encoding is passed explicitly so parsing does not depend on the
# runtime's locale default.
with open('/content/drive/MyDrive/llm3_bp/config.yaml', 'r', encoding='utf-8') as f:
  config = yaml.safe_load(f)

# Echo the checkpoint name as a quick check that the right config was read.
print(config['model_name'])

meta-llama/Llama-3.2-1B-Instruct


### Load pretrained model and tokenizer from HF

In [None]:
# Read the Hugging Face access token from the YAML config (`config`), not from
# an environment variable.
# `.get` is used so that a missing key reaches the explicit check below
# instead of surfacing as a bare KeyError.
# NOTE(review): the token is stored in plain text in a file on Drive; avoid
# sharing or committing that file.
hf_token = config.get('HUGGINGFACE_TOKEN')
if hf_token is None:
    raise ValueError("'HUGGINGFACE_TOKEN' is not set in the config file")
if not isinstance(hf_token, str):
    raise ValueError("'HUGGINGFACE_TOKEN' in the config file is not a string")
if not hf_token.strip():
    raise ValueError("'HUGGINGFACE_TOKEN' in the config file is empty")

# Name of the pretrained checkpoint to fine-tune,
# e.g. meta-llama/Llama-3.2-1B-Instruct
model_name = config['model_name']
print(model_name)

meta-llama/Llama-3.2-1B-Instruct


In [None]:
# Tokenizer for the checkpoint; the token authenticates the download.
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, token=hf_token, use_fast=True)

# Load the pretrained backbone (no quantization) with a newly initialised
# 2-unit head: output 0 is SBP, output 1 is DBP.
#
# problem_type="regression" is required. With num_labels > 1 and floating-point
# labels the built-in loss otherwise falls back to multi-label classification
# (BCEWithLogitsLoss), which is not a valid objective for real-valued targets
# and yields negative, diverging "loss" values.
model = AutoModelForSequenceClassification.from_pretrained(
    config['model_name'],
    num_labels=2,
    problem_type="regression",
    token=hf_token,
)


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## load training arguments from config

In [None]:

# Build the Trainer hyper-parameters from the `training_args_regre` section
# of the YAML config. Options that are currently disabled in this notebook:
# eval_steps, remove_unused_columns, load_best_model_at_end,
# metric_for_best_model, greater_is_better.
_regre_cfg = config['training_args_regre']

training_args = TrainingArguments(
    output_dir=_regre_cfg['output_dir'],
    num_train_epochs=_regre_cfg['num_train_epochs'],
    per_device_train_batch_size=_regre_cfg['per_device_train_batch_size'],
    gradient_accumulation_steps=_regre_cfg['gradient_accumulation_steps'],
    # The config key is `gradient_checkpoint`; the argument is `gradient_checkpointing`.
    gradient_checkpointing=_regre_cfg['gradient_checkpoint'],
    # Coerced explicitly — presumably because the YAML value may arrive as a
    # string (e.g. "2e-5"); confirm against the config file.
    learning_rate=float(_regre_cfg['learning_rate']),
    fp16=_regre_cfg['fp16'],
    bf16=_regre_cfg['bf16'],
    save_safetensors=_regre_cfg['save_safetensors'],
    save_total_limit=_regre_cfg['save_total_limit'],
    logging_steps=_regre_cfg['logging_steps'],
    save_strategy=_regre_cfg['save_strategy'],
    save_steps=_regre_cfg['save_steps'],
    logging_strategy=_regre_cfg['logging_strategy'],
    logging_dir=_regre_cfg['logging_dir'],
    evaluation_strategy=_regre_cfg['evaluation_strategy'],
    report_to=_regre_cfg['report_to'],
)



### define loss function for BP regression task (SBP and DBP)

In [None]:
def custom_loss_function(predictions, labels, sbp_weight=1.0, dbp_weight=1.0):
  """
  Weighted sum of the mean-squared errors of the SBP and DBP predictions.

  predictions: Tensor of shape [batch_size, 2]; column 0 is SBP, column 1 is DBP
  labels: Tensor of shape [batch_size, 2] with the true SBP and DBP values
  sbp_weight: multiplier applied to the SBP error term
  dbp_weight: multiplier applied to the DBP error term

  Returns a scalar tensor: sbp_weight * MSE(SBP) + dbp_weight * MSE(DBP).
  Raises ValueError when predictions and labels have different shapes.
  """
  if predictions.shape != labels.shape:
    # F.mse_loss would silently broadcast mismatched shapes; fail loudly instead.
    raise ValueError(
        f"predictions shape {tuple(predictions.shape)} does not match "
        f"labels shape {tuple(labels.shape)}"
    )

  # Compare in the predictions' dtype so integer or double labels do not break mse_loss.
  labels = labels.to(predictions.dtype)

  sbp_pred = predictions[:, 0]  # Predicted SBP
  dbp_pred = predictions[:, 1]  # Predicted DBP

  sbp_labels = labels[:, 0]     # True SBP
  dbp_labels = labels[:, 1]     # True DBP

  # Mean squared error per output. The label columns extracted above are used
  # here; the earlier code referenced undefined names and raised NameError.
  sbp_loss = F.mse_loss(sbp_pred, sbp_labels)
  dbp_loss = F.mse_loss(dbp_pred, dbp_labels)

  # Combine the two error terms
  total_loss = (sbp_loss * sbp_weight) + (dbp_loss * dbp_weight)

  return total_loss

### define compute metrics

In [None]:
def compute_metrics(pred):
    """Return the per-target and averaged mean-squared error for an evaluation pass.

    Reads ``pred.label_ids`` and ``pred.predictions`` and treats column 0 of
    each as SBP and column 1 as DBP. The result is a dict with the keys
    ``sbp_mse``, ``dbp_mse`` and ``average_mse``.
    """
    truth = pred.label_ids
    estimate = pred.predictions

    def column_mse(col):
        # Mean of the squared differences for a single output column.
        diff = estimate[:, col] - truth[:, col]
        return np.mean(diff ** 2)

    sbp_mse = column_mse(0)
    dbp_mse = column_mse(1)

    metrics = {"sbp_mse": sbp_mse, "dbp_mse": dbp_mse}
    metrics["average_mse"] = (sbp_mse + dbp_mse) / 2
    return metrics


### split dataset into train dataset and test dataset

In [None]:
# Dataset preparation for the regression task.
# NOTE(review): the file is named test_prompt_1.jsonl but is used as the whole
# corpus (training and held-out data) — confirm this is the intended file.
#data_file_dir = config['data']['data_prompt1_file']
dataset = load_dataset('json', data_files='/content/drive/MyDrive/llm3_bp/test_prompt_1.jsonl')
print("Preview the dataset", dataset)

# Hold out 10% of the examples for evaluation (90% train / 10% test).
# The seed is fixed so every run produces the same split; without it the
# evaluation metrics of different runs are measured on different examples.
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)

# Assign the train and test datasets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Print sizes of the train and test datasets
print(f"Train dataset: {len(train_dataset)} examples")  # Training examples count
print(f"Test dataset: {len(test_dataset)} examples")    # Test examples count

# Raw input/target columns, kept as separate lists for optional inspection.
input_train = train_dataset['input']
input_test = test_dataset['input']
target_train = train_dataset['target']
target_test = test_dataset['target']

print(f"Input Train dataset: {len(input_train)} examples")
print(f"Input Test dataset: {len(input_test)} examples")

print(f"Target Train dataset: {len(target_train)} examples")
print(f"Target Test dataset: {len(target_test)} examples")

Preview the dataset DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 360146
    })
})
Train dataset: 324131 examples
Test dataset: 36015 examples
Input Train dataset: 324131 examples
Input Test dataset: 36015 examples
Target Train dataset: 324131 examples
Target Test dataset: 36015 examples


### initialize the trainer

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,  # Include your evaluation dataset
#     compute_metrics=compute_metrics,       # Add the compute_metrics function
#     tokenizer=tokenizer,
#     data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
# )

### Check data column names

In [None]:
# Show the column names of the training split (at this point still the raw,
# untokenized columns).
print(train_dataset.column_names)

['input', 'target']


### define the tokenize function

In [None]:
# def tokenize_function(examples):
#     return tokenizer(examples['input'], truncation=True, padding="max_length")

### version 2 of the tokenize function

In [None]:
# def tokenize_function(example):
#     if isinstance(example["target"], list):
#         target_text = " ".join(example["target"])
#     else:
#         target_text = example["target"]

#     sbp_value = -1.0  # Default value for missing or invalid SBP
#     dbp_value = -1.0  # Default value for missing or invalid DBP

#     try:
#         if "SBP value is" in target_text and "DBP value is" in target_text:
#             sbp_str = target_text.split("SBP value is ")[1].split(",")[0].strip()
#             dbp_str = target_text.split("DBP value is ")[1].split(".")[0].strip()

#             sbp_value = float(sbp_str) if sbp_str != "N/A" else -1.0
#             dbp_value = float(dbp_str) if dbp_str != "N/A" else -1.0
#         else:
#             print(f"Warning: Target text does not contain valid SBP and DBP values: {target_text}")
#     except (IndexError, ValueError) as e:
#         print(f"Warning: Skipping example with invalid target text format: {target_text}. Error: {e}")

#     # Tokenize the input text
#     input_tokenized = tokenizer(
#         example["inputs"],
#         truncation=True,
#         max_length=512,  # Adjust based on input size
#         padding="max_length",
#         return_tensors="pt"
#     )

#     # Ensure labels are the correct shape
#     labels = torch.tensor([sbp_value, dbp_value], dtype=torch.float32).unsqueeze(0)  # Shape [1, 2]

#     # Squeeze input tensors to match model expected input
#     input_ids = input_tokenized["input_ids"].squeeze(0)
#     attention_mask = input_tokenized["attention_mask"].squeeze(0)

#     return {
#         "input_ids": input_ids,  # Shape [seq_len]
#         "attention_mask": attention_mask,  # Shape [seq_len]
#         "labels": labels  # Shape [1, 2]
#     }


In [None]:
# def tokenize_function(example):
#     # Check if the target is a list and convert it to a string if needed
#     if isinstance(example["target"], list):
#         target_text = " ".join(example["target"])  # Join list elements into a single string
#     else:
#         target_text = example["target"]  # If it's already a string, use it as is

#     # Initialize default values for SBP and DBP
#     sbp_value = -1.0  # Default value for missing or invalid SBP
#     dbp_value = -1.0  # Default value for missing or invalid DBP

#     # Ensure the parsing logic doesn't fail without setting these values
#     try:
#         # Check if both SBP and DBP exist in the target text
#         if "SBP value is" in target_text and "DBP value is" in target_text:
#             # Extract SBP and DBP values from the target text
#             sbp_str = target_text.split("SBP value is ")[1].split(",")[0].strip()
#             dbp_str = target_text.split("DBP value is ")[1].split(".")[0].strip()

#             # Convert SBP string to float if it's valid
#             if sbp_str != "N/A":
#                 try:
#                     sbp_value = float(sbp_str)
#                 except ValueError:
#                     print(f"Warning: Invalid SBP value encountered: {sbp_str}. Using default value -1.0.")
#             else:
#                 print("Warning: SBP value is 'N/A'. Using default value -1.0.")

#             # Convert DBP string to float if it's valid
#             if dbp_str != "N/A":
#                 try:
#                     dbp_value = float(dbp_str)
#                 except ValueError:
#                     print(f"Warning: Invalid DBP value encountered: {dbp_str}. Using default value -1.0.")
#             else:
#                 print("Warning: DBP value is 'N/A'. Using default value -1.0.")
#         else:
#             print(f"Warning: Target text does not contain valid SBP and DBP values: {target_text}")
#     except (IndexError, ValueError) as e:
#         print(f"Warning: Skipping example with invalid target text format: {target_text}. Error: {e}")

#     # Tokenize the input text
#     input_tokenized = tokenizer(
#         example["input"],
#         truncation=True,
#         max_length=config['training_args']['max_length'],
#         padding="max_length",
#         return_tensors="pt",
#     )

#     # Convert SBP and DBP values to tensor format for the model
#     labels = torch.tensor([sbp_value, dbp_value], dtype=torch.float32)

#     # Prepare the final dictionary with input IDs, attention mask, and labels
#     input_ids = input_tokenized["input_ids"].squeeze(0)  # Remove batch dimension
#     attention_mask = input_tokenized["attention_mask"].squeeze(0)  # Remove batch dimension

#     return {
#         "input_ids": input_ids,  # Tokenized input text
#         "attention_mask": attention_mask,  # Attention mask to handle padding
#         "labels": labels  # SBP and DBP values as a tensor
#     }


### version 3 of the tokenize function

In [None]:
import re

# Value stored in the label when a blood-pressure reading is absent or unparsable.
# NOTE(review): examples carrying this sentinel are still returned and would be
# trained on as if -1.0 were a real reading — consider filtering them out.
_MISSING_BP_VALUE = -1.0

# What may follow "<NAME> value is": the literal N/A, or an optionally signed
# integer or decimal number (e.g. "120", "80.5", "80.").
_BP_VALUE_PATTERN = re.compile(r"\s*(N/A|[-+]?(?:\d+(?:\.\d*)?|\.\d+))")


def _extract_bp_value(target_text, name):
    """Return the number following "<name> value is" in target_text, or the sentinel.

    The caller must have checked that the marker is present in target_text.
    """
    remainder = target_text.split(f"{name} value is", 1)[1]
    match = _BP_VALUE_PATTERN.match(remainder)
    if match is None:
        print(f"Warning: Invalid {name} value encountered: {remainder.strip()}. Using default value -1.0.")
        return _MISSING_BP_VALUE
    if match.group(1) == "N/A":
        print(f"Warning: {name} value is 'N/A'. Using default value -1.0.")
        return _MISSING_BP_VALUE
    return float(match.group(1))


def tokenize_function(example):
    """Convert one raw example into model inputs and a [SBP, DBP] regression label.

    Args:
        example: mapping with an "input" text and a "target" that is a string
            (or a list of strings, joined with spaces) containing
            "SBP value is <x>" and "DBP value is <y>".

    Returns:
        dict with "input_ids" and "attention_mask" (1-D tensors padded or
        truncated to the configured max_length) and "labels", a float32
        tensor of shape [2] holding [SBP, DBP]. A value that is missing,
        "N/A" or unparsable is replaced by -1.0 and a warning is printed.
    """
    target = example["target"]
    target_text = " ".join(target) if isinstance(target, list) else target

    sbp_value = _MISSING_BP_VALUE
    dbp_value = _MISSING_BP_VALUE

    has_both_markers = (
        isinstance(target_text, str)
        and "SBP value is" in target_text
        and "DBP value is" in target_text
    )
    if has_both_markers:
        # The number is matched with a pattern rather than by splitting on
        # "," / ".": splitting on "." cut decimal readings such as "80.5"
        # down to "80".
        sbp_value = _extract_bp_value(target_text, "SBP")
        dbp_value = _extract_bp_value(target_text, "DBP")
    else:
        print(f"Warning: Target text does not contain valid SBP and DBP values: {target_text}")

    # Tokenize the input text to a fixed length
    input_tokenized = tokenizer(
        example["input"],
        truncation=True,
        max_length=config['training_args_regre']['max_length'],
        padding="max_length",
        return_tensors="pt"
    )

    # Label tensor of shape [2]: (SBP, DBP)
    labels = torch.tensor([sbp_value, dbp_value], dtype=torch.float32)

    # return_tensors="pt" adds a leading batch dimension of 1; drop it.
    input_ids = input_tokenized["input_ids"].squeeze(0)
    attention_mask = input_tokenized["attention_mask"].squeeze(0)

    return {
        "input_ids": input_ids,  # Shape [seq_len]
        "attention_mask": attention_mask,  # Shape [seq_len]
        "labels": labels  # Shape [2]
    }


### tokenize the train dataset and test dataset

In [None]:
# Padding setup.
# tokenize_function pads every example with padding="max_length", which needs
# the tokenizer to have a pad token; define one if it is missing.
if tokenizer.pad_token is None:
    # Register '[PAD]' as a new special token. This grows the tokenizer's
    # vocabulary, which is why the model's embeddings are resized later in
    # this notebook (model.resize_token_embeddings(len(tokenizer))).
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Use a special string like '[PAD]'

# From here on the tokenizer has a pad_token ('[PAD]' if it was added above).


In [None]:
# Tokenize both splits one example at a time (batched=False) and drop the raw
# columns so only the fields returned by tokenize_function remain.
train_dataset = train_dataset.map(
    tokenize_function,
    batched=False,
    remove_columns=train_dataset.column_names,
)
print("Preview the train tokenized dataset", train_dataset[0])

test_dataset = test_dataset.map(
    tokenize_function,
    batched=False,
    remove_columns=test_dataset.column_names,
)
print("Preview the test tokenized dataset", test_dataset[0])


Map:   0%|          | 0/324131 [00:00<?, ? examples/s]

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m


### Preview after tokenize the train and test dataset

In [None]:
# Print the first tokenized example of each split to inspect the fields
# produced by the tokenization step.

print("Preview the train tokenized dataset", train_dataset[0])

print("Preview the test tokenized dataset", test_dataset[0])

Preview the train tokenized dataset {'input_ids': [128000, 791, 53194, 4519, 527, 7432, 4519, 11, 393, 15249, 82, 4519, 11, 323, 8743, 82, 4519, 13, 3092, 8743, 82, 4519, 527, 510, 717, 13, 914, 11, 220, 15, 13, 1927, 11, 220, 15, 13, 1721, 11, 220, 16, 13, 605, 11, 482, 15, 13, 1927, 11, 482, 15, 13, 845, 1145, 393, 15249, 82, 4519, 527, 510, 15, 13, 1227, 11, 15, 13, 1399, 11, 220, 15, 13, 2287, 1145, 7432, 82, 4519, 527, 510, 3076, 13, 6281, 11, 220, 16, 13, 1806, 11, 220, 15, 13, 2137, 11, 220, 15, 13, 5833, 11, 220, 15, 13, 1135, 11, 220, 19, 13, 2287, 948, 20817, 389, 1521, 828, 11, 1148, 1053, 387, 279, 19698, 42345, 7918, 6680, 7410, 320, 17094, 47, 8, 323, 1891, 561, 7918, 6680, 7410, 320, 3590, 47, 8, 2819, 30, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Show the column names of the training split after tokenization.
print(train_dataset.column_names)

['input_ids', 'attention_mask', 'labels']


## Implement LoRA config

In [None]:
# LoRA configuration: rank-8 adapters on the attention query/key/value
# projections, for a sequence-classification (here: 2-output regression) head.
config_lora = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["q_proj", "k_proj", "v_proj"],
)

# Wrap the model with PEFT. This freezes the pretrained weights and leaves only
# the adapter weights (and, for SEQ_CLS, the classification head) trainable.
model = get_peft_model(model, config_lora)

# Do NOT set requires_grad=True on every parameter: that unfreezes the whole
# backbone and turns LoRA into full fine-tuning. When gradient checkpointing is
# enabled, what is needed instead is for the embedding output to require grad,
# so that gradients can reach the adapters through the frozen layers.
model.enable_input_require_grads()

# Report how many parameters are actually trainable (should be a small fraction).
model.print_trainable_parameters()


In [None]:
import torch
import torch.nn as nn
# Mean-squared-error criterion for the regression outputs. The same name is
# assigned again in the later cell that defines CustomTrainer.
criterion = nn.MSELoss()

In [None]:
# Reshape labels within the training loop
# class CustomTrainer(Trainer):
#   def training_step(self, model, inputs):
#     """
#     Perform a training step on a batch of inputs.
#     Subclass and override in inject custom behavior.
#     """
#     model.train()
#     inputs = self._prepare_inputs(inputs)

#     # Reshape labels here
#     if "labels" in inputs:
#       labels = inputs["labels"].squeeze(1) # remove the extra dimension

#     with self.autocast_smart_context_manager():
#       loss = self.compute_loss(model, inputs)

#     if self.args.gradient_accumulation_steps > 1:
#       loss = loss / self.args.gradient_accumulation_steps

#     if self.do_floating_point_precision_scaling:
#       self.scaler.scale(loss).backward()

#     elif self.use_apex:
#       with autocast():
#         loss.backward()

#     return loss.detach()

#   # replace the existing training step in CustomTrainer
#   trainer.training_step = training_step.__get__(trainer, trainer.__class__)


In [None]:


# # define trainer class
# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         logits = outputs.logits
#         loss = criterion(logits, labels)
#         return (loss, outputs) if return_outputs else loss


In [None]:
# from torch.cuda.amp import autocast, GradScaler

# #scaler = GradScaler()
# criterion = nn.MSELoss()


# class CustomTrainer(Trainer):
#   def compute_Loss(self, model, inputs, return_outputs=False):
#     labels = inputs.get("labels")

#      # Reshape labels to match the model's output size
#     #labels = labels.squeeze() # Assuming shape is [batch_size, 2]

#     # Ensure labels are of the correct shape
#     if labels is not None:
#       if labels.dim() == 3 and labels.shape[1] == 1:
#         labels = labels.squeeze(1)

#     # enable autocasting for mixed precision within compute_loss
#     with autocast():
#       outputs = model(**inputs)
#       logits = outputs.logits

#       # print the shape of logits and labels
#       print(f"logits shape: {logits.shape}")
#       print(f"labels shape: {labels.shape}")


#       loss = criterion(logits, labels)

#     return (loss, outputs) if return_outputs else loss

## version 4 computing loss function

In [None]:
# Loss used by CustomTrainer: plain mean-squared error over both outputs.
criterion = nn.MSELoss()


class CustomTrainer(Trainer):
    """Trainer that optimises the MSE between the model logits and [SBP, DBP] labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the regression loss for one batch.

        The method must be named ``compute_loss`` (lower-case) — that is the
        hook ``Trainer`` calls. Under any other name the override is ignored
        and the model's built-in loss is used instead.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain "labels" of shape [batch, 2]
                ([batch, 1, 2] and [batch] are also accepted and reshaped).
            return_outputs: when True, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass it; not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when return_outputs is True.

        Raises:
            ValueError: if the batch has no "labels" entry.
        """
        # Pop (rather than get) the labels so the model does not also compute
        # its own built-in loss on them.
        labels = inputs.pop("labels", None)
        if labels is None:
            raise ValueError("CustomTrainer.compute_loss requires a 'labels' entry in inputs")

        # Mixed precision is handled by the Trainer around this call, so no
        # explicit autocast / GradScaler is needed here.
        outputs = model(**inputs)

        # Compute the loss in float32 even when the forward pass ran in fp16/bf16.
        logits = outputs.logits.float()
        labels = labels.to(device=logits.device, dtype=logits.dtype)

        # Bring the labels to the logits' [batch, n_outputs] layout.
        if labels.dim() == 3 and labels.shape[1] == 1:
            labels = labels.squeeze(1)
        elif labels.dim() == 1:
            labels = labels.unsqueeze(-1).expand(-1, logits.size(-1))

        loss = criterion(logits, labels)

        # Check for NaNs or large values
        if torch.isnan(loss) or loss > 1e6:
            print("Warning: Loss is NaN or excessively large")

        return (loss, outputs) if return_outputs else loss

    # Backward-compatible alias for the previous (misspelled) method name.
    compute_Loss = compute_loss


  scaler = GradScaler()


In [None]:
# define compute metrics
def compute_metrics(eval_pred):
  """Return MAE, MSE and RMSE over all predicted values of an evaluation pass.

  eval_pred: a (predictions, labels) pair. Each element may be a numpy array
    or a torch tensor; predictions may also be a tuple of model outputs, in
    which case its first element is used. Predictions of shape [N, 1, K] are
    squeezed to [N, K].

  The SBP and DBP columns are flattened together, so each metric is an
  average over both targets.

  Raises ValueError if predictions and labels hold a different number of
  values, or none at all.
  """
  predictions, labels = eval_pred

  # When the model returns several outputs the predictions arrive as a tuple;
  # the logits are its first element.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # Convert to numpy if predictions or labels are torch tensors
  if isinstance(predictions, torch.Tensor):
    predictions = predictions.detach().cpu().numpy()
  if isinstance(labels, torch.Tensor):
    labels = labels.detach().cpu().numpy()

  # float64 keeps the averages accurate for large evaluation sets.
  predictions = np.asarray(predictions, dtype=np.float64)
  labels = np.asarray(labels, dtype=np.float64)

  # squeeze predictions if necessary
  if predictions.ndim == 3 and predictions.shape[1] == 1:
    predictions = np.squeeze(predictions, axis=1)

  # Flatten both predictions and labels
  preds = np.ravel(predictions)
  labels = np.ravel(labels)

  if preds.size != labels.size or preds.size == 0:
    raise ValueError(
        f"predictions ({preds.size} values) and labels ({labels.size} values) "
        "must be non-empty and of equal size"
    )

  errors = preds - labels
  mse = float(np.mean(errors ** 2))
  mae = float(np.mean(np.abs(errors)))
  rmse = float(np.sqrt(mse))

  return {
      "MAE": mae,
      "MSE": mse,
      "RMSE": rmse,
  }


In [None]:
# Wire the model, the training arguments, the tokenized train/evaluation
# splits, the tokenizer and the metric callback into the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,  # Include your evaluation dataset
#     compute_metrics=compute_metrics,       # Add the compute_metrics function
#     tokenizer=tokenizer
# )

In [None]:
# check index range
# import torch

# def check_token_ids(dataset, tokenizer):
#     """
#     Checks if any token IDs in the dataset are outside the tokenizer's vocabulary.
#     """
#     max_token_id = tokenizer.vocab_size -1 # Get the maximum valid token ID

#     for example in dataset:
#         input_ids = example["input_ids"]

#         # Check each token ID
#         for token_id in input_ids:
#             if token_id > max_token_id:
#                 print(f"Found token ID {token_id} which is out of range (max: {max_token_id})")


# # Check for out-of-range token IDs in the train and test datasets
# print("Checking train dataset...")
# check_token_ids(train_dataset, tokenizer)
# print("Checking test dataset...")
# check_token_ids(test_dataset, tokenizer)

## Inspect the dataset and tokenization

In [None]:
# example_text = "Sample input text"
# input_tokenized = tokenizer(example_text)
# input_ids = input_tokenized.input_ids.clone().detach().requires_grad_(True)

# print(input_tokenized)


In [None]:
# Resize the model's token embeddings to the tokenizer's current vocabulary
# size. Needed because the padding cell may have added '[PAD]' through
# tokenizer.add_special_tokens().
model.resize_token_embeddings(len(tokenizer))

Embedding(128257, 2048)

In [None]:
# Keep the model config's padding id in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# print(f"Target size: {labels.size()}")
# print(f"Input sizeL {input.size()}")

In [None]:
# Start fine-tuning with the trainer configured earlier in this notebook.
trainer.train()

  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss,Validation Loss,Mae,Mse
100,-3395.5037,-4397.234863,39.995243,2943.446777
200,-4493.9269,-5060.661133,42.138332,3728.595215
300,-5228.2494,-5989.945312,56.87569,5410.772461
400,-6300.81,-6858.495605,69.765732,7417.567871
500,-6871.5381,-7500.774902,79.695831,9254.560547
600,-8357.0238,-8548.150391,102.014618,13076.533203
700,-8708.3919,-9453.095703,119.885979,17037.552734
800,-10547.6513,-10423.616211,138.286865,21876.263672
900,-11462.0775,-11403.360352,156.127808,27351.470703
1000,-13202.7025,-12424.875977,175.413437,33886.09375


  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-1B-Instruct.
  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the

KeyboardInterrupt: 

## Save the fine-tuned LLM-BP regression

In [None]:
# Save the fine-tuned model and its tokenizer to Drive
colab_save_model_dir = "/content/drive/MyDrive/llm3_bp/llmbp_regression/fine_tuned_model_regression"
trainer.save_model(colab_save_model_dir)
tokenizer.save_pretrained(colab_save_model_dir)

# `config` is the plain dict parsed from config.yaml and has no
# save_pretrained(); the object to store next to the weights is the model's
# own configuration.
model.config.save_pretrained(colab_save_model_dir)

In [None]:
import os

# Directory that receives the fine-tuned model, the tokenizer files and the
# model configuration.
save_model_dir = "/content/drive/MyDrive/llm3_bp/llmbp_regression/fine_tuned_model/model"

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate check-then-create step.
os.makedirs(save_model_dir, exist_ok=True)

# Save the fine-tuned model through the trainer.
# NOTE(review): the files written depend on the save_safetensors setting and
# on the model being PEFT-wrapped — presumably adapter weights rather than a
# full pytorch_model.bin; confirm by listing the directory.
trainer.save_model(save_model_dir)

# Save the tokenizer
tokenizer.save_pretrained(save_model_dir)

# Save the fine-tuned model configuration
model.config.save_pretrained(save_model_dir)

print(f"Model, tokenizer, and configuration saved to {save_model_dir}")


## Evaluation of overall evaluate dataset

In [None]:
import os

import pandas as pd

# Evaluate the model on the evaluation dataset passed to the trainer
eval_results = trainer.evaluate()

# One-row DataFrame: one column per metric
eval_df = pd.DataFrame([eval_results])

save_path = '/content/drive/MyDrive/llm3_bp/llmbp_regression/evaluation_result/evaluation_results.csv'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save the result to a CSV file
eval_df.to_csv(save_path, index=False)

# Report the actual location that was written, then the metrics themselves
print(f"Evaluation results saved to {save_path}")
print(eval_results)


## Evaluation of each result of evaluate dataset

In [None]:
import os

import numpy as np
import pandas as pd

# Run a single prediction pass over the held-out split. It returns the raw
# predictions, the labels and the metrics together, so a separate
# trainer.evaluate() call is not needed. metric_key_prefix="eval" keeps the
# metric names used by trainer.evaluate().
predictions, labels, eval_results = trainer.predict(test_dataset, metric_key_prefix="eval")
eval_each_results = eval_results

# When the model returns several outputs the predictions arrive as a tuple;
# the logits are its first element.
if isinstance(predictions, tuple):
    predictions = predictions[0]

result_dir = '/content/drive/MyDrive/llm3_bp/llmbp_regression/evaluation_result'
# to_csv does not create missing directories.
os.makedirs(result_dir, exist_ok=True)

# Save the aggregate metrics of this pass (one row, one column per metric)
eval_each_df = pd.DataFrame([eval_each_results])
eval_each_save_path = os.path.join(result_dir, 'evaluation_each_results.csv')
eval_each_df.to_csv(eval_each_save_path, index=False)

# One row per (example, target) pair. 'Example' and 'Target' identify which
# input and which output (SBP or DBP) each prediction/label belongs to, which
# the flattened 'Predictions' / 'Labels' columns alone cannot tell.
labels = np.asarray(labels)
labels = labels.reshape(len(labels), -1)
predictions = np.asarray(predictions).reshape(len(labels), -1)
n_examples, n_targets = predictions.shape
target_names = ['SBP', 'DBP'] if n_targets == 2 else [f'target_{i}' for i in range(n_targets)]

pred_df = pd.DataFrame({
    'Example': np.repeat(np.arange(n_examples), n_targets),
    'Target': np.tile(target_names, n_examples),
    'Predictions': predictions.flatten(),
    'Labels': labels.flatten()
})

# Save the predictions and labels to a CSV file
pred_save_path = os.path.join(result_dir, 'predict_each_results.csv')
pred_df.to_csv(pred_save_path, index=False)

# Print the evaluation results and save paths
print(f"Evaluation results saved to {eval_each_save_path}")
print(eval_results)
print(f"Predictions and labels saved to {pred_save_path}")



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Reload the model, tokenizer and configuration from a saved directory.
# NOTE(review): this path differs from both directories written by the save
# cells (".../llmbp_regression/fine_tuned_model_regression" and
# ".../llmbp_regression/fine_tuned_model/model") — confirm which checkpoint is
# meant to be loaded here.
fine_tuned_model_dir = "/content/drive/MyDrive/llm3_bp/fine_tuned_model"
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_dir)
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_dir)
# NOTE(review): this rebinds `config`, which until here held the dict parsed
# from config.yaml; earlier cells that index config[...] will fail if re-run
# after this one.
config = AutoConfig.from_pretrained(fine_tuned_model_dir)

print(f"Fine-tuned model, tokenizer, and config loaded from {fine_tuned_model_dir}")
