In [None]:
!pip install -q transformers datasets openpyxl scikit-learn seaborn joblib pandas lifelines XlsxWriter

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from lifelines.utils import concordance_index
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    classification_report,
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback)
import datasets
from datasets import Dataset
import joblib
from joblib import Parallel, delayed
import gc
from scipy import stats

In [None]:
# Shared configuration read as globals by the cells below.
base_path = "/kaggle/input/nkr-iknl-breast-syntheticdata"  # directory holding the raw ';'-delimited CSV
base_output_path = "/kaggle/working"  # destination for generated CSVs, checkpoints and saved models
model_name = "emilyalsentzer/Bio_ClinicalBERT"  # checkpoint used for the tokenizer and both fine-tuned models
main_data_file = os.path.join(base_output_path, "main_data.csv")  # cache of the thresholded + downsampled data

In [None]:
# Tokenizer matching `model_name`; read as a global by the preprocessing functions below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
def apply_days_threshold(data, num_days):
    """Re-label outcomes relative to a follow-up horizon of `num_days`.

    Operates on a copy of `data`; the caller's frame is left untouched.

    * Rows with ``vit_stat == 0`` and ``vit_stat_int`` strictly below
      `num_days` are dropped.
    * Rows with ``vit_stat == 1`` and ``vit_stat_int`` strictly above
      `num_days` get ``vit_stat = 0`` and ``converted_from_dead = 1``.

    Every other row keeps its `vit_stat` and gets ``converted_from_dead = 0``.
    The original index labels of the surviving rows are preserved.

    Args:
        data: DataFrame with `vit_stat` and `vit_stat_int` columns.
        num_days: Horizon compared against `vit_stat_int`.

    Returns:
        The filtered / re-labelled copy with a `converted_from_dead` column.
    """
    frame = data.copy()
    frame['converted_from_dead'] = 0

    status, interval = frame['vit_stat'], frame['vit_stat_int']
    short_follow_up = (status == 0) & (interval < num_days)
    frame = frame[~short_follow_up]

    # Masks are recomputed on the filtered frame so they align with its index
    late_event = (frame['vit_stat'] == 1) & (frame['vit_stat_int'] > num_days)
    for column, new_value in (('vit_stat', 0), ('converted_from_dead', 1)):
        frame.loc[late_event, column] = new_value

    return frame

In [None]:
def create_record_id(data):
    """Return a frame that carries a `record_id` column identifying each row.

    * If `data` already has a `record_id` column, a new frame is returned in
      which that column is cast to ``str``.
    * Otherwise the index is reset to 0..n-1 and a `record_id` column holding
      those positions (int64) is inserted as the first column.

    The input frame is never modified. The id column is inserted explicitly
    rather than produced by ``reset_index().rename(columns={'index': ...})``,
    because that approach renames a pre-existing data column called ``index``
    (pandas then names the new column ``level_0``).

    Args:
        data: Input DataFrame.

    Returns:
        A new DataFrame with a `record_id` column.
    """
    if 'record_id' in data.columns:
        # assign() builds a new frame, so the caller's column keeps its dtype
        return data.assign(record_id=data['record_id'].astype(str))

    numbered = data.reset_index(drop=True)
    numbered.insert(0, 'record_id', np.arange(len(numbered), dtype='int64'))
    return numbered

In [None]:
# (column name, human-readable description) pairs. The prompt builders
# (generate_classification_prompt / generate_regression_prompt) iterate this
# list in order and emit one "- <description>: <value>" line per entry, skipping
# columns that are missing from the DataFrame or null for the row.
feature_list = [
        ('leeft', 'Age at incidence date'),
        ('gesl', 'Gender'),
        ('incjr', 'Year of incidence'),
        ('tumsoort', 'Tumor species'),
        ('diag_basis', 'Basis for diagnosis'),
        ('topo', 'Topography excluding sub-localisation'),
        ('topo_sublok', 'Topography including sub-localisation'),
        ('later', 'Lateralisation'),
        ('morf', 'Morphology'),
        ('gedrag', 'Tumor behavior'),
        ('diffgrad', 'Level of differentiation'),
        ('ct', 'Clinical T (TNM)'),
        ('cn', 'Clinical N (TNM)'),
        ('cm', 'Clinical M (TNM)'),
        ('pt', 'Pathological T (TNM)'),
        ('pn', 'Pathological N (TNM)'),
        ('pm', 'Pathological M (TNM)'),
        ('stadium', 'Tumor stage (pTNM and cTNM)'),
        ('cstadium', 'Clinical stage'),
        ('pstadium', 'Pathological stage'),
        ('ond_lymf', 'Number of regional lymph nodes examined'),
        ('pos_lymf', 'Number of positive regional lymph nodes'),
        ('er_stat', 'Oestrogen receptor status'),
        ('pr_stat', 'Progesterone receptor status'),
        ('her2_stat', 'HER2 status'),
        ('dcis_comp', 'DCIS component present'),
        ('multifoc', 'Multicentric/multifocal presence'),
        ('tum_afm', 'Tumor size (mm)'),
        ('swk', 'Sentinel node status'),
        ('swk_uitslag', 'Sentinel node procedure result'),
        ('mari', 'MARI procedure status'),
        ('mari_uitslag', 'MARI procedure result'),
        # NOTE(review): 'okd' may denote axillary (not cerebral) lymph node
        # dissection — confirm against the dataset's data dictionary.
        ('okd', 'Cerebral node dissection'),
        ('org_chir', 'Surgery performed'),
        ('uitgebr_chir_code', 'Expanded surgery code'),
        ('dir_reconstr', 'Direct reconstruction'),
        ('chemo', 'Chemotherapy'),
        ('target', 'Targeted therapy'),
        ('horm', 'Hormonal therapy'),
        ('rt', 'Radiotherapy'),
        ('meta_rt', 'Metastatic radiotherapy'),
        ('meta_chir', 'Metastatic surgery'),
    ]

In [None]:
def generate_classification_prompt(data):
    """Add a `classification_prompt` text column to `data` (modified in place).

    Each prompt starts with "Patient Record:", lists one
    "- <description>: <value>" line for every `feature_list` column that exists
    in `data` and is non-null for the row, and ends with the classification
    instruction sentence. Lines are joined with newlines.

    Args:
        data: DataFrame of patient records; receives the new column.

    Returns:
        The same DataFrame object, now carrying `classification_prompt`.
    """
    header = "Patient Record:"
    instruction = "Based on this medical record, determine the patient's vital status and the time interval since their last examination."

    # Only features that are actual columns can contribute lines
    present_features = [(col, description) for col, description in feature_list if col in data.columns]

    def create_classification_prompt(row):
        feature_lines = [
            f"- {description}: {row[col]}"
            for col, description in present_features
            if pd.notnull(row[col])  # null values are left out of the prompt
        ]
        return '\n'.join([header, *feature_lines, instruction])

    data['classification_prompt'] = data.apply(create_classification_prompt, axis=1)
    return data

In [None]:
def generate_regression_prompt(data):
    """Add a `regression_prompt` text column to `data` (modified in place).

    Each prompt starts with "Patient Record:", lists one
    "- <description>: <value>" line for every `feature_list` column that exists
    in `data` and is non-null for the row, and ends with the regression
    instruction sentence. Lines are joined with newlines.

    Args:
        data: DataFrame of patient records; receives the new column.

    Returns:
        The same DataFrame object, now carrying `regression_prompt`.
    """
    header = "Patient Record:"
    instruction = "Based on this medical record, predict the patient's time interval since their last examination in days."

    # Only features that are actual columns can contribute lines
    present_features = [(col, description) for col, description in feature_list if col in data.columns]

    def create_regression_prompt(row):
        feature_lines = [
            f"- {description}: {row[col]}"
            for col, description in present_features
            if pd.notnull(row[col])  # null values are left out of the prompt
        ]
        return '\n'.join([header, *feature_lines, instruction])

    data['regression_prompt'] = data.apply(create_regression_prompt, axis=1)
    return data

In [None]:
def tokenize_classification_data(data, tokenizer, max_length=512):
    """Tokenize the `classification_prompt` entries of `data`.

    The prompts are handed to `tokenizer` exactly as stored under
    ``data['classification_prompt']``; every sequence is truncated and padded
    to `max_length`.

    Args:
        data: Mapping or batch exposing a `classification_prompt` entry.
        tokenizer: Callable tokenizer.
        max_length: Fixed sequence length after padding / truncation.

    Returns:
        Whatever `tokenizer` returns for the prompts.
    """
    prompts = data['classification_prompt']
    return tokenizer(prompts, padding='max_length', truncation=True, max_length=max_length)

In [None]:
def tokenize_regression_data(data, tokenizer, max_length=512):
    """Tokenize the `regression_prompt` column of `data`.

    The column is converted to a plain Python list before being passed to
    `tokenizer`; every sequence is truncated and padded to `max_length`.

    Args:
        data: Object whose ``['regression_prompt']`` supports ``.tolist()``
            (e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer.
        max_length: Fixed sequence length after padding / truncation.

    Returns:
        Whatever `tokenizer` returns for the prompts.
    """
    prompts = data['regression_prompt'].tolist()
    return tokenizer(prompts, padding='max_length', truncation=True, max_length=max_length)

In [None]:
def downsample_class_0(
    data,
    target_column='vit_stat',
    majority_class=0,
    minority_class=1,
    priority_column='converted_from_dead',
    random_state=42
):
    """Balance `data` by removing majority-class rows, priority rows first.

    Majority rows are removed until the majority class is as large as the
    minority class. Rows flagged ``priority_column == 1`` are removed before any
    other majority row; the remainder is sampled from the other majority rows.
    Every majority row that is not flagged 1 (including rows whose flag is
    missing) is treated as non-priority, so no row is lost by accident.

    Args:
        data: Input DataFrame.
        target_column: Column holding the class label.
        majority_class: Label of the class to shrink.
        minority_class: Label of the class whose size is the target.
        priority_column: Column whose value 1 marks rows to remove first.
        random_state: Seed for row sampling and for the final shuffle.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index holding the retained
        majority rows and all minority rows. If the majority class is not
        larger than the minority class, an unmodified copy of `data` is
        returned instead.
    """
    majority_class_data = data[data[target_column] == majority_class]
    minority_class_data = data[data[target_column] == minority_class]

    num_to_remove = len(majority_class_data) - len(minority_class_data)
    if num_to_remove <= 0:
        print("No downsampling needed. The majority class is already balanced or smaller than the minority class.")
        return data.copy()

    # Comparing with 1 yields False for NaN, so unflagged rows land in the non-priority pool
    is_priority = majority_class_data[priority_column] == 1
    priority_data = majority_class_data[is_priority]
    non_priority_data = majority_class_data[~is_priority]

    num_priority = len(priority_data)
    num_priority_removed = min(num_to_remove, num_priority)
    num_non_priority_removed = num_to_remove - num_priority_removed

    if num_non_priority_removed == 0:
        # The priority pool alone covers the removal quota
        removed = priority_data.sample(n=num_priority_removed, random_state=random_state)
        data_to_retain = [priority_data.drop(removed.index), non_priority_data]
        print(f"Removed {num_to_remove} priority records (`{priority_column} == 1`) from the majority class.")
    else:
        # Every priority row goes; the rest of the quota comes from the other rows
        removed = non_priority_data.sample(n=num_non_priority_removed, random_state=random_state)
        data_to_retain = [non_priority_data.drop(removed.index)]
        print(f"Removed all {num_priority} priority records (`{priority_column} == 1`) and {num_non_priority_removed} non-priority records (`{priority_column} == 0`) from the majority class.")

    majority_downsampled = pd.concat(data_to_retain)
    downsampled_data = (
        pd.concat([majority_downsampled, minority_class_data])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    print(f"\nOriginal class distribution:\n{data[target_column].value_counts()}")
    print(f"\nNew class distribution after downsampling:\n{downsampled_data[target_column].value_counts()}")
    print(f"\nNumber of priority records removed: {num_priority_removed}")
    if num_non_priority_removed > 0:
        print(f"Number of non-priority records removed: {num_non_priority_removed}")

    return downsampled_data

In [None]:
def load_and_preprocess_classification_data(num_days, num_folds=5, max_length=256):
    """Build (or reload) the classification data and tokenize the CV folds.

    Steps:
      1. Load the cached `main_data_file` if present; otherwise read the raw
         CSV, apply the day threshold, downsample, shuffle, add record ids and
         write the cache. The raw CSV is only read when the cache is missing.
      2. Add classification prompts and carve off a stratified 20% hold-out
         test set (saved as ``classification_test.csv``).
      3. Reuse the per-fold train/eval CSVs when all of them exist; otherwise
         regenerate every fold with a stratified K-fold split and save them.
      4. Tokenize each fold into a torch-formatted Hugging Face Dataset whose
         `vit_stat` column is renamed to `labels`.

    Args:
        num_days: Threshold forwarded to `apply_days_threshold` (only used
            when the cache has to be built).
        num_folds: Number of cross-validation folds.
        max_length: Token length used for padding / truncation.

    Returns:
        Tuple ``(tokenized_train_folds, tokenized_eval_folds, test_data)`` where
        the first two are dicts keyed by 1-based fold number and `test_data` is
        the hold-out DataFrame (including its `classification_prompt` column).
    """
    if os.path.exists(main_data_file):
        # NOTE(review): the cache (and the fold CSVs below) are keyed by file
        # name only; delete them when `num_days` or `num_folds` changes.
        main_data = pd.read_csv(main_data_file)
    else:
        data_path = os.path.join(base_path, "NKR_IKNL_breast_syntheticdata.csv")
        main_data = pd.read_csv(data_path, delimiter=";")
        main_data = apply_days_threshold(main_data, num_days)
        main_data = downsample_class_0(main_data)
        main_data = main_data.sample(frac=1, random_state=42).reset_index(drop=True)
        main_data = create_record_id(main_data)
        main_data.to_csv(main_data_file, index=False)

    main_data = generate_classification_prompt(main_data)

    train_data, test_data = train_test_split(
        main_data,
        test_size=0.2,
        stratify=main_data['vit_stat'],
        random_state=42
    )

    test_file = os.path.join(base_output_path, "classification_test.csv")
    test_data.to_csv(test_file, index=False)
    print(f"Hold-out test set saved with shape: {test_data.shape}")

    train_data_folds = {}
    eval_data_folds = {}
    all_folds_exist = True

    for fold in range(1, num_folds + 1):
        train_file = os.path.join(base_output_path, f"classification_train_{fold}.csv")
        eval_file = os.path.join(base_output_path, f"classification_eval_{fold}.csv")

        if os.path.exists(train_file) and os.path.exists(eval_file):
            train_data_fold = pd.read_csv(train_file)
            eval_data_fold = pd.read_csv(eval_file)
            train_data_folds[fold] = train_data_fold
            eval_data_folds[fold] = eval_data_fold
            print(f"Loaded existing data for fold {fold} - Train: {train_data_fold.shape}, Eval: {eval_data_fold.shape}")
        else:
            all_folds_exist = False
            print(f"Files for fold {fold} are missing. Need to generate all folds.")
            break

    if not all_folds_exist:
        # Discard any partially loaded folds so every fold comes from one split
        train_data_folds.clear()
        eval_data_folds.clear()

        print(f"Splitting main training data into {num_folds} folds.")
        skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
        for fold_index, (train_idx, eval_idx) in enumerate(skf.split(train_data, train_data['vit_stat']), start=1):
            train_data_split = train_data.iloc[train_idx].reset_index(drop=True)
            eval_data_split = train_data.iloc[eval_idx].reset_index(drop=True)

            train_split_file = os.path.join(base_output_path, f"classification_train_{fold_index}.csv")
            eval_split_file = os.path.join(base_output_path, f"classification_eval_{fold_index}.csv")
            train_data_split.to_csv(train_split_file, index=False)
            eval_data_split.to_csv(eval_split_file, index=False)

            train_data_folds[fold_index] = train_data_split
            eval_data_folds[fold_index] = eval_data_split

        print(f"Cross-validation data saved for all {num_folds} folds.")

    def _tokenize_fold(frame):
        # Convert one fold DataFrame into a torch-formatted, tokenized Dataset
        tokenized = Dataset.from_pandas(frame).map(
            lambda x: tokenize_classification_data(x, tokenizer, max_length=max_length),
            batched=True,
            batch_size=1000,
            num_proc=4,
            remove_columns=['classification_prompt']
        )
        tokenized = tokenized.rename_column("vit_stat", "labels")
        tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        return tokenized

    tokenized_train_folds = {}
    tokenized_eval_folds = {}

    for fold_index in train_data_folds.keys():
        print(f"Tokenizing fold {fold_index} - Training data")
        tokenized_train_folds[fold_index] = _tokenize_fold(train_data_folds[fold_index])

        print(f"Tokenizing fold {fold_index} - Evaluation data")
        tokenized_eval_folds[fold_index] = _tokenize_fold(eval_data_folds[fold_index])

    return tokenized_train_folds, tokenized_eval_folds, test_data

In [None]:
# Build the tokenized CV folds and the hold-out test frame using a 1825-day
# (5 x 365) threshold; num_folds and max_length keep the function's defaults.
train_classification_data_folds, eval_classification_data_folds, test_data = load_and_preprocess_classification_data(1825)

In [None]:
def compute_metrics_classifier(p):
    """Compute the classification metrics reported during evaluation.

    Args:
        p: Prediction object exposing `predictions` (logits with one column
            per class) and `label_ids`.

    Returns:
        Dict with accuracy, Brier score, weighted precision / recall / F1,
        ROC-AUC, PR-AUC and `combined_score`
        (0.5 * F1 + 0.3 * ROC-AUC + 0.2 * PR-AUC). ROC-AUC and PR-AUC are NaN
        when `label_ids` holds a single class, which makes `combined_score`
        NaN as well.
    """
    logits = p.predictions
    labels = p.label_ids

    preds = np.argmax(logits, axis=1)
    # Probability of class 1 from the softmax over the logits
    pos_probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]

    weighted = {'average': 'weighted', 'zero_division': 0}
    f1 = f1_score(labels, preds, **weighted)

    if len(np.unique(labels)) > 1:
        roc_auc = roc_auc_score(labels, pos_probs, average='weighted')
        pr_auc = average_precision_score(labels, pos_probs, average='weighted')
    else:
        # The ranking metrics are undefined when only one class is present
        roc_auc = float('nan')
        pr_auc = float('nan')

    metrics = {
        'accuracy': accuracy_score(labels, preds),
        'brier_score': brier_score_loss(labels, pos_probs),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
        'f1_score': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
    }
    metrics['combined_score'] = (f1 * 0.5) + (roc_auc * 0.3) + (pr_auc * 0.2)
    return metrics

In [None]:
def fine_tune_classifier(tokenized_train_folds, tokenized_eval_folds, output_dir, num_labels=2, epochs=5):
    """Fine-tune one classifier per fold and keep the best-scoring one.

    If ``<output_dir>/best_model`` already holds files, that model is loaded
    and returned without training. Otherwise a fresh `model_name` classifier
    is trained on every fold (early stopping, best checkpoint by
    `combined_score`), and the fold with the highest `eval_combined_score` is
    saved to ``<output_dir>/best_model``.

    Args:
        tokenized_train_folds: Dict mapping fold number to training Dataset.
        tokenized_eval_folds: Dict mapping fold number to evaluation Dataset.
        output_dir: Root directory for per-fold checkpoints and the best model.
        num_labels: Number of output classes.
        epochs: Maximum number of training epochs per fold.

    Returns:
        The best model, reloaded from ``<output_dir>/best_model``.

    Raises:
        ValueError: If no fold produced a model (e.g. empty fold dicts).
    """
    # TODO (carried over from the original note): change max length from 256 to 512
    initial_best_model_path = os.path.join(output_dir, "best_model")

    if os.path.exists(initial_best_model_path) and os.listdir(initial_best_model_path):
        print(f"Found existing model at {initial_best_model_path}. Loading the model instead of retraining.")
        best_model = AutoModelForSequenceClassification.from_pretrained(initial_best_model_path)
        return best_model

    best_combined_score = float('-inf')
    best_model_dir = None

    os.makedirs(output_dir, exist_ok=True)

    for fold_index, train_dataset in tokenized_train_folds.items():
        eval_dataset = tokenized_eval_folds[fold_index]

        print(f"\nStarting training for Fold {fold_index}")

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        ).to(device)

        fold_output_dir = os.path.join(output_dir, f"fold_{fold_index}")
        os.makedirs(fold_output_dir, exist_ok=True)

        training_args = TrainingArguments(
            output_dir=fold_output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_strategy='no',
            logging_steps=0,
            logging_dir=None,
            report_to="none",
            fp16=torch.cuda.is_available(),
            dataloader_num_workers=4,
            load_best_model_at_end=True,
            metric_for_best_model="combined_score",
            save_total_limit=1,
            gradient_accumulation_steps=2,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics_classifier,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()
        metrics = trainer.evaluate()
        print(f"Fold {fold_index} - Evaluation Metrics:", metrics)

        if metrics["eval_combined_score"] > best_combined_score:
            best_combined_score = metrics["eval_combined_score"]
            best_model_dir = fold_output_dir
            # Overwrites the previous best so only the winner remains on disk
            trainer.save_model(initial_best_model_path)
            print(f"New best model found in fold {fold_index} with combined score: {best_combined_score}")

        # Release this fold's model and optimizer state before the next fold
        # allocates a fresh model; otherwise memory accumulates across folds.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_model_dir is not None:
        print(f"\nLoading the best model from {initial_best_model_path}")
        best_model = AutoModelForSequenceClassification.from_pretrained(initial_best_model_path)
        return best_model
    else:
        raise ValueError("No model was trained. Please check your training data and configuration.")

In [None]:
# Train (or reload from <base_output_path>/best_model) the binary vital-status
# classifier using the folds prepared above.
best_model = fine_tune_classifier(
    tokenized_train_folds=train_classification_data_folds,
    tokenized_eval_folds=eval_classification_data_folds,
    output_dir=base_output_path,
    num_labels=2,
    epochs=5
)

In [None]:
def predict_vit_stat(model, tokenizer, data, max_length = 512):
    """Predict a class label for every `classification_prompt` in `data`.

    Args:
        model: Sequence-classification model used for inference.
        tokenizer: Tokenizer applied to the prompts (truncation on, dynamic
            padding, capped at `max_length`).
        data: Object whose ``['classification_prompt']`` is iterable over the
            prompt strings.
        max_length: Maximum token length.

    Returns:
        NumPy array holding the argmax class index per prompt.
    """
    prompts = [prompt for prompt in data['classification_prompt']]
    encodings = tokenizer(prompts, truncation=True, padding=True, max_length=max_length)
    inference_dataset = Dataset.from_dict(dict(encodings))

    inference_args = TrainingArguments(output_dir="./results", report_to="none")
    runner = Trainer(model=model, args=inference_args)

    output = runner.predict(inference_dataset)
    return np.argmax(output.predictions, axis=1)

In [None]:
# Classify the hold-out set. max_length = 256 matches the default max_length
# that load_and_preprocess_classification_data used to tokenize the folds.
predicted_labels = predict_vit_stat(best_model, tokenizer, test_data, max_length = 256)

# Store the predictions next to the true labels; `test_data` is modified in place
# and the new column is read by the regression stage further down.
test_data['predicted_vit_stat'] = predicted_labels
print(test_data[['classification_prompt', 'vit_stat', 'predicted_vit_stat']])

In [None]:
def save_classification_metrics_to_excel(
    true_labels,
    pred_labels,
    train_classification_data_folds,
    eval_classification_data_folds,
    test_data,
    filename="classification_results.xlsx"
):
    """Write classification metrics and the underlying data to an Excel workbook.

    Sheets written, in order: "Confusion Matrix", "Overall Metrics" (accuracy
    and weighted precision / recall / F1), "Classification Report", one
    "Train Fold <n>" and one "Eval Fold <n>" sheet per fold, and "Test Data".
    Token and prompt columns are removed from every data sheet. A failure to
    write a fold or the test sheet is printed and does not abort the export.

    Args:
        true_labels: Ground-truth labels.
        pred_labels: Predicted labels.
        train_classification_data_folds: Dict of fold number -> Dataset/DataFrame.
        eval_classification_data_folds: Dict of fold number -> Dataset/DataFrame.
        test_data: Hold-out data as a DataFrame or Hugging Face Dataset.
        filename: Destination workbook path.

    Raises:
        TypeError: If `test_data` is neither a DataFrame nor a Dataset.
    """
    columns_to_exclude = ['input_ids', 'token_type_ids', 'attention_mask', 'classification_prompt']

    def strip_excluded(frame):
        # Remove token / prompt columns that are present in the frame
        present = [col for col in columns_to_exclude if col in frame.columns]
        return frame.drop(columns=present)

    if isinstance(test_data, Dataset):
        test_df = test_data.to_pandas()
    elif isinstance(test_data, pd.DataFrame):
        test_df = test_data.copy()
    else:
        raise TypeError("test_data must be a pandas DataFrame or a Hugging Face Dataset.")
    test_df = strip_excluded(test_df)

    cm = confusion_matrix(true_labels, pred_labels)
    class_ids = range(len(cm))
    cm_df = pd.DataFrame(
        cm,
        index=[f"Actual_{i}" for i in class_ids],
        columns=[f"Predicted_{i}" for i in class_ids]
    )

    weighted = {'average': 'weighted', 'zero_division': 0}
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, **weighted)
    recall = recall_score(true_labels, pred_labels, **weighted)
    f1 = f1_score(true_labels, pred_labels, **weighted)

    report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
    classification_rep_df = pd.DataFrame(report).transpose()

    fold_groups = (
        ("Train", train_classification_data_folds),
        ("Eval", eval_classification_data_folds),
    )

    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        cm_df.to_excel(writer, sheet_name="Confusion Matrix", index=True)

        metrics_df = pd.DataFrame({
            "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
            "Value": [accuracy, precision, recall, f1]
        })
        metrics_df.to_excel(writer, sheet_name="Overall Metrics", index=False)

        classification_rep_df.to_excel(writer, sheet_name="Classification Report")

        for fold_type, folds_dict in fold_groups:
            for fold_num, fold in folds_dict.items():
                sheet_name = f"{fold_type} Fold {fold_num}"

                if isinstance(fold, Dataset):
                    fold_df = fold.to_pandas()
                elif isinstance(fold, pd.DataFrame):
                    fold_df = fold.copy()
                else:
                    print(f"Skipping {sheet_name}: Unsupported data type {type(fold)}.")
                    continue

                fold_df = strip_excluded(fold_df)

                # Best effort: one failing sheet must not abort the export
                try:
                    fold_df.to_excel(writer, sheet_name=sheet_name, index=False)
                    print(f"Saved {sheet_name}")
                except Exception as e:
                    print(f"Error saving {sheet_name}: {e}")

        try:
            test_df.to_excel(writer, sheet_name="Test Data", index=False)
            print("Saved Test Data")
        except Exception as e:
            print(f"Error saving Test Data: {e}")

    print(f"All metrics and data have been saved to '{filename}'.")

In [None]:
# Export the hold-out classification results together with the fold data.
true_labels = test_data['vit_stat']
pred_labels = test_data['predicted_vit_stat']
save_classification_metrics_to_excel(true_labels, pred_labels, train_classification_data_folds, eval_classification_data_folds, test_data)

In [None]:
# Reload the cached main data for the regression stage; stays an empty
# DataFrame when the cache file does not exist.
processed_main_data = pd.DataFrame()
if(os.path.exists(main_data_file)):
    processed_main_data = pd.read_csv(main_data_file)

In [None]:
def preprocess_regression_data(data, test_data_with_predictions):
    """Build the regression train set and two regression test sets.

    * Train: rows of `data` with ``vit_stat == 1`` whose `record_id` does not
      appear in the regression test set.
    * Test: rows of `test_data_with_predictions` with
      ``predicted_vit_stat == 1``.
    * Test (real 1s): the subset of the test set where ``vit_stat == 1``.

    The three frames are saved under `base_output_path` as
    ``train_data_regression.csv``, ``test_data_regression.csv`` and
    ``test_data_regression_real1s.csv``, then given regression prompts,
    tokenized and wrapped as torch-formatted Hugging Face Datasets whose
    `labels` are `vit_stat_int` cast to float.

    Args:
        data: Full processed DataFrame (needs `vit_stat`, `vit_stat_int`,
            `record_id` and the feature columns).
        test_data_with_predictions: Hold-out DataFrame carrying
            `predicted_vit_stat`.

    Returns:
        Tuple ``(train_dataset, test_dataset, test_dataset_real1s)``.
    """
    train_data_regression = data[data['vit_stat'] == 1].copy()
    test_data_regression = test_data_with_predictions[test_data_with_predictions['predicted_vit_stat'] == 1].copy()

    if 'classification_prompt' in test_data_regression.columns:
        test_data_regression = test_data_regression.drop(columns=['classification_prompt'])

    # Keep every regression-test record out of the training data
    test_record_ids = test_data_regression['record_id'].unique()
    train_data_regression = train_data_regression[~train_data_regression['record_id'].isin(test_record_ids)].copy()

    # Second test set: predicted 1s whose true label is also 1
    test_data_regression_real1s = test_data_regression[test_data_regression['vit_stat'] == 1].copy()

    # The training frame gets its own file; it was previously written to
    # "test_data_regression.csv" and immediately overwritten by the test frame.
    train_data_regression.to_csv(os.path.join(base_output_path, "train_data_regression.csv"), index=False)
    test_data_regression.to_csv(os.path.join(base_output_path, "test_data_regression.csv"), index=False)
    test_data_regression_real1s.to_csv(os.path.join(base_output_path, "test_data_regression_real1s.csv"), index=False)

    def _to_regression_dataset(frame):
        # Prompt -> tokens -> torch-formatted Dataset for one frame
        frame = generate_regression_prompt(frame)
        encodings = tokenize_regression_data(frame, tokenizer)
        dataset = datasets.Dataset.from_dict({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': frame['vit_stat_int'].astype(float).tolist(),
            'record_id': frame['record_id'].tolist()
        })
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        return dataset

    train_dataset = _to_regression_dataset(train_data_regression)
    test_dataset = _to_regression_dataset(test_data_regression)
    test_dataset_real1s = _to_regression_dataset(test_data_regression_real1s)

    print(f"Training set size: {train_dataset.shape[0]}")
    print(f"Test set size (predicted 1s): {test_dataset.shape[0]}")
    print(f"Test set size (true positives): {test_dataset_real1s.shape[0]}")

    return train_dataset, test_dataset, test_dataset_real1s

In [None]:
# Build the regression datasets from the cached main data and the hold-out
# frame that now carries `predicted_vit_stat`.
train_data_regression, test_data_regression, test_data_regression_real1s = preprocess_regression_data(processed_main_data, test_data)

In [None]:
def compute_metrics_regression(p):
    """Compute MSE, MAE, RMSE and R2 for a regression evaluation pass.

    Predictions and labels are flattened to 1-D with ``reshape(-1)``. Unlike
    ``squeeze()``, this keeps a length-1 array for a single-sample batch
    instead of collapsing it to a 0-d array that the metric functions reject.

    Args:
        p: Prediction object exposing `predictions` and `label_ids`.

    Returns:
        Dict with keys 'mse', 'mae', 'rmse' and 'r2'.
    """
    preds = np.asarray(p.predictions).reshape(-1)
    labels = np.asarray(p.label_ids).reshape(-1)

    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, preds)

    return {
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2
    }

In [None]:
def fine_tune_regressor(train_dataset, eval_dataset, output_dir, epochs=5):
    """Fine-tune (or reload) the single-output regression model.

    If ``<output_dir>/best_regression_model`` already holds files, that model
    is loaded and returned without training. Otherwise `model_name` is
    fine-tuned with a one-unit regression head, evaluated every epoch with the
    best checkpoint chosen by lowest RMSE (early-stopping patience 2), and
    saved to that directory.

    Args:
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for per-epoch evaluation.
        output_dir: Directory for checkpoints and the saved model.
        epochs: Maximum number of training epochs.

    Returns:
        The loaded or freshly trained model.
    """
    model_save_path = os.path.join(output_dir, "best_regression_model")

    already_trained = os.path.exists(model_save_path) and os.listdir(model_save_path)
    if already_trained:
        print(f"Found existing regression model at {model_save_path}. Loading the model instead of retraining.")
        return AutoModelForSequenceClassification.from_pretrained(model_save_path)

    os.makedirs(output_dir, exist_ok=True)

    regressor = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
        problem_type="regression"
    )
    model = regressor.to(device)

    trainer_settings = dict(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_strategy='epoch',
        logging_steps=0,
        logging_dir=None,
        report_to="none",
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=4,
        load_best_model_at_end=True,
        metric_for_best_model="rmse",
        greater_is_better=False,  # lower RMSE is better
        save_total_limit=1,
        gradient_accumulation_steps=2,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**trainer_settings),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_regression,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    print("Evaluation Metrics:", trainer.evaluate())

    # Persist the model so later runs take the reload path above
    trainer.save_model(model_save_path)
    print(f"Regression model saved at {model_save_path}")

    return model

In [None]:
# Hold out 20% of the regression training set for per-epoch evaluation.
# The split is bound to `regression_splits` rather than `train_test_split`:
# reusing that name would shadow sklearn's `train_test_split` (imported at the
# top of the notebook and called by load_and_preprocess_classification_data),
# so re-running the classification cells would fail.
regression_splits = train_data_regression.train_test_split(test_size=0.2, seed=42)
split_train_data_regression = regression_splits['train']
eval_data = regression_splits['test']

In [None]:
# Train (or reload from <base_output_path>/best_regression_model) the regressor
# on the 80/20 split created above.
best_regressor_model = fine_tune_regressor(
    train_dataset=split_train_data_regression,
    eval_dataset=eval_data,
    output_dir=base_output_path,
    epochs=5
)

In [None]:
def predict_vit_stat_int(model, data):
    """Predict the regression target for every row of a tokenized Dataset.

    A `labels` column, if present, is removed from the copy used for
    inference (presumably so that prediction does not depend on labels —
    confirm). The caller's dataset object is never re-formatted.

    Args:
        model: Fine-tuned single-output regression model.
        data: Hugging Face Dataset with `input_ids` and `attention_mask`.

    Returns:
        1-D NumPy array with one predicted value per row. The result is
        1-D even for a single-row dataset.
    """
    if 'labels' in data.column_names:
        data_for_prediction = data.remove_columns(['labels'])
    else:
        data_for_prediction = data

    # with_format returns a new Dataset object, so `data` keeps its own format
    data_for_prediction = data_for_prediction.with_format(
        type='torch', columns=['input_ids', 'attention_mask']
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            per_device_eval_batch_size=16,
            report_to="none",
            dataloader_num_workers=4
        )
    )

    predictions = trainer.predict(data_for_prediction)

    # Flatten with reshape(-1) rather than squeeze(): squeeze() would turn a
    # single-row result into a 0-d array.
    pred_values = np.asarray(predictions.predictions).reshape(-1)

    return pred_values

In [None]:
# --- Regression evaluation on the two test sets ---------------------------
# Predict on test_data_regression (hold-out rows the classifier labelled 1).
predicted_vit_stat_int_test = predict_vit_stat_int(
    model=best_regressor_model,
    data=test_data_regression
)

# Evaluate performance
true_vit_stat_int = np.array(test_data_regression['labels'])
predicted_vit_stat_int = predicted_vit_stat_int_test

mse = mean_squared_error(true_vit_stat_int, predicted_vit_stat_int)
mae = mean_absolute_error(true_vit_stat_int, predicted_vit_stat_int)
rmse = np.sqrt(mse)
r2 = r2_score(true_vit_stat_int, predicted_vit_stat_int)

print("Performance on test_data_regression:")
print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")

# Predict on test_data_regression_real1s (predicted 1s whose true vit_stat is 1)
predicted_vit_stat_int_real1s = predict_vit_stat_int(
    model=best_regressor_model,
    data=test_data_regression_real1s
)

# Evaluate performance
# Note: `predicted_vit_stat_int` is rebound here to the real-1s predictions.
true_vit_stat_int_real1s = np.array(test_data_regression_real1s['labels'])
predicted_vit_stat_int = predicted_vit_stat_int_real1s

mse_real1s = mean_squared_error(true_vit_stat_int_real1s, predicted_vit_stat_int)
mae_real1s = mean_absolute_error(true_vit_stat_int_real1s, predicted_vit_stat_int)
rmse_real1s = np.sqrt(mse_real1s)
r2_real1s = r2_score(true_vit_stat_int_real1s, predicted_vit_stat_int)

print("\nPerformance on test_data_regression_real1s (True Positives):")
print(f"MSE: {mse_real1s:.2f}")
print(f"MAE: {mae_real1s:.2f}")
print(f"RMSE: {rmse_real1s:.2f}")
print(f"R2 Score: {r2_real1s:.4f}")

In [None]:
def save_regression_metrics_to_excel(
    train_dataset,
    eval_dataset,
    test_dataset,
    test_dataset_real1s,
    predicted_vit_stat_int_test,
    predicted_vit_stat_int_real1s,
    filename="regression_results.xlsx"
):
    """Write regression metrics and the underlying data to an Excel workbook.

    Reads ``train_data_regression.csv``, ``test_data_regression.csv`` and
    ``test_data_regression_real1s.csv`` from `base_output_path` (they must
    exist beforehand) and writes five sheets: "Regression Metrics",
    "Train Data", "Eval Data", "Test Data Predicted 1s" and
    "Test Data True Positives". Predictions are joined to the test frames on
    `record_id`. The metrics sheet keeps its "Dataset", "RMSE" and "R2 Score"
    columns and additionally reports "MSE" and "MAE".

    Args:
        train_dataset: Unused; kept so existing callers keep working.
        eval_dataset: Evaluation Dataset (needs a `record_id` column).
        test_dataset: Test Dataset of predicted 1s (`labels`, `record_id`).
        test_dataset_real1s: Test Dataset of true positives.
        predicted_vit_stat_int_test: Predictions aligned with `test_dataset`.
        predicted_vit_stat_int_real1s: Predictions aligned with
            `test_dataset_real1s`.
        filename: Destination workbook path.
    """
    def _regression_scores(dataset, predictions):
        # Metrics for one (dataset, predictions) pair; reshape keeps 1-D input
        truth = np.array(dataset['labels']).reshape(-1)
        predictions = np.array(predictions).reshape(-1)
        mse = mean_squared_error(truth, predictions)
        return {
            "MSE": mse,
            "MAE": mean_absolute_error(truth, predictions),
            "RMSE": np.sqrt(mse),
            "R2 Score": r2_score(truth, predictions),
        }

    def _attach_predictions(frame, dataset, predictions):
        # Left-join the predictions onto the saved frame via integer record_id
        frame['record_id'] = frame['record_id'].astype(int)
        pred_df = pd.DataFrame({
            'record_id': [int(rid) for rid in dataset['record_id']],
            'predicted_vit_stat_int': np.array(predictions).reshape(-1)
        })
        return pd.merge(frame, pred_df, on='record_id', how='left')

    scores_test = _regression_scores(test_dataset, predicted_vit_stat_int_test)
    scores_real1s = _regression_scores(test_dataset_real1s, predicted_vit_stat_int_real1s)

    # Existing columns come first; MSE / MAE are appended after them
    metric_columns = ["RMSE", "R2 Score", "MSE", "MAE"]
    metrics_df = pd.DataFrame({
        "Dataset": ["Test Data (Predicted 1s)", "Test Data (True Positives)"],
        **{name: [scores_test[name], scores_real1s[name]] for name in metric_columns}
    })

    # Load the frames saved during regression preprocessing
    train_df = pd.read_csv(os.path.join(base_output_path, "train_data_regression.csv"))
    test_df = pd.read_csv(os.path.join(base_output_path, "test_data_regression.csv"))
    test_real1s_df = pd.read_csv(os.path.join(base_output_path, "test_data_regression_real1s.csv"))

    # Recover the full feature rows of the evaluation split via record_id
    eval_df = eval_dataset.to_pandas()
    eval_df['record_id'] = eval_df['record_id'].astype(int)
    train_df['record_id'] = train_df['record_id'].astype(int)
    eval_full_df = pd.merge(eval_df, train_df, on='record_id', how='left')

    test_df = _attach_predictions(test_df, test_dataset, predicted_vit_stat_int_test)
    test_real1s_df = _attach_predictions(test_real1s_df, test_dataset_real1s, predicted_vit_stat_int_real1s)

    columns_to_exclude = ['input_ids', 'token_type_ids', 'attention_mask', 'regression_prompt', 'labels']
    sheets = [
        ("Train Data", train_df),
        ("Eval Data", eval_full_df),
        ("Test Data Predicted 1s", test_df),
        ("Test Data True Positives", test_real1s_df),
    ]

    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        metrics_df.to_excel(writer, sheet_name="Regression Metrics", index=False)
        for sheet_name, frame in sheets:
            trimmed = frame.drop(columns=[col for col in columns_to_exclude if col in frame.columns])
            trimmed.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Regression metrics and data have been saved to '{filename}'.")

In [None]:
# Export the regression metrics, the train/eval data and both test sets with
# their predictions to the default workbook (regression_results.xlsx).
save_regression_metrics_to_excel(
    train_dataset=split_train_data_regression,
    eval_dataset=eval_data,
    test_dataset=test_data_regression,
    test_dataset_real1s=test_data_regression_real1s,
    predicted_vit_stat_int_test=predicted_vit_stat_int_test,
    predicted_vit_stat_int_real1s=predicted_vit_stat_int_real1s
)