---------------
# Seaching for all folders:

# Ensemble approach


In [1]:
import os
import re
import pandas as pd
from assess import extract_losses, extract_tr_value, format_losses
# Define the main root directory
main_root = '/data/sama/Deep_Learning_Pipeline_Test/experiments_dose/test'
# main_root = '/data/bahrdoh/Deep_Learning_Pipeline_Test/experiments_dose/test/'


# List all directories in the main root
folders = [f for f in os.listdir(main_root) if os.path.isdir(os.path.join(main_root, f))]

# Initialize lists for storing all results
all_results = []

# Iterate over the folders and extract losses
for folder in folders:
    # Split the folder name by underscore and get the 3rd element
    folder_parts = folder.split('_')
    
    # Ensure the folder has at least 3 parts for ensemble_id
    if len(folder_parts) < 3:
        ensemble_id = None  # Assign None if we can't find the ensemble_id
    else:
        ensemble_id = folder_parts[2]
    
    # Extract losses from folder name
    train_loss, val_loss, test_loss, avg_train_loss, avg_val_loss, avg_test_loss = extract_losses(folder)
    
    # Extract 'tr' value from folder name
    tr_value = extract_tr_value(folder)

    # Append data to the all_results list
    all_results.append({
        'esm': ensemble_id,
        'tr': tr_value,
        'Folder': folder,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'test_loss': test_loss,
        'avg_train_loss': avg_train_loss,
        'avg_val_loss': avg_val_loss,
        'avg_test_loss': avg_test_loss
    })

# Create DataFrames from the results
df_all_models = pd.DataFrame(all_results)

# Drop rows where the ensemble_id is missing (optional)
df_cleaned = df_all_models.dropna(subset=['esm'], how='all')

# Display the cleaned DataFrame with all folders, grouped by ensemble_id
df_cleaned


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,,,
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,,,
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,,,
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,,,
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,,,
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,,,
6,2,0.0,20240822_114104_2_341_Dual_DCNN_LReLu_0_76_tr_...,1.409,3.842,3.208,,,
9,5,3.0,20240827_234123_5_838_Dual_DCNN_LReLu_3_109_tr...,1.074,1.496,3.114,,,
10,5,1.0,20240827_084503_5_410_Dual_DCNN_LReLu_1_253_tr...,0.973,2.22,2.827,,,
12,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,,,


In [2]:
import os
import json
import pandas as pd
from assess import extract_loss_function_from_folder




# Use .loc to avoid the SettingWithCopyWarning
df_cleaned.loc[:, 'loss_function_name'] = df_cleaned['Folder'].apply(
    lambda folder: extract_loss_function_from_folder(os.path.join(main_root, folder))
)
# Display the updated DataFrame
df_cleaned


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.loc[:, 'loss_function_name'] = df_cleaned['Folder'].apply(


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss,loss_function_name
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,,,,l1_loss
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,,,,l1_loss
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,,,,l1_loss
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,,,,l1_loss
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,,,,l1_loss
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,,,,l1_loss
6,2,0.0,20240822_114104_2_341_Dual_DCNN_LReLu_0_76_tr_...,1.409,3.842,3.208,,,,l1_loss
9,5,3.0,20240827_234123_5_838_Dual_DCNN_LReLu_3_109_tr...,1.074,1.496,3.114,,,,l1_loss
10,5,1.0,20240827_084503_5_410_Dual_DCNN_LReLu_1_253_tr...,0.973,2.22,2.827,,,,l1_loss
12,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,,,,l1_loss


In [3]:
import pandas as pd
from assess import keep_lowest_ensemble_losses
# Function to find and keep only the rows with the lowest losses for train, val, and test



df_ensemble = keep_lowest_ensemble_losses(df_cleaned)

# Display the DataFrame with only the lowest train, val, and test losses
df_ensemble


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_losses['tag'] = 'Other'


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss,loss_function_name,tag
17,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,1.067,1.729,2.864,l1_loss,"Esb_Test: [1.067, 1.729, 2.864]"
19,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,,,,l1_loss,Other


In [4]:
import pandas as pd

# Step 1: Identify rows with 'Best_Esmb_X' in the tag
best_esmb_rows = df_ensemble[df_ensemble['tag'].str.startswith('Esb_')]

# Step 2: Get the unique 'esm' values from the 'esm' column of these rows
best_esm_values = best_esmb_rows['esm'].unique()
print(best_esm_values)
# Step 3: Filter the DataFrame to keep all rows that share the same 'esm' as the best rows
df_best_esms = df_cleaned[df_cleaned['esm'].isin(best_esm_values)]


# Display the DataFrame for esm=5
df_best_esms


['7']


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss,loss_function_name,tag
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,,,,l1_loss,Other
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,,,,l1_loss,Other
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,,,,l1_loss,Other
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,,,,l1_loss,Other
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,,,,l1_loss,Other
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,,,,l1_loss,Other
12,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,,,,l1_loss,Other
17,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,1.067,1.729,2.864,l1_loss,"Esb_Test: [1.067, 1.729, 2.864]"
19,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,,,,l1_loss,Other
20,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,,,,l1_loss,Other


In [5]:
import os
import pandas as pd
from assess import attach_csv_outputs


df_best_esms = df_best_esms.apply(lambda row: attach_csv_outputs(main_root, row), axis=1)

# Display the updated DataFrame with the attached CSV data

df_best_esms = df_best_esms.reset_index(drop=True)
df_best_esms

Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss,loss_function_name,tag,test_output,val_output,train_output
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
6,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
7,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,1.067,1.729,2.864,l1_loss,"Esb_Test: [1.067, 1.729, 2.864]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
8,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
9,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...


In [6]:
import os
import json
import pandas as pd

# Function to read the JSON file and extract paths


# Apply the function to each row of df_best_esms
df_best_esms = df_best_esms.apply(lambda row: attach_data_dict_paths(row, main_root), axis=1)

# Display the updated DataFrame with the attached paths from the JSON files
df_best_esms


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,avg_train_loss,avg_val_loss,avg_test_loss,loss_function_name,tag,test_output,val_output,train_output,train_dict,val_dict,test_dict
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
6,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
7,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,1.067,1.729,2.864,l1_loss,"Esb_Test: [1.067, 1.729, 2.864]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
8,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
9,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,,,,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...


### Ading AVERAGE MODEL

In [7]:
import pandas as pd

# Step 1: Calculate the average of numerical columns (e.g., train_loss, val_loss, test_loss)
average_row = {
    'esm': df_best_esms['esm'][0],  # Placeholder for esm, indicating this is the average row
    'Folder': None,  # No folder for the average
    'train_loss': df_best_esms['train_loss'].mean(),
    'val_loss': df_best_esms['val_loss'].mean(),
    'test_loss': df_best_esms['test_loss'].mean(),
    'avg_train_loss': df_best_esms['avg_train_loss'].mean(),
    'avg_val_loss': df_best_esms['avg_val_loss'].mean(),
    'avg_test_loss': df_best_esms['avg_test_loss'].mean(),
    'loss_function_name': None,  # No loss function for the average row
    'train_dict': None,  # No specific dictionary for the average
    'val_dict': None,    # No specific dictionary for the average
    'test_dict': None    # No specific dictionary for the average
}

# Convert the dictionary to a DataFrame
average_row_df = pd.DataFrame([average_row])

# Step 2: Concatenate the new row with the existing DataFrame
df_best_esms = pd.concat([df_best_esms, average_row_df], ignore_index=True)
df_best_esms = df_best_esms.drop(columns=['avg_train_loss', 'avg_val_loss', 'avg_test_loss'])

# Step 3: Display the updated DataFrame with the new average row
df_best_esms


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag,test_output,val_output,train_output,train_dict,val_dict,test_dict
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
6,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
7,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Esb_Test: [1.067, 1.729, 2.864]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
8,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
9,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,l1_loss,Other,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...


In [8]:
import pandas as pd

# Function to format and round the values with spaces after commas

# Function to update or assign the 'tag' column for all rows
def update_tag(row):
    # Format the loss values using the format_losses function
    formatted_losses = format_losses(row['train_loss'], row['val_loss'], row['test_loss'])

    # If the 'tag' is NaN, assign the default tag with _AVG
    if pd.isna(row['tag']):
        return f"Esb_Test_AVG_[{formatted_losses}]"

    # If 'tr' exists, append it to the existing tag
    elif pd.notna(row['tr']):
        return f"Esb_Test_{int(row['tr'])}_[{formatted_losses}]"
    
    # Otherwise, return the existing tag
    return row['tag']

# Apply the function to update the 'tag' column for all rows
df_best_esms['tag'] = df_best_esms.apply(update_tag, axis=1)

df_best_esms = df_best_esms.reset_index(drop=True)

df_best_esms


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag,test_output,val_output,train_output,train_dict,val_dict,test_dict
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,l1_loss,"Esb_Test_6_[0.99, 1.27, 2.94]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,l1_loss,"Esb_Test_4_[1.02, 1.35, 2.97]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,l1_loss,"Esb_Test_1_[1.27, 3.08, 2.93]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,l1_loss,"Esb_Test_2_[0.97, 1.4, 2.82]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,l1_loss,"Esb_Test_5_[1.02, 1.39, 2.85]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,l1_loss,"Esb_Test_8_[1.09, 1.4, 2.88]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
6,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,l1_loss,"Esb_Test_0_[1.44, 3.62, 2.74]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
7,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Esb_Test_9_[0.91, 1.18, 3.21]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
8,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,"Esb_Test_3_[0.96, 1.38, 2.52]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
9,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,l1_loss,"Esb_Test_7_[0.99, 1.23, 2.78]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...


In [None]:
# import pandas as pd

# # Assuming df_best_esms is your DataFrame and 'test_output' contains the relevant data
# data1 = df_best_esms.iloc[0]['test_output']
# df_split1 = data1['PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode'].str.split(';', expand=True)
# df_split1.columns = ['PatientID', 'pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2', 'Mode']
# df_split1
# data2 = df_best_esms.iloc[1]['test_output']
# df_split2 = data2['PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode'].str.split(';', expand=True)
# df_split2.columns = ['PatientID', 'pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2', 'Mode']
# df_split2
# # Calculate the average for corresponding columns for numeric values, keeping non-numeric as they are
# df_avg = pd.DataFrame()

# # Averaging numeric columns
# numeric_cols = df_split1.select_dtypes(include='number').columns
# df_avg[numeric_cols] = (df_split1[numeric_cols] + df_split12[numeric_cols]) / 2

# # Retain non-numeric columns
# df_avg['PatientID'] = df_split1['PatientID']
# df_avg['Mode'] = df_split1['Mode']


# df_avg

In [None]:
# # Assuming df_best_esms is a DataFrame containing multiple 'test_output' fields
# # We will iterate over all the rows except the last one, and compute the averages as requested

# # List to hold all split DataFrames
# split_dfs = []

# # Loop over all the rows except the last one in df_best_esms
# for i in range(len(df_best_esms) - 1):
#     data = df_best_esms.iloc[i]['test_output']
#     df_split = data['PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode'].str.split(';', expand=True)
#     df_split.columns = ['PatientID', 'pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2', 'Mode']
#     split_dfs.append(df_split)

# # Now we calculate the average of all these DataFrames
# # First, convert all numeric columns to numeric types, ignoring non-numeric columns
# for df in split_dfs:
#     df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] = df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']].apply(pd.to_numeric, errors='coerce')

# # Initialize the df_avg with the first DataFrame in the list
# df_avg = split_dfs[0].copy()

# # Iteratively calculate the cumulative sum of all numeric columns
# for df in split_dfs[1:]:
#     df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] += df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']]

# # Calculate the average by dividing the cumulative sum by the number of DataFrames
# df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] /= len(split_dfs)

# # Retain the non-numeric columns from the first DataFrame
# df_avg['PatientID'] = split_dfs[0]['PatientID']
# df_avg['Mode'] = split_dfs[0]['Mode']
# df_avg



In [None]:
# # Convert df_avg to a CSV-like format with semicolons as the delimiter, without the header or index
# df_avg_str = df_avg.to_csv(index=False, header=True, sep=';')

# # Strip any trailing newlines from the string
# df_avg_str = df_avg_str.strip()

# # Replace the 'test_output' in the last row with the CSV-like string
# df_best_esms.at[df_best_esms.index[-1], 'test_output'] = df_avg_str

# # Check the updated df_best_esms to ensure the format is correct
# df_best_esms


In [9]:
# Defining an enhanced function to handle multiple columns like 'test_output', 'train_output', and 'val_output'

def average_outputs(df_best_esms, columns):
    """
    Averages the specified output columns ('test_output', 'train_output', 'val_output') 
    over all rows except the last one, and updates the last row with the averaged data.
    
    Parameters:
    df_best_esms (DataFrame): The DataFrame containing the output columns.
    columns (list of str): List of column names to process (e.g., ['test_output', 'train_output', 'val_output']).
    
    Returns:
    DataFrame: The updated DataFrame with averaged values in the last row for the specified columns.
    """
    for col in columns:
        # List to hold all split DataFrames for the current column
        split_dfs = []

        # Loop over all the rows except the last one in df_best_esms
        for i in range(len(df_best_esms) - 1):
            data = df_best_esms.iloc[i][col]
            df_split = data['PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode'].str.split(';', expand=True)
            df_split.columns = ['PatientID', 'pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2', 'Mode']
            split_dfs.append(df_split)

        # Now we calculate the average of all these DataFrames
        # First, convert all numeric columns to numeric types, ignoring non-numeric columns
        for df in split_dfs:
            df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] = df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']].apply(pd.to_numeric, errors='coerce')

        # Initialize the df_avg with the first DataFrame in the list
        df_avg = split_dfs[0].copy()

        # Iteratively calculate the cumulative sum of all numeric columns
        for df in split_dfs[1:]:
            df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] += df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']]

        # Calculate the average by dividing the cumulative sum by the number of DataFrames
        df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] /= len(split_dfs)

        # Retain the non-numeric columns from the first DataFrame
        df_avg['PatientID'] = split_dfs[0]['PatientID']
        df_avg['Mode'] = split_dfs[0]['Mode']

        # Convert df_avg to a CSV-like format with semicolons as the delimiter, without the header or index
        df_avg_str = df_avg.to_csv(index=False, header=True, sep=';')

        # Strip any trailing newlines from the string
        df_avg_str = df_avg_str.strip()

        # Replace the column in the last row with the CSV-like string
        df_best_esms.at[df_best_esms.index[-1], col] = df_avg_str

    return df_best_esms

# Now this function can be used to process 'test_output', 'train_output', and 'val_output' in df_best_esms.
columns_to_process = ['test_output', 'train_output', 'val_output']
df_best_esms = average_outputs(df_best_esms, columns_to_process)
df_best_esms


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag,test_output,val_output,train_output,train_dict,val_dict,test_dict
0,7,6.0,20240901_075039_7_258_Dual_DCNN_LReLu_6_197_tr...,0.989,1.271,2.935,l1_loss,"Esb_Test_6_[0.99, 1.27, 2.94]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
1,7,4.0,20240831_111643_7_505_Dual_DCNN_LReLu_4_115_tr...,1.016,1.348,2.97,l1_loss,"Esb_Test_4_[1.02, 1.35, 2.97]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
2,7,1.0,20240830_084408_7_410_Dual_DCNN_LReLu_1_238_tr...,1.274,3.08,2.933,l1_loss,"Esb_Test_1_[1.27, 3.08, 2.93]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
3,7,2.0,20240830_190612_7_205_Dual_DCNN_LReLu_2_258_tr...,0.975,1.397,2.822,l1_loss,"Esb_Test_2_[0.97, 1.4, 2.82]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
4,7,5.0,20240831_215951_7_590_Dual_DCNN_LReLu_5_270_tr...,1.021,1.388,2.852,l1_loss,"Esb_Test_5_[1.02, 1.39, 2.85]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
5,7,8.0,20240902_005735_7_168_Dual_DCNN_LReLu_8_105_tr...,1.087,1.395,2.879,l1_loss,"Esb_Test_8_[1.09, 1.4, 2.88]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
6,7,0.0,20240829_225243_7_341_Dual_DCNN_LReLu_0_78_tr_...,1.444,3.625,2.742,l1_loss,"Esb_Test_0_[1.44, 3.62, 2.74]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
7,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Esb_Test_9_[0.91, 1.18, 3.21]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
8,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,"Esb_Test_3_[0.96, 1.38, 2.52]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
9,7,7.0,20240901_182503_7_192_Dual_DCNN_LReLu_7_232_tr...,0.991,1.228,2.777,l1_loss,"Esb_Test_7_[0.99, 1.23, 2.78]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...


In [28]:
import os

def average_outputs(df_best_esms, columns):
    """
    Averages the specified output columns ('test_output', 'train_output', 'val_output') 
    over all rows except the last one, and updates the last row with the averaged data.
    Also saves the averaged DataFrames as CSV files in the folder with the highest 'tr' for each 'esm'.
    
    Parameters:
    df_best_esms (DataFrame): The DataFrame containing the output columns.
    columns (list of str): List of column names to process (e.g., ['test_output', 'train_output', 'val_output']).
    
    Returns:
    DataFrame: The updated DataFrame with averaged values in the last row for the specified columns.
    """
    
    # Iterate over unique 'esm' values
    for esm in df_best_esms['esm'].unique():
        # Get the rows corresponding to the current 'esm'
        esm_group = df_best_esms[df_best_esms['esm'] == esm]

        # Find the row with the highest 'tr' value for the current 'esm'
        highest_tr_row = esm_group.loc[esm_group['tr'].idxmax()]

        # Get the folder path for the row with the highest 'tr'
        output_dir = main_root + "/" +highest_tr_row['Folder']

        for col in columns:
            # List to hold all split DataFrames for the current column
            split_dfs = []

            # Loop over all the rows except the last one in df_best_esms
            for i in range(len(df_best_esms) - 1):
                data = df_best_esms.iloc[i][col]
                df_split = data['PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode'].str.split(';', expand=True)
                df_split.columns = ['PatientID', 'pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2', 'Mode']
                split_dfs.append(df_split)

            # Now we calculate the average of all these DataFrames
            # First, convert all numeric columns to numeric types, ignoring non-numeric columns
            for df in split_dfs:
                df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] = df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']].apply(pd.to_numeric, errors='coerce')

            # Initialize the df_avg with the first DataFrame in the list
            df_avg = split_dfs[0].copy()

            # Iteratively calculate the cumulative sum of all numeric columns
            for df in split_dfs[1:]:
                df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] += df[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']]

            # Calculate the average by dividing the cumulative sum by the number of DataFrames
            df_avg[['pred_0', 'pred_1', 'pred_2', 'true_0', 'true_1', 'true_2']] /= len(split_dfs)

            # Retain the non-numeric columns from the first DataFrame
            df_avg['PatientID'] = split_dfs[0]['PatientID']
            df_avg['Mode'] = split_dfs[0]['Mode']

            # Convert df_avg to a CSV-like format with semicolons as the delimiter, without the header or index
            df_avg_str = df_avg.to_csv(index=False, header=True, sep=';')

            # Strip any trailing newlines from the string
            df_avg_str = df_avg_str.strip()

            # Replace the column in the last row with the CSV-like string
            df_best_esms.at[df_best_esms.index[-1], col] = df_avg_str

            # Save the averaged DataFrame to a CSV file in the output directory
            csv_file_path = os.path.join(output_dir, f'AVG_Dual_DCNN_LReLu_{col}.csv')
            df_avg.to_csv(csv_file_path, index=False)
            print(f"Saved {col} averages to {csv_file_path}")

    return df_best_esms

# Now this function can be used to process 'test_output', 'train_output', and 'val_output' in df_best_esms.
columns_to_process = ['test_output', 'train_output', 'val_output']
df_best_esms = average_outputs(df_best_esms, columns_to_process)


Saved test_output averages to /data/sama/Deep_Learning_Pipeline_Test/experiments_dose/test/20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr_0.912_val_1.176_test_3.21_avg_tr_1.067_val_1.729_test_2.864/AVG_Dual_DCNN_LReLu_test_output.csv
Saved train_output averages to /data/sama/Deep_Learning_Pipeline_Test/experiments_dose/test/20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr_0.912_val_1.176_test_3.21_avg_tr_1.067_val_1.729_test_2.864/AVG_Dual_DCNN_LReLu_train_output.csv
Saved val_output averages to /data/sama/Deep_Learning_Pipeline_Test/experiments_dose/test/20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr_0.912_val_1.176_test_3.21_avg_tr_1.067_val_1.729_test_2.864/AVG_Dual_DCNN_LReLu_val_output.csv


In [10]:
# Define the path where you want to save the pickle file
pickle_file_path = 'df_best_esms.pkl'  # Replace with your desired path

# Save the DataFrame as a pickle file
df_best_esms.to_pickle(pickle_file_path)

print(f"DataFrame saved as pickle at: {pickle_file_path}")


DataFrame saved as pickle at: df_best_esms.pkl


------------

# Single Model

In [19]:
import pandas as pd
from assess import keep_lowest_single_losses


# Apply the function to the df_single_models DataFrame
df_single = keep_lowest_single_losses(df_cleaned)


# Display the DataFrame with only the lowest train, val, and test losses
df_single


Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag
17,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Best_Val_single: [0.91, 1.18, 3.21]"
19,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,"Best_Test_single: [0.96, 1.38, 2.52]"


In [20]:
null_values = df_single.isnull()
df_single.isnull().sum()


esm                   0
tr                    0
Folder                0
train_loss            0
val_loss              0
test_loss             0
loss_function_name    0
tag                   0
dtype: int64

In [21]:

df_single = df_single.apply(lambda row: attach_csv_outputs(row, main_root), axis=1)

# Display the updated DataFrame with the attached CSV data

df_single = df_single.reset_index(drop=True)
df_single

Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag,test_output,val_output,train_output
0,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Best_Val_single: [0.91, 1.18, 3.21]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...
1,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,"Best_Test_single: [0.96, 1.38, 2.52]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...


In [22]:
# Access the first index in the DataFrame
first_index = df_single.index[0]

# Now access the 'test_output' for the first row based on the actual index
test_output_first_row = df_single.loc[first_index, 'test_output']
print(test_output_first_row)

    PatientID;pred_0;pred_1;pred_2;true_0;true_1;true_2;Mode
0    DBP_OP022;1.4436655;-14.506543;-9.597106;1.619...      
1    DBP_OP022;16.59303;6.260155;0.8427954;11.46955...      
2    DBP_OP022;15.235685;7.826196;-2.8190432;11.615...      
3    DBP_OP022;-6.9917336;-13.337357;13.719217;-10....      
4    DBP_OP022;6.6398683;-2.7785864;6.6773725;3.362...      
..                                                 ...      
315  DBP_OP038;-1.9790478;13.661573;12.266103;-11.1...      
316  DBP_OP038;6.3643713;13.147768;8.878446;1.48233...      
317  DBP_OP038;6.26395;10.50584;11.6188345;1.603638...      
318  DBP_OP038;7.3276763;16.172184;4.9008555;5.9930...      
319  DBP_OP038;6.821365;-1.019985;-4.054994;0.48144...      

[320 rows x 1 columns]


In [23]:
# Apply the function to each row of df_best_esms
df_single = df_single.apply(lambda row: attach_data_dict_paths(main_root, row), axis=1)


# Display the updated DataFrame with the attached paths from the JSON files
df_single

Unnamed: 0,esm,tr,Folder,train_loss,val_loss,test_loss,loss_function_name,tag,test_output,val_output,train_output,train_dict,val_dict,test_dict
0,7,9.0,20240902_115416_7_120_Dual_DCNN_LReLu_9_296_tr...,0.912,1.176,3.21,l1_loss,"Best_Val_single: [0.91, 1.18, 3.21]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...
1,7,3.0,20240831_040942_7_838_Dual_DCNN_LReLu_3_185_tr...,0.962,1.384,2.519,l1_loss,"Best_Test_single: [0.96, 1.38, 2.52]",PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;true...,PatientID;pred_0;pred_1;pred_2;true_0;tru...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...,[{'fixed': '/data/bahrdoh/Datasets/Second_ds/n...


In [24]:
# Saving the DataFrame as a pickle file
pickle_file_path = "df_single.pkl"
df_single.to_pickle(pickle_file_path)

# Loading the DataFrame from the pickle file
df_single = pd.read_pickle(pickle_file_path)

print(df_single['tag'].tolist())

['Best_Val_single: [0.91, 1.18, 3.21]', 'Best_Test_single: [0.96, 1.38, 2.52]']


--------------------


In [None]:
# import pandas as pd

# # Assuming df_filtered_tagged_losses is your DataFrame
# # Replace the following line with the actual loading of your DataFrame if necessary
# # df_filtered_tagged_losses = pd.read_pickle('selected_folders_2.pkl')

# # Folder name to search for
# folder_name_to_search = '20240722_191053_410_Dual_DCNN_LReLu_1_149_tr_0.988_val_3.565_test_3.683_avg_tr_1.018_val_4.223_test_4.064'

# # Check if the folder name exists in the Folder column
# folder_exists = folder_name_to_search in df_loaded['Folder'].values

# if folder_exists:
#     print(f"The folder '{folder_name_to_search}' exists in the DataFrame.")
# else:
#     print(f"The folder '{folder_name_to_search}' does not exist in the DataFrame.")