##  0. Import modules

In [2]:
import os, sys, re, glob, gc
from datetime import datetime

import pandas as pd
import numpy as np

## 1. Environment setup

In [3]:
def setup_environment():
    """Setup paths and detect if running on Google Colab."""
    try:
        # Check if running on Google Colab
        from google.colab import drive

        drive.mount('/content/drive', force_remount=True)
        base_dir = '/content/TFM/SNN/CL'
        os.makedirs(f"{base_dir}CL", exist_ok=True)

        is_colab = True
    except ImportError:
        # Fallback for local environment
        base_dir = '/home/scosta/TFM/SNN/CL'
        is_colab = False

    # Change to the base directory
    os.chdir(base_dir)

    # Print the current working directory
    print(f"Current working directory: {os.getcwd()}")

    return is_colab, base_dir


# Detect environment and set up paths
is_colab, base_dir = setup_environment()

FileNotFoundError: [Errno 2] No such file or directory: '/home/scosta/TFM/CL'

In [4]:
os.chdir('/home/scosta/TFM/SNN/work/') # deleteme

##  2. Data loading

In [5]:
## Metrics logs

# Get the list of file paths
file_paths = glob.glob('./**/metrics.csv', recursive=True)

# Load files
data_frames = []
for file_path in file_paths:
    # Extract the full directory name
    full_dir = file_path.split('/')[1]
    
    # Extract the version number
    version = file_path.split('/')[2].split('_')[1]  # Assumes "version_X" structure in the path
    
    # Extract the task name
    task_name_match = re.search(r'SNN_CL_(.*?)_rate', full_dir)
    task_name = task_name_match.group(1) if task_name_match else None
    
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Add columns for the task name, directory, and version
    df['Task'] = task_name
    df['Directory'] = full_dir
    df['Version'] = version
    
    # Append the DataFrame to the list
    data_frames.append(df)
    
# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(data_frames, ignore_index=True)

# Reshape the DataFrame to proper structure
metrics_columns = ['co2_emissions', 'ewc_loss',
       'test_loss/dataloader_idx_0', 'test_loss/dataloader_idx_1',
       'train_loss', 'val_loss/dataloader_idx_0', 'val_loss/dataloader_idx_1', 'val_loss/dataloader_idx_2']  # Adjust these to match your actual column names
id_columns = [col for col in final_df.columns if col not in metrics_columns]  # Keep all other columns as identifiers

melted_df = pd.melt(
    final_df,
    id_vars=id_columns,  # Columns to keep as identifiers
    value_vars=metrics_columns,  # Columns to unpivot
    var_name='Metric',  # New column name for metrics
    value_name='Value'  # New column name for values
)

# Remove rows with NaN in the 'Value' column if any
melted_df = melted_df.dropna(subset=['Value'])

# Sort the DataFrame by 'Task', 'Metric', and 'Version'
melted_df = melted_df.sort_values(by=['Task', 'Metric', 'Version'], ascending=[True, True, True])

# Rename fields to match numbering
melted_df['Metric'] = melted_df['Metric'].str.replace('dataloader_idx_', 'task_')
melted_df[melted_df['Task'] == 'task2_cumulative'] = melted_df[melted_df['Task'] == 'task2_cumulative'].replace('val_loss/task_2', 'val_loss/task_1&2')
melted_df['Metric'] = melted_df['Metric'].replace('val_loss/task_1', 'val_loss/task_2')
melted_df['Metric'] = melted_df['Metric'].replace('val_loss/task_0', 'val_loss/task_1')
melted_df['Metric'] = melted_df['Metric'].replace('test_loss/task_1', 'test_loss/task_2')
melted_df['Metric'] = melted_df['Metric'].replace('test_loss/task_0', 'test_loss/task_1')


# Calculate cobined val_loss and test_loss for tracks 1 and 2 (in all but cumulative model)
# Exclude specific tasks from grouping
filtered_df = melted_df[(~melted_df['Task'].isin(['task2_cummulative'])) & (melted_df['Metric'].isin(['val_loss/task_1', 'val_loss/task_2']))]
filtered2_df = melted_df[melted_df['Metric'].isin(['test_loss/task_1', 'test_loss/task_2'])]

# Grouping by relevant columns and calculating mean value
grouped = (
    filtered_df.groupby(['epoch', 'step', 'Task', 'Directory', 'Version'], as_index=False)
    .agg({'Value': 'mean'})
)

grouped2 = (
    filtered2_df.groupby(['epoch', 'step', 'Task', 'Directory', 'Version'], as_index=False)
    .agg({'Value': 'mean'})
)

# Add a new column for the combined metric name
grouped['Metric'] = 'val_loss/task_1&2'
grouped2['Metric'] = 'test_loss/task_1&2'

# Combine the grouped rows back with the original DataFrame
result = pd.concat([melted_df, grouped, grouped2], ignore_index=True)

# Sort the final DataFrame for clarity
df = result.sort_values(by=['epoch', 'step', 'Task', 'Version']).reset_index(drop=True)

# Cleanup at the end
del data_frames, final_df, melted_df, filtered_df, filtered2_df, grouped, grouped2, result
gc.collect()

199

In [6]:
## Time and CO2 emission logs from CodeCarbon logs
# Get the list of file paths
file_paths = glob.glob('./**/emissions*.csv', recursive=True)

# Step 3: Process each file
data_frames = []
for file_path in file_paths:
    # Extract the full directory name
    full_dir = file_path.split('/')[1]
    
    # Extract the version number
    version = file_path.split('/')[2].split('_')[1]
    
    # Extract the task name
    task_name_match = re.search(r'SNN_CL_(.*?)_rate', full_dir)
    task_name = task_name_match.group(1) if task_name_match else None
    
    # Read the CSV file into a pandas DataFrame
    other_df = pd.read_csv(file_path)
    
    # Add columns for the task name, directory, and version
    other_df['Task'] = task_name
    other_df['Directory'] = full_dir
    other_df['Version'] = version
    
    # Append the DataFrame to the list
    data_frames.append(other_df)
    
# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(data_frames, ignore_index=True)

# Rename emissions column
final_df.rename(columns={'emissions': 'Value'}, inplace=True)

# Round CO2 emissions values
df.loc[df['Metric'] == 'co2_emissions', 'Value'] = df.loc[df['Metric'] == 'co2_emissions', 'Value'].round(6)
final_df['Value'] = final_df['Value'].round(6)

# Left join based on Task, Version and CO2_value
merged = pd.merge(
    df[df['Metric'] == 'co2_emissions'].sort_values(by=['Task','Version', 'step'], ascending=True),
    final_df.sort_values(by=['Task','Version','Value'], ascending=True)[['Task', 'Version','Value', 'timestamp', 'duration']], 
    on=['Task','Version','Value'], 
    how='left' 
)

# Unpivot duration
id_columns = ['epoch', 'step', 'Task', 'Directory', 'Version',
       'timestamp']
metrics_columns = ['duration']

melted_df = pd.melt(
    merged.drop('Value', axis=1),
    id_vars=id_columns, 
    value_vars=metrics_columns,  
    var_name='Metric',  
    value_name='Value' 
)

# Remove rows with NaN in the 'Value' column
melted_df = melted_df.dropna(subset=['Value'])

# Concatenate all DataFrames into a single DataFrame
df = pd.concat([df, melted_df], ignore_index=True)

# Cleanup at the end
del data_frames, final_df, melted_df
gc.collect()

0

## 3. Filter data by epoch of  best performance on task 2 (best models)

In [15]:
# Filter for 'val_loss' metric on task 2
val_loss_data = df[df['Metric'] == 'val_loss/task_2']

# Find the row with the minimum val_loss for each Task and Version
min_val_loss = val_loss_data.loc[val_loss_data.groupby(['Task', 'Version'])['Value'].idxmin()]

# Select rows from df1 where columns match df2
best_models_df= df.merge(min_val_loss, on=['epoch', 'Task', 'Version'], suffixes=('','_y'))
best_models_df.drop(columns=['step_y', 'Directory_y', 'Metric_y',
       'Value_y', 'timestamp_y'], inplace=True)

best_models_df

Unnamed: 0,epoch,step,Task,Directory,Version,Metric,Value,timestamp
0,13,1028,task1,SNN_CL_task1_rate_numsteps_25_gain_0.5,2,ewc_loss,0.000000,
1,13,1028,task1,SNN_CL_task1_rate_numsteps_25_gain_0.5,2,train_loss,0.035925,
2,13,1077,task1,SNN_CL_task1_rate_numsteps_25_gain_0.5,2,co2_emissions,0.005240,
3,13,1077,task1,SNN_CL_task1_rate_numsteps_25_gain_0.5,2,ewc_loss,0.000000,
4,13,1077,task1,SNN_CL_task1_rate_numsteps_25_gain_0.5,2,train_loss,0.030908,
...,...,...,...,...,...,...,...,...
685,58,4542,task2_naive,SNN_CL_task2_naive_rate_numsteps_25_gain_0.5,0,duration,3378.969228,2024-12-19T02:53:16
686,55,4311,task2_naive,SNN_CL_task2_naive_rate_numsteps_25_gain_0.5,1,duration,3075.423814,2024-12-19T03:46:42
687,59,4619,task2_naive,SNN_CL_task2_naive_rate_numsteps_25_gain_0.5,2,duration,3522.931513,2024-12-19T04:52:54
688,57,4465,task2_naive,SNN_CL_task2_naive_rate_numsteps_25_gain_0.5,3,duration,3324.366687,2024-12-19T05:48:44


## 4. Summary results statistics

In [16]:
# Remove ewc_loss if not needed
best_models_df = best_models_df[best_models_df['Metric'] != 'ewc_loss']
# Group by Task and Metric and calculate the mean and standard deviation of the 'Value' column
grouped_stats = best_models_df.groupby(['Task', 'Metric'])['Value'].agg(['mean', 'std']).reset_index()

# Set 'Metric' as the main index and 'mean' and 'std' as subindexes
grouped_stats.set_index(['Task', 'Metric'], inplace=True)

# Display the result
grouped_stats = grouped_stats.stack().unstack('Task').transpose().sort_values(by=[('val_loss/task_1&2', 'mean')], ascending=True)
grouped_stats

Metric,co2_emissions,co2_emissions,duration,duration,ewc_loss,ewc_loss,train_loss,train_loss,val_loss/task_1,val_loss/task_1,val_loss/task_1&2,val_loss/task_1&2,val_loss/task_2,val_loss/task_2
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Task,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
task2_cumulative,0.022668,0.002544,6310.994483,713.694311,0.0,0.0,0.026283,0.005607,0.025976,0.000461,0.039356,0.000351,0.052744,0.0007
task2_Rep0.15,0.013789,0.001278,3838.647218,355.410141,0.0,0.0,0.035539,0.006639,0.031115,0.001906,0.04119,0.001289,0.051265,0.001924
task2_Rep0.25,0.015855,0.000868,4424.266393,235.18749,0.0,0.0,0.024294,0.003289,0.031271,0.001388,0.041576,0.000983,0.051881,0.001284
task2_Rep0.2,0.014919,0.000888,4157.36806,234.869927,0.0,0.0,0.02858,0.004143,0.03152,0.000698,0.041808,0.00119,0.052096,0.002306
task2_Rep0.1,0.01321,0.000607,3688.671941,164.085562,0.0,0.0,0.032131,0.006112,0.033536,0.001333,0.042525,0.00082,0.051515,0.00045
task2_Rep0.05,0.011762,0.000879,3251.545412,237.438751,0.0,0.0,0.029112,0.004563,0.036481,0.001685,0.043609,0.001134,0.050736,0.000964
task2_ewc_1.0e+12,0.013346,0.000449,3725.731311,115.919998,0.000257,5e-05,0.076071,0.012667,0.039217,0.005619,0.058804,0.002058,0.078392,0.004138
task2_ewc_1.0e+11,0.012821,0.000356,3569.433721,107.965652,0.000235,2.3e-05,0.063371,0.012759,0.054798,0.00545,0.060978,0.002314,0.067159,0.002202
task2_ewc_5.0e+10,0.012735,0.000414,3534.323601,114.675649,0.000226,1.5e-05,0.053552,0.007815,0.060501,0.006389,0.061845,0.002965,0.06319,0.000958
task2_ewc_1.0e+10,0.012286,0.000471,3395.317305,133.475173,0.000228,2.5e-05,0.047948,0.007012,0.066805,0.006766,0.062956,0.003489,0.059108,0.001363
