In [12]:
import os
import numpy as np
import pandas as pd
import re

# Directory whose sub-folders are scanned for per-run score files.
root_path = "../Run_logs/logs/"
# os.listdir() returns entries in arbitrary order; sort them so every
# re-run of the notebook walks the run folders in the same order.
path_list = sorted(os.listdir(root_path))

def parse_all_info(path_name):
    """Parse the run configuration encoded in a log-folder name.

    The name is searched for the tags ``model_``, ``data_``, ``seed_``,
    ``miss_rate_``, ``ablate_shared_encoder_`` and
    ``ablate_missing_modality_features_``. The model and dataset names are
    taken as the text between their tag and the next tag (minus one separator
    character), so ``model_`` must precede ``data_`` which must precede
    ``seed_``.

    Args:
        path_name: Folder name of a single run, for example
            ``model_foo_data_dsads_seed_42_miss_rate_0.2_...``
            (layout inferred from the slicing logic; confirm against the
            script that creates the folders).

    Returns:
        Tuple ``(model_name, data_name, seed, miss_rate,
        ablate_shared_encoder, ablate_missing_modality_features)`` with types
        ``(str, str, int, float, bool, bool)``. An ablation flag is ``True``
        only when its tag is present and immediately followed by ``True``.

    Raises:
        ValueError: If the ``seed_`` or ``miss_rate_`` tag is missing or is
            not followed by a number.
    """
    model_index = path_name.find("model_")
    data_index = path_name.find("data_")
    seed_index = path_name.find("seed_")
    miss_rate_index = path_name.find("miss_rate_")

    # Each name ends one character (the separator) before the next tag.
    model_name = path_name[model_index + 6 : data_index - 1]
    data_name = path_name[data_index + 5 : seed_index - 1]

    def _number_after(tag, tag_index, pattern):
        # Read the whole number that follows the tag instead of a fixed
        # number of characters, so "0.25" is not truncated to "0.2".
        match = None
        if tag_index >= 0:
            match = re.compile(pattern).match(path_name, tag_index + len(tag))
        if match is None:
            raise ValueError(
                f"Could not parse a number after {tag!r} in folder name {path_name!r}"
            )
        return match.group(0)

    def _flag_is_true(tag):
        # A missing tag means "not ablated"; never slice relative to index -1.
        tag_index = path_name.find(tag)
        return tag_index >= 0 and path_name.startswith("True", tag_index + len(tag))

    seed = int(_number_after("seed_", seed_index, r"[+-]?\d+"))
    miss_rate = float(
        _number_after("miss_rate_", miss_rate_index, r"\d+(?:\.\d+)?|\.\d+")
    )
    ablate_shared_encoder = _flag_is_true("ablate_shared_encoder_")
    ablate_missing_modality_features = _flag_is_true(
        "ablate_missing_modality_features_"
    )

    return model_name, data_name, seed, miss_rate, ablate_shared_encoder, ablate_missing_modality_features


# Dataset names and seeds, presumably those covered by the logged runs
# (not referenced in the visible cells - confirm before relying on them).
data_list, seed_list = ["dsads", "realdisp"], [1, 42]

def extract_final_scores(path):
    """Read the summary metrics from a score file.

    Only lines beginning with ``MEAN`` or ``STD`` are inspected. Every
    ``<metric>: <number>`` pair found on such a line is stored under the
    metric name with any ``"Test "`` prefix removed.

    Args:
        path: Location of the score file to read.

    Returns:
        Tuple ``(mean_scores, std_scores)`` of dicts mapping metric name to
        float, filled from the ``MEAN`` and ``STD`` lines respectively.
    """
    # Matches either "<word> <word> F1" or "Test <word>", followed by ": <number>".
    metric_re = re.compile(r"(\w+ \w+ F1|Test \w+): ([0-9.]+)")

    mean_scores, std_scores = {}, {}

    with open(path) as handle:
        for row in handle:
            # Choose the destination dict once per line; skip everything else.
            if row.startswith("MEAN"):
                target = mean_scores
            elif row.startswith("STD"):
                target = std_scores
            else:
                continue

            for metric, number in metric_re.findall(row):
                target[metric.replace("Test ", "")] = float(number)

    return mean_scores, std_scores


In [13]:
# Collect one record per run folder that contains a score.txt file
data = []

for folder in path_list:
    score_file = os.path.join(root_path, folder, "score.txt")
    if not os.path.exists(score_file):
        continue  # skip folders without a score.txt

    (model_name, data_name, seed, miss_rate,
     ablate_shared_encoder, ablate_missing_modality_features) = parse_all_info(folder)
    mean_scores, std_scores = extract_final_scores(score_file)

    # Accuracy and weighted F1 (mean and std) are stored as 3-decimal strings
    record = {
        'Dataset': data_name,
        'Seed': seed,
        'Miss Rate': miss_rate,
        'Acc': format(mean_scores['Accuracy'], '.3f'),
        'Acc std': format(std_scores['Accuracy'], '.3f'),
        'F1': format(mean_scores['weighted F1'], '.3f'),
        'F1 std': format(std_scores['weighted F1'], '.3f'),
        'Ablate ShaEnc': ablate_shared_encoder,
        'Ablate MissMod': ablate_missing_modality_features,
    }
    data.append(record)

# One row per run, ordered by dataset, then miss rate, then seed
df = (
    pd.DataFrame(data)
    .sort_values(by=["Dataset", "Miss Rate", "Seed"])
    .reset_index(drop=True)
)
df

Unnamed: 0,Dataset,Seed,Miss Rate,Acc,Acc std,F1,F1 std,Ablate ShaEnc,Ablate MissMod
0,dsads,1,0.0,0.872,0.03,0.857,0.035,False,False
1,dsads,1,0.0,0.832,0.015,0.822,0.02,True,False
2,dsads,1,0.0,0.872,0.03,0.857,0.035,False,True
3,dsads,42,0.0,0.848,0.025,0.839,0.031,False,True
4,dsads,42,0.0,0.847,0.033,0.837,0.033,True,False
5,dsads,42,0.0,0.848,0.025,0.839,0.031,False,False
6,dsads,1,0.2,0.053,0.0,0.005,0.0,True,False
7,dsads,1,0.2,0.053,0.0,0.005,0.0,False,True
8,dsads,1,0.2,0.878,0.026,0.876,0.024,False,False
9,dsads,42,0.2,0.053,0.0,0.005,0.0,False,True


In [14]:
# Restrict the table to runs where neither ablation flag is set,
# then discard the two flag columns since they are constant afterwards.
ablation_cols = ['Ablate ShaEnc', 'Ablate MissMod']
not_ablated = df['Ablate ShaEnc'].eq(False) & df['Ablate MissMod'].eq(False)
df = df.loc[not_ablated].drop(columns=ablation_cols)

df

Unnamed: 0,Dataset,Seed,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,1,0.0,0.872,0.03,0.857,0.035
5,dsads,42,0.0,0.848,0.025,0.839,0.031
8,dsads,1,0.2,0.878,0.026,0.876,0.024
10,dsads,42,0.2,0.855,0.037,0.847,0.038
13,dsads,1,0.4,0.844,0.05,0.835,0.057
16,dsads,42,0.4,0.86,0.04,0.85,0.046
20,dsads,1,0.6,0.794,0.046,0.785,0.048
22,dsads,42,0.6,0.844,0.044,0.834,0.047
25,realdisp,1,0.0,0.932,0.011,0.93,0.011
29,realdisp,42,0.0,0.936,0.011,0.933,0.014


In [15]:
# Remove the 'Seed' column and renumber the rows consecutively from 0
df = df.drop('Seed', axis=1).reset_index(drop=True)
df

Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.872,0.03,0.857,0.035
1,dsads,0.0,0.848,0.025,0.839,0.031
2,dsads,0.2,0.878,0.026,0.876,0.024
3,dsads,0.2,0.855,0.037,0.847,0.038
4,dsads,0.4,0.844,0.05,0.835,0.057
5,dsads,0.4,0.86,0.04,0.85,0.046
6,dsads,0.6,0.794,0.046,0.785,0.048
7,dsads,0.6,0.844,0.044,0.834,0.047
8,realdisp,0.0,0.932,0.011,0.93,0.011
9,realdisp,0.0,0.936,0.011,0.933,0.014


In [16]:
# Ensure numeric columns are of type float (the scores were stored as strings)
numeric_cols = ['Miss Rate', 'Acc', 'Acc std', 'F1', 'F1 std']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Average the runs that share a dataset and miss rate.
# Grouping on the real keys (rather than on every two consecutive rows) keeps
# the result correct when a configuration has fewer or more than two runs;
# positional pairing would silently mix different configurations in that case.
# sort=False keeps the groups in the order they first appear in df.
# Note: the std columns are combined with a plain arithmetic mean.
averaged_df = (
    df.groupby(['Dataset', 'Miss Rate'], as_index=False, sort=False)
    .agg({
        'Acc': 'mean',
        'Acc std': 'mean',
        'F1': 'mean',
        'F1 std': 'mean',
    })
)

# Display the resulting DataFrame
averaged_df

Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.86,0.0275,0.848,0.033
1,dsads,0.2,0.8665,0.0315,0.8615,0.031
2,dsads,0.4,0.852,0.045,0.8425,0.0515
3,dsads,0.6,0.819,0.045,0.8095,0.0475
4,realdisp,0.0,0.934,0.011,0.9315,0.0125
5,realdisp,0.1,0.9185,0.022,0.9165,0.025
6,realdisp,0.3,0.899,0.032,0.8955,0.034
7,realdisp,0.5,0.899,0.0265,0.896,0.026
8,realdisp,0.7,0.787,0.0365,0.782,0.0375
