In [1]:
import os
import numpy as np
import pandas as pd
import re

# Directory whose entries are scanned for per-run "score.txt" files.
root_path = "../Run_logs/logs/"
# os.listdir() returns entries in arbitrary order; sort them so the iteration
# order (and therefore the row order before sorting) is reproducible.
path_list = sorted(os.listdir(root_path))

def parse_all_info(path_name):
    """Decode the run configuration embedded in a log-folder name.

    The name is expected to contain ``model_<name>_data_<name>_seed_<int>``
    in that order, a ``miss_rate_<number>`` token, and optionally
    ``ablate_shared_encoder_<True|False>`` and
    ``ablate_missing_modality_features_<True|False>`` tokens.

    Args:
        path_name: Folder name to decode.

    Returns:
        Tuple ``(model_name, data_name, seed, miss_rate,
        ablate_shared_encoder, ablate_missing_modality_features)`` with types
        ``(str, str, int, float, bool, bool)``. An ablation flag is ``True``
        only when its marker is present and followed by ``True``.

    Raises:
        ValueError: If the seed or miss-rate token is missing or malformed.

    Note:
        The ``model_`` and ``data_`` markers are located with ``str.find`` and
        are not validated; a name lacking them yields meaningless name strings.
    """
    model_index = path_name.find("model_")
    data_index = path_name.find("data_")
    seed_index = path_name.find("seed_")
    miss_rate_index = path_name.find("miss_rate_")

    # Each name ends one character (the "_" separator) before the next marker.
    model_name = path_name[model_index + 6 : data_index - 1]
    data_name = path_name[data_index + 5 : seed_index - 1]

    # Match the full integer after "seed_" instead of slicing up to the next
    # "_": when the seed was the last token, find() returned -1 and the slice
    # silently dropped the final character.
    seed_match = None
    if seed_index != -1:
        seed_match = re.match(r"-?\d+", path_name[seed_index + 5:])
    if seed_match is None:
        raise ValueError(f"no integer 'seed_<n>' token in {path_name!r}")
    seed = int(seed_match.group())

    # Match the whole number rather than a fixed 3-character window, which
    # truncated values such as "0.25" to 0.2.
    rate_match = None
    if miss_rate_index != -1:
        rate_match = re.match(r"\d+(?:\.\d+)?|\.\d+", path_name[miss_rate_index + 10:])
    if rate_match is None:
        raise ValueError(f"no numeric 'miss_rate_<x>' token in {path_name!r}")
    miss_rate = float(rate_match.group())

    def _flag(marker):
        # A missing marker means the ablation is off; previously the -1 index
        # from find() caused an unrelated slice of the name to be compared.
        position = path_name.find(marker)
        return position != -1 and path_name.startswith("True", position + len(marker))

    ablate_shared_encoder = _flag("ablate_shared_encoder_")
    ablate_missing_modality_features = _flag("ablate_missing_modality_features_")

    return model_name, data_name, seed, miss_rate, ablate_shared_encoder, ablate_missing_modality_features


# Dataset names and random seeds. Neither list is referenced by the other
# cells visible in this notebook; presumably they mirror the values encoded in
# the run-folder names (TODO confirm, or remove if unused).
data_list = ["dsads", "realdisp"]
seed_list = [1, 42]

def extract_final_scores(path):
    """Collect the MEAN and STD metric values reported in a score file.

    Only lines beginning with ``MEAN`` or ``STD`` are inspected. Every
    ``<metric>: <number>`` pair found on such a line is stored under the
    metric name with any ``"Test "`` text removed.

    Args:
        path: Location of the score file to read.

    Returns:
        Tuple ``(mean_scores, std_scores)``: two dicts mapping metric name to
        float, filled from the ``MEAN`` and ``STD`` lines respectively.
    """
    metric_re = re.compile(r"(\w+ \w+ F1|Test \w+): ([0-9.]+)")
    mean_scores, std_scores = {}, {}

    with open(path) as handle:
        for row in handle:
            # Choose the destination dict from the line prefix; skip other lines.
            if row.startswith("MEAN"):
                bucket = mean_scores
            elif row.startswith("STD"):
                bucket = std_scores
            else:
                continue

            for label, number in metric_re.findall(row):
                bucket[label.replace("Test ", "")] = float(number)

    return mean_scores, std_scores

def average_every_two_rows(df):
    """Collapse consecutive row pairs (rows 0-1, 2-3, ...) into one row each.

    Rows are paired purely by position, so the caller must order the frame so
    that each consecutive pair belongs together. With an odd number of rows
    the last row forms a group on its own.

    NOTE(review): a later cell of this notebook defines a function with the
    same name that additionally rounds to 3 decimals; on a top-to-bottom run
    that later definition replaces this one.

    Args:
        df: Frame with columns 'Dataset', 'Miss Rate', 'Acc', 'Acc std',
            'F1' and 'F1 std'. It is not modified.

    Returns:
        New DataFrame with a fresh 0..n-1 index holding, per pair, the first
        'Dataset' and 'Miss Rate' and the mean of the four score columns.
    """
    numeric_cols = ['Miss Rate', 'Acc', 'Acc std', 'F1', 'F1 std']

    # reset_index() returns a new frame, which (a) keeps the numeric coercion
    # below from mutating the caller's DataFrame and (b) guarantees a 0..n-1
    # index, so `index // 2` pairs rows by position even for filtered input.
    frame = df.reset_index(drop=True)
    frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric, errors='coerce')

    averaged_df = frame.groupby(frame.index // 2).agg({
        'Dataset': 'first',  # Keep the first dataset name
        'Miss Rate': 'first',  # Keep the first miss rate
        'Acc': 'mean',
        'Acc std': 'mean',
        'F1': 'mean',
        'F1 std': 'mean'
    })

    return averaged_df.reset_index(drop=True)


In [2]:
# One record per run folder that contains a score.txt file.
data = []

for folder in path_list:
    full_path = os.path.join(root_path, folder, "score.txt")
    if not os.path.exists(full_path):
        # Folder has no score file, so there is nothing to record for it.
        continue

    # The folder name carries the run configuration.
    (
        model_name,
        data_name,
        seed,
        miss_rate,
        ablate_shared_encoder,
        ablate_missing_modality_features,
    ) = parse_all_info(folder)
    mean_scores, std_scores = extract_final_scores(full_path)

    # Keep test accuracy and weighted F1 (mean and std) next to the run config.
    data.append({
        'Dataset': data_name,
        'Seed': seed,
        'Miss Rate': miss_rate,
        'Acc': mean_scores['Accuracy'],
        'Acc std': std_scores['Accuracy'],
        'F1': mean_scores['weighted F1'],
        'F1 std': std_scores['weighted F1'],
        'Ablate ShaEnc': ablate_shared_encoder,
        'Ablate MissModGen': ablate_missing_modality_features
    })

df_results = (
    pd.DataFrame(data)
    .sort_values(by=["Dataset", "Miss Rate", "Seed"])
    .reset_index(drop=True)
)
df_results

Unnamed: 0,Dataset,Seed,Miss Rate,Acc,Acc std,F1,F1 std,Ablate ShaEnc,Ablate MissModGen
0,dsads,1,0.0,0.878618,0.03613,0.877047,0.03121,False,False
1,dsads,1,0.0,0.876316,0.011931,0.871253,0.014885,True,False
2,dsads,1,0.0,0.878618,0.03613,0.877047,0.03121,False,True
3,dsads,42,0.0,0.889693,0.034667,0.88517,0.035249,True,False
4,dsads,42,0.0,0.846382,0.038628,0.836779,0.044555,False,True
5,dsads,42,0.0,0.846382,0.038628,0.836779,0.044555,False,False
6,dsads,1,0.2,0.87182,0.023271,0.866301,0.024313,False,False
7,dsads,1,0.2,0.841009,0.018433,0.829856,0.023474,True,False
8,dsads,1,0.2,0.869627,0.017252,0.862005,0.009672,False,True
9,dsads,42,0.2,0.845395,0.013123,0.832022,0.020177,True,False


In [3]:
# Requires df_results, which is built by the aggregation cell above (In [2]).

def filter_and_process_df(df, sha_enc_condition, miss_mod_condition):
    """Select one ablation configuration and average its rows pairwise.

    Keeps the rows whose 'Ablate ShaEnc' and 'Ablate MissModGen' values equal
    the given conditions, removes those two columns together with 'Seed',
    renumbers the rows from 0 and passes the result to
    ``average_every_two_rows``.

    Args:
        df: Results frame containing the columns named above.
        sha_enc_condition: Required value of 'Ablate ShaEnc'.
        miss_mod_condition: Required value of 'Ablate MissModGen'.

    Returns:
        Whatever ``average_every_two_rows`` returns for the selected rows.
    """
    sha_enc_matches = df['Ablate ShaEnc'] == sha_enc_condition
    miss_mod_matches = df['Ablate MissModGen'] == miss_mod_condition
    selected = df[sha_enc_matches & miss_mod_matches]

    trimmed = selected.drop(columns=['Ablate ShaEnc', 'Ablate MissModGen', 'Seed'])
    return average_every_two_rows(trimmed.reset_index(drop=True))

def average_every_two_rows(df):
    """Return a new DataFrame with every two consecutive rows averaged.

    Rows are paired purely by position (rows 0-1, 2-3, ...), so the caller
    must order the frame so that each consecutive pair belongs together. With
    an odd number of rows the last row forms a group on its own.

    NOTE: this replaces the definition of the same name from the first cell;
    the difference is that results here are rounded to 3 decimals.

    Args:
        df: Frame with columns 'Dataset', 'Miss Rate', 'Acc', 'Acc std',
            'F1' and 'F1 std'. It is not modified.

    Returns:
        New DataFrame with a fresh 0..n-1 index holding, per pair, the first
        'Dataset' and 'Miss Rate' and the mean of the four score columns,
        with numeric values rounded to 3 decimals.
    """
    numeric_cols = ['Miss Rate', 'Acc', 'Acc std', 'F1', 'F1 std']

    # reset_index() returns a new frame, which (a) keeps the numeric coercion
    # below from mutating the caller's DataFrame and (b) guarantees a 0..n-1
    # index, so `index // 2` pairs rows by position even for filtered input.
    frame = df.reset_index(drop=True)
    frame[numeric_cols] = frame[numeric_cols].apply(pd.to_numeric, errors='coerce')

    averaged_df = frame.groupby(frame.index // 2).agg({
        'Dataset': 'first',  # Keep the first dataset name
        'Miss Rate': 'first',  # Keep the first miss rate
        'Acc': 'mean',
        'Acc std': 'mean',
        'F1': 'mean',
        'F1 std': 'mean'
    }).reset_index(drop=True).round(3)

    return averaged_df

In [4]:
# Full model: rows where both ablation flags are False, averaged pairwise
# and written to CSV.
df_full_model = filter_and_process_df(
    df_results,
    sha_enc_condition=False,
    miss_mod_condition=False,
)
df_full_model.to_csv("results_full_model.csv", index=False)

banner = "=" * 32
print(banner, " Full Model ", banner)
df_full_model



Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.862,0.037,0.857,0.038
1,dsads,0.2,0.867,0.028,0.857,0.031
2,dsads,0.4,0.878,0.037,0.87,0.037
3,dsads,0.6,0.851,0.023,0.838,0.025
4,realdisp,0.0,0.934,0.027,0.93,0.032
5,realdisp,0.1,0.929,0.032,0.925,0.036
6,realdisp,0.3,0.934,0.024,0.934,0.024
7,realdisp,0.5,0.919,0.019,0.918,0.02
8,realdisp,0.7,0.829,0.035,0.826,0.036


In [5]:
# Rows with 'Ablate MissModGen' True and 'Ablate ShaEnc' False, averaged
# pairwise and written to CSV.
df_ablate_missing_gen = filter_and_process_df(
    df_results,
    sha_enc_condition=False,
    miss_mod_condition=True,
)
df_ablate_missing_gen.to_csv("results_ablate_missmodgen.csv", index=False)

banner = "=" * 32
print(banner, " Ablate MissModGen True ", banner)
df_ablate_missing_gen



Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.862,0.037,0.857,0.038
1,dsads,0.2,0.88,0.014,0.871,0.013
2,dsads,0.4,0.855,0.016,0.844,0.021
3,dsads,0.6,0.836,0.046,0.83,0.047
4,realdisp,0.0,0.934,0.027,0.93,0.032
5,realdisp,0.1,0.93,0.031,0.926,0.036
6,realdisp,0.3,0.915,0.02,0.912,0.023
7,realdisp,0.5,0.879,0.031,0.876,0.033
8,realdisp,0.7,0.768,0.039,0.766,0.04


In [6]:
# Rows with 'Ablate ShaEnc' True and 'Ablate MissModGen' False, averaged
# pairwise and written to CSV.
df_ablate_shaenc = filter_and_process_df(
    df_results,
    sha_enc_condition=True,
    miss_mod_condition=False,
)
df_ablate_shaenc.to_csv("results_ablate_shaenc.csv", index=False)

banner = "=" * 32
print(banner, " Ablate ShaEnc True ", banner)
df_ablate_shaenc



Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.883,0.023,0.878,0.025
1,dsads,0.2,0.843,0.016,0.831,0.022
2,dsads,0.4,0.837,0.036,0.828,0.04
3,dsads,0.6,0.81,0.041,0.8,0.044
4,realdisp,0.0,0.94,0.023,0.939,0.025
5,realdisp,0.1,0.936,0.025,0.933,0.03
6,realdisp,0.3,0.906,0.029,0.903,0.031
7,realdisp,0.5,0.881,0.035,0.878,0.036
8,realdisp,0.7,0.782,0.044,0.78,0.044
