In [1]:
import os
import numpy as np
import pandas as pd
import re

root_path = "../Run_logs/logs/"
path_list = os.listdir(root_path)

def parse_all_info(path_name):
    model_index = path_name.find("model_")
    data_index = path_name.find("data_")
    seed_index = path_name.find("seed_")
    miss_rate_index = path_name.find("miss_rate_")
    ablate_shared_encoder_index = path_name.find("ablate_shared_encoder_")
    ablate_missing_modality_features_index = path_name.find("ablate_missing_modality_features_")
    
    model_name = path_name[model_index + 6 : data_index - 1]
    data_name = path_name[data_index + 5 : seed_index - 1]
    seed_end_index = path_name.find("_", seed_index + 5)
    seed = int(path_name[seed_index + 5:seed_end_index])
    miss_rate = float(path_name[miss_rate_index + 10 : miss_rate_index + 13])
    ablate_shared_encoder = path_name[ablate_shared_encoder_index + 22 : ablate_shared_encoder_index + 26]
    ablate_missing_modality_features = path_name[ablate_missing_modality_features_index + 33 : ablate_missing_modality_features_index + 37]
    
    if ablate_shared_encoder == "True":
        ablate_shared_encoder = True
    else:
        ablate_shared_encoder = False

    if ablate_missing_modality_features == "True":
        ablate_missing_modality_features = True
    else:
        ablate_missing_modality_features = False
    
    return model_name, data_name, seed, miss_rate, ablate_shared_encoder, ablate_missing_modality_features


data_list = ["dsads", "realdisp"]
seed_list = [1, 42]

def extract_final_scores(path):
    """Extracts mean and standard deviation of scores from the score.txt file."""
    with open(path) as f:
        lines = f.readlines()

    # Initialize dictionaries to hold mean and std values
    mean_scores = {}
    std_scores = {}

    # Define a regex pattern to match the metric names and their values
    pattern = r"(\w+ \w+ F1|Test \w+): ([0-9.]+)"
    
    for line in lines:
        if line.startswith("MEAN") or line.startswith("STD"):
            matches = re.findall(pattern, line)
            for match in matches:
                key, value = match
                key = key.replace("Test ", "")  # Remove the 'Test' prefix from the key
                if line.startswith("MEAN"):
                    mean_scores[key] = float(value)
                elif line.startswith("STD"):
                    std_scores[key] = float(value)
    
    return mean_scores, std_scores

def average_every_two_rows(df):
    # Ensure numeric columns are of type float
    numeric_cols = ['Miss Rate', 'Acc', 'Acc std', 'F1', 'F1 std']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Calculating average of every two rows for the last four columns
    averaged_df = df.groupby(df.index // 2).agg({
        'Dataset': 'first',  # Keep the first dataset name
        'Miss Rate': 'first',  # Keep the first miss rate
        'Acc': 'mean',
        'Acc std': 'mean',
        'F1': 'mean',
        'F1 std': 'mean'
    })

    # Reset index to clean up the DataFrame
    averaged_df.reset_index(drop=True, inplace=True)

    return averaged_df


In [2]:
# Initialize an empty list to hold all parsed information and scores
data = []

# Assuming 'path_list' contains the relative paths to the folders with score.txt files
for folder in path_list:
    full_path = os.path.join(root_path, folder, "score.txt")
    if os.path.exists(full_path):
        model_name, data_name, seed, miss_rate, ablate_shared_encoder, ablate_missing_modality_features = parse_all_info(folder)
        mean_scores, std_scores = extract_final_scores(full_path)
        # Append data for Test Accuracy and weighted F1 Mean and STD
        data.append({
            'Dataset': data_name,
            'Seed': seed,
            'Miss Rate': miss_rate,
            'Acc': f"{mean_scores['Accuracy']:.3f}",
            'Acc std': f"{std_scores['Accuracy']:.3f}",
            'F1': f"{mean_scores['weighted F1']:.3f}",
            'F1 std': f"{std_scores['weighted F1']:.3f}",
            'Ablate ShaEnc': ablate_shared_encoder,
            'Ablate MissModGen': ablate_missing_modality_features
        })

df_results = pd.DataFrame(data)
df_results = df_results.sort_values(by=["Dataset", "Miss Rate", "Seed"]).reset_index(drop=True)
df_results

Unnamed: 0,Dataset,Seed,Miss Rate,Acc,Acc std,F1,F1 std,Ablate ShaEnc,Ablate MissModGen
0,dsads,1,0.0,0.879,0.036,0.877,0.031,False,False
1,dsads,1,0.0,0.876,0.012,0.871,0.015,True,False
2,dsads,1,0.0,0.879,0.036,0.877,0.031,False,True
3,dsads,42,0.0,0.89,0.035,0.885,0.035,True,False
4,dsads,42,0.0,0.846,0.039,0.837,0.045,False,True
5,dsads,42,0.0,0.846,0.039,0.837,0.045,False,False
6,dsads,1,0.2,0.872,0.023,0.866,0.024,False,False
7,dsads,1,0.2,0.856,0.024,0.847,0.026,True,False
8,dsads,1,0.2,0.891,0.02,0.89,0.018,False,True
9,dsads,42,0.2,0.845,0.031,0.835,0.04,True,False


In [3]:
# Assuming df_results is defined somewhere above

def filter_and_process_df(df, sha_enc_condition, miss_mod_condition):
    """
    Filter DataFrame based on conditions, drop specified columns, reset index, and average every two rows.
    """
    filtered_df = df[(df['Ablate ShaEnc'] == sha_enc_condition) & (df['Ablate MissModGen'] == miss_mod_condition)]
    processed_df = filtered_df.drop(columns=['Ablate ShaEnc', 'Ablate MissModGen', 'Seed']).reset_index(drop=True)
    return average_every_two_rows(processed_df)

def average_every_two_rows(df):
    """
    Returns a new DataFrame with every two rows averaged.
    """
    # Ensure numeric columns are of type float
    numeric_cols = ['Miss Rate', 'Acc', 'Acc std', 'F1', 'F1 std']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Calculating average of every two rows for the last four columns
    averaged_df = df.groupby(df.index // 2).agg({
        'Dataset': 'first',  # Keep the first dataset name
        'Miss Rate': 'first',  # Keep the first miss rate
        'Acc': 'mean',
        'Acc std': 'mean',
        'F1': 'mean',
        'F1 std': 'mean'
    })

    # Reset index to clean up the DataFrame
    averaged_df.reset_index(drop=True, inplace=True)

    return averaged_df

In [4]:
# Process and save DataFrames
df_full_model = filter_and_process_df(df_results, False, False)
df_full_model.to_csv("results_full_model.csv", index=False)

print("=" * 32, " Full Model ", "=" * 32)
df_full_model



Unnamed: 0,Dataset,Miss Rate,Acc,Acc std,F1,F1 std
0,dsads,0.0,0.8625,0.0375,0.857,0.038
1,dsads,0.2,0.8665,0.0275,0.857,0.031
2,dsads,0.4,0.878,0.0365,0.8695,0.0375
3,dsads,0.6,0.8505,0.0225,0.838,0.025
4,realdisp,0.0,0.934,0.027,0.93,0.032
5,realdisp,0.1,0.929,0.0325,0.9255,0.0365
6,realdisp,0.3,0.934,0.024,0.9335,0.024
7,realdisp,0.5,0.919,0.019,0.9175,0.02
8,realdisp,0.7,0.8295,0.0355,0.826,0.036


In [5]:
df_ablate_missing_gen = filter_and_process_df(df_results, False, True)
df_ablate_missing_gen.to_csv("results_ablate_missmodgen.csv", index=False)
print("=" * 32, " Ablate MissModGen True ", "=" * 32)
print(df_ablate_missing_gen)

    Dataset  Miss Rate     Acc  Acc std      F1  F1 std
0     dsads        0.0  0.8625   0.0375  0.8570  0.0380
1     dsads        0.2  0.8845   0.0230  0.8810  0.0235
2     dsads        0.4  0.8570   0.0410  0.8485  0.0460
3     dsads        0.6  0.8580   0.0375  0.8500  0.0410
4  realdisp        0.0  0.9340   0.0270  0.9300  0.0320
5  realdisp        0.1  0.9190   0.0250  0.9160  0.0280
6  realdisp        0.3  0.9220   0.0285  0.9205  0.0295
7  realdisp        0.5  0.8725   0.0440  0.8700  0.0455
8  realdisp        0.7  0.7845   0.0585  0.7815  0.0620


In [6]:
df_ablate_shaenc = filter_and_process_df(df_results, True, False)
df_ablate_shaenc.to_csv("results_ablate_shaenc.csv", index=False)
print("=" * 32, " Ablate ShaEnc True ", "=" * 32)
print(df_ablate_shaenc)

    Dataset  Miss Rate     Acc  Acc std      F1  F1 std
0     dsads        0.0  0.8830   0.0235  0.8780  0.0250
1     dsads        0.2  0.8505   0.0275  0.8410  0.0330
2     dsads        0.4  0.8505   0.0390  0.8380  0.0360
3     dsads        0.6  0.8125   0.0325  0.8000  0.0400
4  realdisp        0.0  0.9400   0.0235  0.9390  0.0250
5  realdisp        0.1  0.9355   0.0265  0.9320  0.0315
6  realdisp        0.3  0.9000   0.0410  0.8950  0.0455
7  realdisp        0.5  0.8545   0.0220  0.8500  0.0250
8  realdisp        0.7  0.7225   0.0510  0.7155  0.0520
