# Analysis of the AMLTK Pipeline
OpenFE-like Analysis

### To Do for the User

In [34]:
# No more To Dos

### Imports


In [35]:
import os
import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Read Files

In [36]:
path = os.path.abspath("")
files = os.listdir(path)
files = [f for f in files if os.path.isfile(path+'/'+f)]

In [37]:
df_all = pd.DataFrame(columns=['Dataset', 'Method', 'Fold', 'Value'])

for file in files:
    core_name = file[len('results_'):-len('.parquet')]
    parts = core_name.split('_')
    dataset = parts[0]
    method = parts[2]
    fold = parts[3]
    
    df = pd.read_parquet(file)
    df = df[df['status'] == 'success']
    
    metric_accuracy = df["metric:roc_auc_ovo [0.0, 1.0] (maximize)"].tolist()
    value = np.average(metric_accuracy)
    df_all = df_all._append({'Dataset': dataset, 'Method': method, 'Fold': fold, 'Value': value}, ignore_index=True)
print(df_all)

IndexError: list index out of range

In [None]:
# Standard Deviation
df_stddev = df_all.groupby(['Dataset', 'Method'])['Value'].std().reset_index()
df_stddev.rename(columns={'Value': 'StdDev'}, inplace=True)
df_stddev['StdDev'] = df_stddev['StdDev'].fillna(0)
# Average values
df_summary = df_all.groupby(['Dataset', 'Method'])['Value'].mean().reset_index().fillna(0)

In [None]:
# Add standard deviation to the averages values
df_summary = pd.merge(df_summary, df_stddev)
df_summary['Value_with_StdDev'] = df_summary.apply(lambda row: f"{row['Value']} ± {row['StdDev']}", axis=1)

In [None]:
# Print and write to file
pivot_table = df_summary.pivot(index='Dataset', columns='Method', values='Value_with_StdDev').fillna('0 ± 0')
pivot_table.reset_index(inplace=True)
pivot_table.to_parquet('tabular_data.parquet')
print(tabulate(pivot_table, headers='keys', tablefmt='pretty'))

### Boxplots

In [None]:
# Extract values
def extract_value(value_with_stddev):
    value, _ = value_with_stddev.split(' ± ')
    return float(value)

pivot_table_numeric = pivot_table.set_index('Dataset').applymap(extract_value).reset_index()

# Melt DataFrame to format for seaborn
melted_df = pd.melt(pivot_table_numeric, id_vars=['Dataset'], var_name='Type', value_name='Value')

plt.figure(figsize=(10, 6))
sns.boxplot(x='Type', y='Value', data=melted_df)
plt.xlabel("Type")
plt.ylabel("Value")
plt.title("Box Plot of Values by Type")
plt.xticks(rotation=45)  # Rotate x-axis labels if necessary
plt.grid(True)
plt.tight_layout()
plt.savefig('plots/box_plot.png')
plt.show()