In [18]:
from scipy.stats import mannwhitneyu, normaltest
import pandas as pd
import numpy as np
from math import sqrt
from matplotlib import pyplot as plt
from typing import Tuple, List
from statistics import mean

%matplotlib inline

In [33]:
def tsv_to_df(algorithm: str, id: int) -> pd.DataFrame:
    
    path = f"../results/single_run/{algorithm}/{id}.tsv"
    
    df = pd.read_csv(filepath_or_buffer=path, sep="\t", index_col=False, skipinitialspace=True)
    
    df.columns = df.columns.str.strip()
    
    rename_dict = {
        "avg" : "mean_training_se",
        "std" : "std__training_se",
        "min" : "min__training_se",
        "max" : "max_training_se",
    }
        
    return df.rename(columns=rename_dict) 
        


# read and store all log files into dataframes
tournament_logs = []
elexicase_logs = []

for n in range(1,101):
    tournament_logs.append(
        tsv_to_df("tournament", n)
    )
    elexicase_logs.append(
        tsv_to_df("e_lexicase", n)
    )
    

In [34]:
test = tournament_logs[0]
test

Unnamed: 0,gen,nevals,mean_training_se,std__training_se,min__training_se,max_training_se,elite_testing_mse,elite_testing_err_std
0,0,50,352409000.0,2462700000.0,22.5681,17591300000.0,,
1,1,14,581.914,616.87,22.5681,4713.32,27.44,44.3033
2,2,13,965.275,4031.65,22.5681,29165.1,27.44,44.3033
3,3,7,9285.95,62377.1,22.5681,445920.0,27.44,44.3033
4,4,7,515334.0,2711920.0,22.5681,17912500.0,27.44,44.3033
5,5,10,126.016,160.71,22.5681,605.458,27.44,44.3033


In [35]:
test["nevals"].iloc[2]

13

In [52]:
def to_master_record(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    
    headers = dfs[0].columns.values.tolist()
    ngens = len(dfs[0]["gen"])
    
    master = pd.DataFrame(0, index=np.arange(ngens), columns=headers)
    
    def mean_stddev(arr: List) -> float:
        agg = 0.0
        for val in arr:
            agg += val ** 2
        return sqrt(agg / len(arr))
        
            
    
    for header in headers:
                
        for gen in range(ngens):
            
            vals = []
            
            for df in dfs:
                vals.append(
                    float(df[header].iloc[gen])
                )
        
            if not "std" in header:
                # TODO: Check if correct!
                master[header].loc[gen] = mean(vals)
            
            else:
                master[header].loc[gen] = mean_stddev(vals)
                
    return master



to_master_record(tournament_logs)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master[header].loc[gen] = mean(vals)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master[header].loc[gen] = mean_stddev(vals)


Unnamed: 0,gen,nevals,mean_training_se,std__training_se,min__training_se,max_training_se,elite_testing_mse,elite_testing_err_std
0,0,50.0,72067430000.0,4900325000000.0,144.530728,3601899000000.0,,
1,1,12.32,50273600.0,3198816000.0,135.263557,2513529000.0,134.531064,165.72044
2,2,11.8,510666.4,32014570.0,128.4806,24906150.0,128.053816,157.283951
3,3,11.38,11461250.0,682981600.0,118.798197,572357700.0,117.027291,142.951712
4,4,12.36,5748104.0,285844900.0,114.990169,286911800.0,113.47341,141.286382
5,5,12.03,45786150.0,3139989000.0,111.902936,2285297000.0,110.906479,136.094299


In [None]:
def get_dataframe(filepath: str) -> pd.DataFrame:

    # load results into pd.DataFrame
    data_path = f"../results/{filepath}"
    df = pd.read_csv(
        filepath_or_buffer=data_path,
        sep=",",
        header=0,
    )

    return df


rename_dict = {
    'tournament_selection_training_min_fitness': "fit_train_tournament",
    'tournament_selection_testing_min_fitness' : "fit_test_tournament",
    'epsilon_lexicase_selection_training_min_fitness' : "fit_train_elexicase",
    'epsilon_lexicase_selection_testing_min_fitness' : "fit_test_elexicase"
    
}

results_square = get_dataframe("multiple_runs_squared.csv")
results_square.rename(columns=rename_dict,inplace=True)

results_absolute = get_dataframe("multiple_runs_absolute.csv")
results_absolute.rename(columns=rename_dict,inplace=True)


In [None]:
# descriptive statistics for experiment run with MAE as fitness
results_absolute.describe().to_csv("../docs/rmd/tables/descriptive_stats_absolute.csv")
results_absolute.describe().to_markdown("../docs/rmd/tables/descriptive_stats_absolute.md")
results_absolute.describe()

In [None]:
# descriptive statistics for experiment run with MSE as fitness
results_square.describe().to_csv("../docs/rmd/tables/descriptive_stats_square.csv")
results_square.describe().to_markdown("../docs/rmd/tables/descriptive_stats_square.md")
results_square.describe()

In [None]:
n_squared = len(results_square)
n_absolute = len(results_absolute)

In [None]:
# Check if data series are normal distributed at alpha=5%

HEADERS = results_square.columns.values.tolist()

print("MSE-Based: \n----")
for header in HEADERS:

    print(f"Data Series: {header}")
    statistic, pval = normaltest(results_square[header])

    print(f"Statistic: {statistic}\nP-Val: {pval}\n")



print("MAE-Based: \n----")
for header in HEADERS:

    print(f"Data Series: {header}")
    statistic, pval = normaltest(results_absolute[header])

    print(f"Statistic: {statistic}\nP-Val: {pval}\n")


In [None]:
def test_mannwhitneyu(df: pd.DataFrame, xheader: str, yheader: str) -> Tuple[float,float]:
    
    ALPHA = 0.05

    statistic, pval = mannwhitneyu(x = df[xheader],y = df[yheader])
    print(f"Statistic: {statistic}\nPVal: {pval}\nPVal < ALPHA: {pval < ALPHA}")

    if pval > ALPHA:
        print(f"Results supports H0 for alpha={ALPHA}\n H0: The distribution underlying sample {xheader} is the same as the distribution underlying sample {yheader}")

    else:
        print(f"H0 can be rejected for alpha={ALPHA}\nThe distribution underlying sample {xheader} is NOT the same as the distribution underlying sample {yheader}")
    
    return statistic, pval


In [None]:
fit_train_square_mwu = test_mannwhitneyu(results_square, "fit_train_tournament", "fit_train_elexicase")

plt.figure()

subplot_train_squared = results_square.boxplot(column=[HEADERS[0], HEADERS[2]])
subplot_train_squared.set_ylabel("MSE")
subplot_train_squared.set_title(f"Distribution for n={n_squared} runs")

plt.savefig("../docs/rmd/plots/box_train_square.png")

In [None]:
fit_train_abs_mwu = test_mannwhitneyu(results_absolute, "fit_train_tournament", "fit_train_elexicase")

plt.figure()

subplot_train_absolute = results_absolute.boxplot(column=[HEADERS[0], HEADERS[2]])
subplot_train_absolute.set_ylabel("MAE")
subplot_train_absolute.set_title(f"Distribution for n={n_absolute} runs")

plt.savefig("../docs/rmd/plots/box_train_absolute.png")

In [None]:
fit_test_square_mwu = test_mannwhitneyu(results_square, "fit_test_tournament", "fit_test_elexicase")

plt.figure()

subplot_test_squared = results_square.boxplot(column=[HEADERS[1], HEADERS[3]])
subplot_test_squared.set_ylabel("MSE")
subplot_test_squared.set_title(f"Distribution for n={n_squared} runs")

plt.savefig("../docs/rmd/plots/box_test_square.png")


In [None]:
fit_test_abs_mwu = test_mannwhitneyu(results_absolute, "fit_test_tournament", "fit_test_elexicase")

plt.figure()

subplot_test_absolute = results_absolute.boxplot(column=[HEADERS[1], HEADERS[3]])
subplot_test_absolute.set_ylabel("MAE")
subplot_test_absolute.set_title(f"Distribution for n={n_absolute} runs")

plt.savefig("../docs/rmd/plots/box_test_abs.png")



In [None]:
# save mwu results in csv files

ALPHA = 0.05
PATH = "../docs/rmd/tables/testing_mwu.csv"

with open(PATH, "w") as file:
    file.write("phase,fitness_function,statistic,p-val,lt_alpha\n")
    file.write(f"testing,mse,{fit_test_square_mwu[0]},{fit_test_square_mwu[1]},{fit_test_square_mwu[1] < ALPHA}\n")
    file.write(f"testing,mae,{fit_test_abs_mwu[0]},{fit_test_abs_mwu[1]},{fit_test_abs_mwu[1] < ALPHA}\n")

PATH = "../docs/rmd/tables/training_mwu.csv"

with open(PATH, "w") as file:
    file.write("phase,fitness_function,statistic,p-val,lt_alpha\n")
    file.write(f"training,mse,{fit_train_square_mwu[0]},{fit_train_square_mwu[1]},{fit_train_square_mwu[1] < ALPHA}\n")
    file.write(f"training,mae,{fit_train_abs_mwu[0]},{fit_train_abs_mwu[1]},{fit_train_abs_mwu[1] < ALPHA}\n")

    