# Evaluate Distance to Aggregation
Due to the number of columns that have null values we have two options when creating training data for our models:
1. Keep the null values and use a model that allows for nulls
1. Impute the null values 

In this notebook I will look at different ways to impute these values. For each column we compute an aggregate (e.g. mean, median), measure the absolute difference between every value in the column and that aggregate, and sum those differences, scaled by the number of rows. The smaller the total, the better the aggregate represents the column.

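i.e. for each non-null value $x_i$ in a column with missing values:

$$y_i = \lvert x_i - \text{agg}(x) \rvert, \qquad \text{distance} = \frac{1}{N}\sum_i y_i$$

where $\text{agg}$ is the chosen aggregation (e.g. the mean) and $N$ is the number of rows in the data.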

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Competition feature table; the first CSV column is used as the index.
data = pd.read_csv(
    "../input/tabular-playground-series-jun-2022/data.csv",
    index_col=0,
)
# Sample submission file, likewise indexed by its first column.
sub = pd.read_csv(
    "../input/tabular-playground-series-jun-2022/sample_submission.csv",
    index_col=0,
)

# F Groupings

In [None]:
# One list of column names per feature family: a column belongs to a family
# when its name contains the family tag as a substring.
F_1_cols, F_2_cols, F_3_cols, F_4_cols = (
    [name for name in data.columns if tag in name]
    for tag in ("F_1", "F_2", "F_3", "F_4")
)
# Every column of the frame, in its original order.
all_cols = list(data.columns)

# Evaluate imputation methods per F-Grouping 
We will look at each group of columns (F_1, F_3 & F_4) and compare imputation methods to determine which one best represents the data.

Methods to be used: 
* Mean, median, mode aggregation 
* Groupby columns (we will use F_2 columns to group by as these are integers) with mean/median/mode aggregation

In [None]:
# Name of the aggregation chosen for imputation.
# NOTE(review): no cell visible in this notebook reads this variable — confirm it is still needed.
impute_type = "mean"

In [None]:
def baseline_measure(eval_cols,type_agg = "mean"):
    """Plot and return each column's scaled absolute distance to one of its own aggregates.

    For every column in ``eval_cols`` the chosen aggregate of that column
    (taken from the module-level ``data`` frame) is subtracted from each value.
    The absolute differences are divided by the total number of rows in
    ``data`` and summed. Null entries are skipped by the sum but still count
    towards the divisor.

    Parameters
    ----------
    eval_cols : iterable of str
        Names of the columns of ``data`` to evaluate.
    type_agg : {"mean", "std", "median"}, default "mean"
        Aggregate each column is compared against.

    Returns
    -------
    list
        One distance per entry of ``eval_cols``, in the same order.

    Raises
    ------
    ValueError
        If ``type_agg`` is not one of the supported aggregations.
    """
    supported = ("mean", "std", "median")
    # Fail early with a clear message; previously an unknown name surfaced as
    # an unbound-variable error inside the loop.
    if type_agg not in supported:
        raise ValueError(
            f"Unsupported type_agg {type_agg!r}; expected one of {supported}"
        )

    eval_cols = list(eval_cols)
    n_rows = len(data)  # loop-invariant divisor (includes rows with nulls)

    base_list = []
    for col in eval_cols:
        values = data[col]
        # The supported names match the Series method names, so look the
        # aggregation up directly instead of branching.
        reference = getattr(values, type_agg)()
        base_list.append((np.abs(values - reference)/n_rows).sum())

    plt.figure(figsize = (25,5))
    ax = sns.barplot(x = eval_cols, y = base_list)
    # With no columns there are no bars to label.
    if ax.containers:
        ax.bar_label(ax.containers[0])
    plt.title(f"Measurement of each columns closeness to its {type_agg}")
    plt.show()

    return base_list

In [None]:
#np.sum(np.abs((data['F_4_0'] - data.mean()['F_4_0'])/len(data)))

#(np.abs(data["F_4_1"] - data.groupby("F_2_1").transform('mean')["F_4_1"])/len(data)).sum()

In [None]:
def groupby_measures(eval_cols,type_agg = "mean"):
    """Find, per column, the F_2 grouping whose group aggregate lies closest to the data.

    For every column in ``eval_cols`` and every grouping column in the
    module-level ``F_2_cols``, the rows of ``data`` are grouped by the grouping
    column and the column's per-group aggregate is broadcast back to the rows.
    The absolute differences between the values and their group aggregate are
    divided by the total number of rows in ``data`` and summed. One bar chart
    per evaluated column shows that distance for every grouping column.

    Parameters
    ----------
    eval_cols : iterable of str
        Names of the columns of ``data`` to evaluate.
    type_agg : str, default "mean"
        Aggregation name handed to the groupby ``transform`` call.

    Returns
    -------
    dict
        Maps each evaluated column to the string
        ``"<best grouping column> : <smallest distance>"``.
    """
    eval_cols = list(eval_cols)
    n_grid_rows = 5
    # Ceiling division so every column gets a panel even when the count is not
    # a multiple of five; at least one grid column is always created.
    n_grid_cols = max(1, -(-len(eval_cols) // n_grid_rows))
    # squeeze=False keeps a 2-D array of axes whatever the grid shape is.
    fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize = (25,30), squeeze = False)
    ax = ax.ravel()

    n_rows = len(data)  # loop-invariant divisor (includes rows with nulls)
    # Build each groupby once; it is reused for every evaluated column.
    grouped = {groupby_col: data.groupby(groupby_col) for groupby_col in F_2_cols}

    min_vals = {}
    for i, column in enumerate(eval_cols):
        dist_list = []
        for groupby_col in F_2_cols:
            # Aggregate only the column of interest rather than the whole frame.
            group_agg = grouped[groupby_col][column].transform(type_agg)
            dist_list.append((np.abs(data[column] - group_agg)/n_rows).sum())

        best = int(np.argmin(dist_list))
        min_vals[column] = str(F_2_cols[best])+" : "+str(dist_list[best])

        sns.barplot(ax = ax[i], x=F_2_cols, y=dist_list)
        # Zoom the y-axis onto the observed range so small differences are visible.
        ax[i].set(ylim = (np.min(dist_list), np.max(dist_list)))
        ax[i].set_title(f"{column}: best groupby {F_2_cols[best]} ={dist_list[best]}")
        ax[i].tick_params(labelrotation=90)

    # Hide grid panels that have no column assigned to them.
    for unused in ax[len(eval_cols):]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.show()

    return min_vals

In [None]:
def dataframe_formatter(baseline_mean,baseline_median,baseline_mode, groupings):
    """Assemble baseline and best-grouping distances into one table.

    Parameters
    ----------
    baseline_mean, baseline_median, baseline_mode : sequence of float
        One value per key of ``groupings``, in the same order. They are stored
        under the columns ``baseline_mean``, ``baseline_median`` and
        ``baseline_mode`` respectively, whatever aggregation produced them.
    groupings : dict
        Maps a column name to a string of the form
        ``"<grouping column> : <distance>"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the keys of ``groupings`` with the columns ``Best_grp_col``,
        ``Grp_val``, ``baseline_mean``, ``baseline_median`` and
        ``baseline_mode``.

    Raises
    ------
    ValueError
        If a ``groupings`` value has no ``:`` separator or its distance part
        is not a number.
    """
    best_cols = []
    best_vals = []
    for key, entry in groupings.items():
        # Split on the LAST colon so a grouping name that itself contains ':'
        # is still parsed correctly.
        name, sep, value = str(entry).rpartition(":")
        if not sep:
            raise ValueError(
                f"Grouping entry for {key!r} is not of the form 'column : value': {entry!r}"
            )
        # Strip the padding around the separator so the name matches the real
        # column name exactly.
        best_cols.append(name.strip())
        best_vals.append(float(value))

    # list(...) forces positional assignment, so an indexed input is never
    # re-aligned against the result index.
    return pd.DataFrame(
        {
            "Best_grp_col": best_cols,
            "Grp_val": best_vals,
            "baseline_mean": list(baseline_mean),
            "baseline_median": list(baseline_median),
            "baseline_mode": list(baseline_mode),
        },
        index=list(groupings),
    )

# F_1 cols 

In [None]:
# Distance of every F_1 column to its own mean, median and std; one bar chart
# is drawn per aggregation.
base_f1_mean, base_f1_median, base_f1_std = (
    baseline_measure(F_1_cols, type_agg=agg_name)
    for agg_name in ("mean", "median", "std")
)

In [None]:
# For every F_1 column, find the F_2 grouping column whose per-group median is closest to the values.
f1_best_grps = groupby_measures(F_1_cols, type_agg = "median")

In [None]:
# Combine the F_1 baselines with the best F_2 grouping per column and save the table.
# NOTE(review): the std-based baseline is passed as the third argument, which
# dataframe_formatter stores under the "baseline_mode" column — confirm the label.
f_1_results = dataframe_formatter(base_f1_mean, base_f1_median, base_f1_std, f1_best_grps)
f_1_results.to_csv("F1_results.csv")

# Display the table with the smallest distance in each row highlighted.
(
    f_1_results.style
    .format('{:.4g}', subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'])
    .highlight_min(
        subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'],
        color='lightgreen',
        axis=1,
    )
)

# F_3 cols 

In [None]:
# Distance of every F_3 column to its own mean, median and std; one bar chart
# is drawn per aggregation.
base_f3_mean, base_f3_median, base_f3_std = (
    baseline_measure(F_3_cols, type_agg=agg_name)
    for agg_name in ("mean", "median", "std")
)

In [None]:
# For every F_3 column, find the F_2 grouping column whose per-group median is closest to the values.
# NOTE(review): the F_3 results cell visible in this notebook uses the mean-based groupings, not this result — confirm which is intended.
f3_grps_median = groupby_measures(F_3_cols, type_agg = "median")

In [None]:
# Same search for the F_3 columns, this time comparing against the per-group mean.
f3_grps_mean = groupby_measures(F_3_cols, type_agg = "mean")

In [None]:
# Combine the F_3 baselines with the best F_2 grouping per column and save the table.
# NOTE(review): this cell uses the mean-based groupings, whereas the F_1 and F_4
# cells use median-based ones — confirm that the difference is intended.
# NOTE(review): the std-based baseline is passed as the third argument, which
# dataframe_formatter stores under the "baseline_mode" column — confirm the label.
f_3_results = dataframe_formatter(base_f3_mean, base_f3_median, base_f3_std, f3_grps_mean)
f_3_results.to_csv("f_3_results.csv")

# Display the table with the smallest distance in each row highlighted.
(
    f_3_results.style
    .format('{:.4g}', subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'])
    .highlight_min(
        subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'],
        color='lightgreen',
        axis=1,
    )
)

# F_4 cols

In [None]:
# Distance of every F_4 column to its own mean, median and std; one bar chart
# is drawn per aggregation.
base_f4_mean, base_f4_median, base_f4_std = (
    baseline_measure(F_4_cols, type_agg=agg_name)
    for agg_name in ("mean", "median", "std")
)

In [None]:
# For every F_4 column, find the F_2 grouping column whose per-group median is closest to the values.
f4_best_grps = groupby_measures(F_4_cols, type_agg = "median")

In [None]:
# Combine the F_4 baselines with the best F_2 grouping per column and save the table.
# NOTE(review): the std-based baseline is passed as the third argument, which
# dataframe_formatter stores under the "baseline_mode" column — confirm the label.
f_4_results = dataframe_formatter(base_f4_mean, base_f4_median, base_f4_std, f4_best_grps)
f_4_results.to_csv("f_4_results.csv")

# Display the table with the smallest distance in each row highlighted.
(
    f_4_results.style
    .format('{:.4g}', subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'])
    .highlight_min(
        subset=['Grp_val', 'baseline_mean', 'baseline_median', 'baseline_mode'],
        color='lightgreen',
        axis=1,
    )
)

In [None]:
# baseline = pd.DataFrame(base_f4, index = cols_to_eval, columns=["baseline_mean"])
# df =pd.DataFrame({"Best_grp_col":[i.split(':')[0] for i in list(f4_best_grps.values())],
#              "Grp_val":[float(i.split(':')[1]) for i in list(f4_best_grps.values())]}, index =f4_best_grps.keys() )
# df = df.merge(baseline,left_index=True, right_index= True)
# df.style.format('{:.4g}',subset= ['Grp_val','baseline_mean']).highlight_min(subset = ['Grp_val','baseline_mean'],color = 'lightgreen', axis = 1)

In [None]:
# min_vals = {}
# for i, column in enumerate(cols_to_eval):   
#     measure_list = []
    
#     #groupby evaluation
#     for groupby_col in F_2_cols:
#         measure =( not_missing[column] - not_missing.groupby(groupby_col).transform('mean')[column])/ len(not_missing)
#         measure_list.append( measure.sum())

#     min_vals[column] = str(F_2_cols[np.argmin(np.abs(measure_list))])+" : "+str(measure_list[np.argmin(np.abs(measure_list))])
# min_vals

In [None]:
# fig, ax = plt.subplots(5,3, figsize = (25,30))
# ax = ax.ravel()

# min_vals = {}
# for i, column in enumerate(cols_to_eval):   
#     measure_list = []
    
#     #groupby evaluation
#     for groupby_col in F_2_cols:
#         measure =( data[column] - data.groupby(groupby_col).transform('mean')[column])/ len(data)
#         measure_list.append( measure.sum())
#     min_vals[column] = str(F_2_cols[np.argmin(np.abs(measure_list))])+" : "+str(measure_list[np.argmin(np.abs(measure_list))])
    
#     sns.barplot(ax = ax[i], x=F_2_cols, y=measure_list)
#     # ax[i].bar_label(ax[i].containers[0])
#     ax[i].set_title(f"{column}: best groupby {F_2_cols[np.argmin(np.abs(measure_list))]} ={measure_list[np.argmin(np.abs(measure_list))]}")
#     ax[i].tick_params(labelrotation=90)
    
# plt.tight_layout()
# plt.show()