In [1]:
import pandas as pd
import ruamel.yaml as yaml
import os
import numpy as np
import sys
import sklearn.model_selection as ms

# ## Change to Root

# In[15]:


NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

# Remember the starting directory so it can be restored if no root is found.
original_wd = os.getcwd()

# Maximum number of directory levels to climb while looking for config.yaml
num_retries = 10
for x in range(0, num_retries):
    # Try to load the config file from the current working directory
    try:
        with open("config.yaml", 'r') as stream:
            # NOTE(review): recent ruamel.yaml releases appear to deprecate the
            # module-level safe_load helper - confirm against the pinned version.
            cfg = yaml.safe_load(stream)
    # Not found here: move up one directory level and try again
    except FileNotFoundError:
        os.chdir('../')
    else:
        # Config loaded: the current directory is the project root. Stop here
        # instead of re-reading the same file on every remaining iteration.
        break
else:
    # Every level was tried without success: go back to the original working
    # directory and report the problem (cfg stays undefined in that case).
    os.chdir(original_wd)
    print(NO_CONFIG_ERR_MSG)

# Add the resolved directory to the module search path
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)

## Clean results so that similar outputs are organized by UL technique, dataset and metric/result

First clean a single result set, then generalize the cleaning to all the others.

## 1) Cleaning the Clustering Quality Metrics (SSE (minimize) for K Means, log likelihood (maximize) for GMM)

In [95]:
# Exploratory pass over a single result set: the SSE table of one directory.
alg = 'BASE'
sse_base = pd.read_csv(f'{alg}/SSE.csv')
# Replace the CSV header with explicit "<Dataset>_<Metric>" labels
sse_base.columns = ['N_Clusters', 'Madelon_SSE', 'Cars_SSE']

# Tag every row with the clustering algorithm, then go from wide to long format
sse_tagged = sse_base.assign(Clustering_Algorithm='K_Means')
sse_long = sse_tagged.melt(id_vars=['Clustering_Algorithm', 'N_Clusters'], value_name='Value')
# Split each "<Dataset>_<Metric>" label into its two parts
label_parts = sse_long.variable.str.split('_')
cleaned_sse = (sse_long
               .assign(Dataset=label_parts.str.get(0), Metric=label_parts.str.get(1))
               .drop(columns=['variable'])
)
cleaned_sse

def clean_clustering_metrics(fpath):
    """Load one clustering-quality CSV and reshape it into long format.

    The metric name is the file name up to its first ".", regardless of how
    many directories precede it. The misspelled file stem "logliklihood" is
    reported as "log-likelihood".

    Parameters
    ----------
    fpath : str
        Path to a CSV with exactly three columns. They are relabelled, in
        order, as the cluster count, the Madelon value and the Cars value.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the columns Clustering_Algorithm, N_Clusters,
        Dataset, Metric and Value. Clustering_Algorithm is "K_Means" for the
        SSE metric, "EM" for log-likelihood and NaN for any other metric.
    """
    metric_df = pd.read_csv(fpath)
    # Use the base name so nested, absolute or directory-less paths all work.
    metric = os.path.basename(fpath).split('.')[0]
    # Correct the spelling mistake in the file name
    if metric == 'logliklihood':
        metric = 'log-likelihood'
    # Label the value columns with the dataset only; the metric is attached
    # directly below instead of being parsed back out of a combined label
    # (which would truncate metric names that contain an underscore).
    metric_df.columns = ['N_Clusters', 'Madelon', 'Cars']
    clean_metric_df = (metric_df
                       .melt(id_vars=['N_Clusters'], var_name='Dataset', value_name='Value')
                       .assign(Metric=metric)
    )
    # Decide the algorithm from the metric name itself rather than from the
    # first data row, so a table without rows no longer raises an IndexError.
    algorithm_by_metric = {'log-likelihood': 'EM', 'SSE': 'K_Means'}
    clean_metric_df['Clustering_Algorithm'] = algorithm_by_metric.get(metric, np.nan)

    column_order = ['Clustering_Algorithm', 'N_Clusters', 'Dataset', 'Metric', 'Value',]

    return clean_metric_df[column_order]
def clean_clustering_validation_metrics(algorithm_file_dir_prefix):
    """Combine the cleaned SSE and log-likelihood tables of one results directory.

    Both "SSE.csv" and "logliklihood.csv" are read from
    ``algorithm_file_dir_prefix`` through ``clean_clustering_metrics`` and
    stacked, SSE rows first, under a fresh consecutive index.
    """
    source_paths = (
        f'{algorithm_file_dir_prefix}/SSE.csv',
        f'{algorithm_file_dir_prefix}/logliklihood.csv',
    )
    cleaned_frames = [clean_clustering_metrics(source) for source in source_paths]
    stacked = pd.concat(cleaned_frames, sort=False)
    return stacked.reset_index(drop=True)
    

# Paths of the two raw metric files for the directory currently held in `alg`
test_fpath1, test_fpath2 = f'{alg}/SSE.csv', f'{alg}/logliklihood.csv'

# Per-file spot checks, kept disabled:
# display(clean_clustering_metrics(test_fpath1))
# display(clean_clustering_metrics(test_fpath2))

# Cleaned SSE + log-likelihood table for the ICA results directory
clean_clustering_validation_metrics('ICA')

Unnamed: 0,Clustering_Algorithm,N_Clusters,Dataset,Metric,Value
0,K_Means,2,Madelon,SSE,25919.169412
1,K_Means,5,Madelon,SSE,23805.080932
2,K_Means,10,Madelon,SSE,21680.907243
3,K_Means,15,Madelon,SSE,20389.563032
4,K_Means,20,Madelon,SSE,19498.187803
5,K_Means,25,Madelon,SSE,18810.762444
6,K_Means,30,Madelon,SSE,18196.739312
7,K_Means,35,Madelon,SSE,17761.460021
8,K_Means,40,Madelon,SSE,17291.711202
9,K_Means,2,Cars,SSE,20441.927175


## 2) Cleaning the classification/quality metrics of the clusters (accuracy when using cluster labels as predictions, and mutual information between cluster labels and target labels)

In [106]:
# Exploratory reshape of one classification-metric table (cars accuracy):
# the unnamed first column is relabelled as the clustering algorithm and every
# other column becomes an (N_Clusters, Value) pair.
test_acc_df = pd.read_csv('BASE/cars acc.csv')
# The chain previously ended in a bare `.assign` attribute access, which made
# the cell display a bound method instead of the reshaped frame.
(test_acc_df.rename(columns={'Unnamed: 0':'Clustering_Algorithm'})
 .melt(id_vars='Clustering_Algorithm', var_name='N_Clusters', value_name='Value',)
)

def clean_clustering_classification_metric_df(fpath):
    """Load one "<dataset> <metric>.csv" table and reshape it into long format.

    The metric code is the last space-separated word of the file name (up to
    its first "."), regardless of how many directories precede the file or
    whether the dataset name itself contains spaces.

    Parameters
    ----------
    fpath : str
        Path to a CSV whose "Unnamed: 0" column is relabelled as
        Clustering_Algorithm; every other column is melted into rows.

    Returns
    -------
    pandas.DataFrame
        Columns Clustering_Algorithm, N_Clusters, Value and Metric. Metric is
        "Accuracy" for the code "acc", "Mutual_Information" for "adjMI" and
        NaN for any other code. N_Clusters holds the melted column labels as
        read from the CSV header; they are not converted to numbers here.
    """
    metric_df = pd.read_csv(fpath)
    # Use the base name so nested, absolute or directory-less paths all work,
    # and take the last word so a dataset name with spaces cannot shift it.
    file_stem = os.path.basename(fpath).split('.')[0]
    metric = file_stem.split(' ')[-1]
    metric_names = {'acc': 'Accuracy', 'adjMI': 'Mutual_Information'}
    clean_metric_df = (metric_df.rename(columns={'Unnamed: 0':'Clustering_Algorithm'})
     .melt(id_vars='Clustering_Algorithm', var_name='N_Clusters', value_name='Value',)
     .assign(Metric=metric_names.get(metric, np.nan))
    )

    return clean_metric_df

def clean_clustering_classification_metrics(algorithm_file_dir_prefix, collection_of_dataset_names):
    """Collect the accuracy and adjusted-MI tables of several datasets.

    For each name in ``collection_of_dataset_names`` the files
    "<dataset> acc.csv" and "<dataset> adjMI.csv" are read from
    ``algorithm_file_dir_prefix`` through
    ``clean_clustering_classification_metric_df``, tagged with a Dataset
    column and stacked (accuracy rows before adjMI rows).

    Returns one frame with the columns Clustering_Algorithm, N_Clusters,
    Dataset, Metric and Value under a fresh consecutive index.
    """
    per_dataset_frames = []
    for dataset in collection_of_dataset_names:
        source_paths = (
            f'{algorithm_file_dir_prefix}/{dataset} acc.csv',
            f'{algorithm_file_dir_prefix}/{dataset} adjMI.csv',
        )
        # Clean each file and record which dataset its rows belong to
        tagged_frames = [
            clean_clustering_classification_metric_df(source).assign(Dataset=dataset)
            for source in source_paths
        ]
        dataset_frame = pd.concat(tagged_frames, sort=False).reset_index(drop=True)
        per_dataset_frames.append(dataset_frame)

    # Stack the per-dataset tables and fix the column order
    combined = pd.concat(per_dataset_frames, sort=False).reset_index(drop=True)
    return combined[['Clustering_Algorithm', 'N_Clusters', 'Dataset', 'Metric', 'Value']]

def clean_all_clustering_non_grid_search_metrics(algorithm_file_dir_prefix, collection_of_dataset_names):
    """Gather every non-grid-search clustering metric of one results directory.

    Stacks the output of ``clean_clustering_validation_metrics`` (SSE for
    K-Means, log-likelihood for EM) on top of the output of
    ``clean_clustering_classification_metrics`` (accuracy and mutual
    information) and returns the result under a fresh consecutive index.
    """
    validation_part = clean_clustering_validation_metrics(algorithm_file_dir_prefix)
    classification_part = clean_clustering_classification_metrics(
        algorithm_file_dir_prefix, collection_of_dataset_names
    )
    all_metrics = pd.concat([validation_part, classification_part], sort=False)
    return all_metrics.reset_index(drop=True)

In [107]:
# All cleaned clustering metrics of the BASE results directory for both datasets
clean_all_clustering_non_grid_search_metrics(
    algorithm_file_dir_prefix='BASE',
    collection_of_dataset_names=['cars', 'Madelon'],
)

Unnamed: 0,Clustering_Algorithm,N_Clusters,Dataset,Metric,Value
0,K_Means,2,Madelon,SSE,901709.421231
1,K_Means,5,Madelon,SSE,891880.862735
2,K_Means,10,Madelon,SSE,885554.749546
3,K_Means,15,Madelon,SSE,881723.610571
4,K_Means,20,Madelon,SSE,878695.336027
5,K_Means,25,Madelon,SSE,875795.699936
6,K_Means,30,Madelon,SSE,873355.730479
7,K_Means,35,Madelon,SSE,870519.763084
8,K_Means,40,Madelon,SSE,867758.194699
9,K_Means,2,Cars,SSE,29401.376985


In [3]:
# For every (algorithm, dataset) score table, keep the component setting of
# the row with the highest mean test score.
records = []
for algorithm in ['BASE', 'ICA', 'PCA', 'RP', 'RF']:
    for dataset in ['cars', 'madelon']:
        tmp_csv_scores = pd.read_csv(f'{algorithm}/{dataset} dim red.csv')
        # Best-scoring row first
        ranked_scores = tmp_csv_scores.sort_values(by='mean_test_score', ascending=False)
        # First cell among the columns whose name mentions "components" or "filter"
        best_n_components = ranked_scores.filter(regex='components|filter').values[0][0]
        best_acc = ranked_scores.mean_test_score.values[0]
        records.append((algorithm, dataset, best_n_components, best_acc))

result_columns = ['Algorithm', 'Dataset', 'N_Components_Maximizing_Test_Accuracy', 'Best_Test_Acc']
best_N_components = pd.DataFrame(records, columns=result_columns)
# Persist the summary as a CSV in the current working directory
best_N_components.to_csv('best_N_components.csv', index=False)
best_N_components

Unnamed: 0,Algorithm,Dataset,N_Components_Maximizing_Test_Accuracy,Best_Test_Acc
0,ICA,cars,14,0.710069
1,ICA,madelon,15,0.599451
2,PCA,cars,12,0.855324
3,PCA,madelon,5,0.789011
4,RP,cars,35,0.824653
5,RP,madelon,45,0.545604
6,RF,cars,16,0.815972
7,RF,madelon,20,0.863736
