In [1]:
import pandas as pd
import ruamel.yaml as yaml
import os
import numpy as np
import sys
import sklearn.model_selection as ms

# ## Change to Root

# In[15]:


NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""        

original_wd = os.getcwd()

# Number of times to move back in directory
num_retries = 10
for x in range(0, num_retries):
    # try to load config file    
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
    # If not found move back one directory level
    except FileNotFoundError:
        os.chdir('../')
        # If reached the max number of directory levels change to original wd and print error msg
        if x+1 == num_retries:
            os.chdir(original_wd)
            print(NO_CONFIG_ERR_MSG)
            
# Add directory to PATH
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)

## Clean results so that similar outputs are organized by UL technique, dataset and metric/result

First clean based on one set and then expand to all others

## 1) Cleaning the Clustering Quality Metrics (SSE (minimize) for K Means, log likelihood (maximize) for GMM)

In [2]:
alg = 'BASE'
sse_base = pd.read_csv(f'{alg}/SSE.csv')
# Adjust column names
sse_base.columns = ['N_Clusters', 'Madelon_SSE', 'Cars_SSE']
cleaned_sse = (sse_base.assign(Clustering_Algorithm = 'K_Means') 
               .melt(id_vars=['Clustering_Algorithm', 'N_Clusters'], value_name='Value')
               .assign(Dataset= lambda X: X.variable.str.split('_').str.get(0))
               .assign(Metric= lambda X: X.variable.str.split('_').str.get(1))
               .drop(columns=['variable'])
)
cleaned_sse

def clean_clustering_metrics(fpath):
    metric_df = pd.read_csv(fpath)
    metric = fpath.split('/')[1].split('.')[0]
    # Correct Spelling mistake
    if metric == 'logliklihood':
        metric='log-likelihood'
    metric_df.columns = ['N_Clusters', 'Madelon_'+metric, 'Cars_'+metric]
    clean_metric_df = (metric_df
                   .melt(id_vars=['N_Clusters'], value_name='Value')
                   .assign(Dataset= lambda X: X.variable.str.split('_').str.get(0))
                   .assign(Metric= lambda X: X.variable.str.split('_').str.get(1))
                   .drop(columns=['variable'])
    )
    if clean_metric_df.Metric.unique()[0] == 'log-likelihood':
        clean_metric_df['Clustering_Algorithm'] = 'EM'
    elif clean_metric_df.Metric.unique()[0] == 'SSE':
        clean_metric_df['Clustering_Algorithm'] = 'K_Means'
    else:
        clean_metric_df['Clustering_Algorithm'] = np.nan
        
    column_order = ['Clustering_Algorithm', 'N_Clusters', 'Dataset', 'Metric', 'Value',]

    return clean_metric_df[column_order]
def clean_clustering_validation_metrics(algorithm_file_dir_prefix):
    sse_fpath = f'{algorithm_file_dir_prefix}/SSE.csv'
    em_fpath = f'{algorithm_file_dir_prefix}/logliklihood.csv'
    
    clean_sse = clean_clustering_metrics(sse_fpath)
    clean_em = clean_clustering_metrics(em_fpath)
    
    clean_clustering_metics_df = (pd.concat([clean_sse, clean_em],
                                         sort=False)
                               .reset_index(drop=True)
                              )
    return clean_clustering_metics_df
    

test_fpath1 = f'{alg}/SSE.csv'
test_fpath2 = f'{alg}/logliklihood.csv'

# display(clean_clustering_metrics(test_fpath1))
# display(clean_clustering_metrics(test_fpath2))
clean_clustering_validation_metrics('ICA')

Unnamed: 0,Clustering_Algorithm,N_Clusters,Dataset,Metric,Value
0,K_Means,2,Madelon,SSE,25919.169412
1,K_Means,5,Madelon,SSE,23805.080932
2,K_Means,10,Madelon,SSE,21680.907243
3,K_Means,15,Madelon,SSE,20389.563032
4,K_Means,20,Madelon,SSE,19498.187803
5,K_Means,25,Madelon,SSE,18810.762444
6,K_Means,30,Madelon,SSE,18196.739312
7,K_Means,35,Madelon,SSE,17761.460021
8,K_Means,40,Madelon,SSE,17291.711202
9,K_Means,2,Cars,SSE,20441.927175


## 2) Cleaning the classification/quality metrics of the clusters (accuracy when using cluster labels as predictions and Mutual information between cluster labels and target labels

In [17]:
test_acc_df = pd.read_csv('BASE/cars acc.csv')
(test_acc_df.rename(columns={'Unnamed: 0':'Clustering_Algorithm'})
 .melt(id_vars='Clustering_Algorithm', var_name='N_Clusters', value_name='Value',)
 .assign

)

def clean_clustering_classification_metric_df(fpath):
    metric_df = pd.read_csv(fpath)
    metric = fpath.split('/')[1].split('.')[0].split(' ')[1]
    # Correct Spelling mistake
    clean_metric_df = (metric_df.rename(columns={'Unnamed: 0':'Clustering_Algorithm'})
     .melt(id_vars='Clustering_Algorithm', var_name='N_Clusters', value_name='Value',)
    )
    if metric == 'acc':
        clean_metric_df['Metric'] = 'Accuracy'
    elif metric == 'adjMI':
        clean_metric_df['Metric'] = 'Mutual_Information'
    else:
        clean_metric_df['Metric'] = np.nan
    
    return clean_metric_df

def clean_clustering_classification_metrics(algorithm_file_dir_prefix, collection_of_dataset_names):
    clean_metric_dfs = []
    for dataset in collection_of_dataset_names:
        acc_fpath = f'{algorithm_file_dir_prefix}/{dataset} acc.csv'
        adjmi_fpath = f'{algorithm_file_dir_prefix}/{dataset} adjMI.csv'
#         print(acc_fpath)
#         print(adjmi_fpath)
        clean_acc = clean_clustering_classification_metric_df(acc_fpath)
        clean_acc['Dataset'] = dataset
        clean_adjmi = clean_clustering_classification_metric_df(adjmi_fpath)
        clean_adjmi['Dataset'] = dataset


        clean_classification_metric_df = (pd.concat([clean_acc, clean_adjmi],
                                             sort=False)
                                   .reset_index(drop=True)
                                  )
        clean_metric_dfs.append(clean_classification_metric_df)
    
    # Concat all together
    clean_metric_df = (pd.concat(clean_metric_dfs,
                                             sort=False)
                                   .reset_index(drop=True)
                                  )
    column_order = ['Clustering_Algorithm', 'N_Clusters', 'Dataset', 'Metric', 'Value',]


    return clean_metric_df[column_order]

def clean_all_clustering_non_grid_search_metrics(algorithm_file_dir_prefix, collection_of_dataset_names):
    """Clean the clustering metrics (Accuracy, Mutual Info, SSE for Kmeans and Likelihood for EM
    and pull into a single clean dataframe"""
    clean_clustering_validation_metrics_df = clean_clustering_validation_metrics(algorithm_file_dir_prefix)
    clean_clustering_classification_metrics_df = clean_clustering_classification_metrics(algorithm_file_dir_prefix, 
                                                                                         collection_of_dataset_names)
    clean_metric_df = (pd.concat([clean_clustering_validation_metrics_df, clean_clustering_classification_metrics_df],
                                             sort=False)
                                   .reset_index(drop=True)
                                  )
    clean_metric_df['Data_Perspective'] = algorithm_file_dir_prefix
    return clean_metric_df



Getting one such pairing of clustering metrics

In [66]:
clean_all_clustering_non_grid_search_metrics('BASE', ['Cars', 'Madelon']).head()

Unnamed: 0,Clustering_Algorithm,N_Clusters,Dataset,Metric,Value,Data_Perspective
0,K_Means,2,Madelon,SSE,901709.421231,BASE
1,K_Means,5,Madelon,SSE,891880.862735,BASE
2,K_Means,10,Madelon,SSE,885554.749546,BASE
3,K_Means,15,Madelon,SSE,881723.610571,BASE
4,K_Means,20,Madelon,SSE,878695.336027,BASE


In [67]:
%%time
all_algorithm_clustering_methods = pd.concat([clean_all_clustering_non_grid_search_metrics(algorithm, ['Cars', 'Madelon'])
                                    for algorithm in ['BASE', 'ICA', 'PCA', 'RP', 'RF']])

Wall time: 236 ms


# Export to results HDF

In [68]:
all_algorithm_clustering_methods.to_hdf('results/results.hdf', key='clustering', complib='blosc',complevel=9)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->['Clustering_Algorithm', 'N_Clusters', 'Dataset', 'Metric', 'Data_Perspective']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


Applying to all algorithms 

These should be equal

In [69]:
all_algorithm_clustering_methods.Data_Perspective.value_counts()

RP      108
RF      108
ICA     108
BASE    108
PCA     108
Name: Data_Perspective, dtype: int64

In [70]:
all_algorithm_clustering_methods.Dataset.value_counts()

Madelon    270
Cars       270
Name: Dataset, dtype: int64

## Compile T-SNE Results to one table

In [31]:
test_alg = 'BASE'
test_data = 'madelon'
tsne_test = pd.read_csv(f'{test_alg}/{test_data}2D.csv').drop(columns=['Unnamed: 0'])
tsne_test.head()


Unnamed: 0,x,y,target
0,-19.816355,-39.049294,1.0
1,19.696314,-3.21764,-1.0
2,-16.734158,-17.890333,-1.0
3,5.546829,-13.654442,-1.0
4,-50.886955,-6.579526,-1.0


In [42]:
def pull_tsne(algorithm, dataset):
    tsne = pd.read_csv(f'{algorithm}/{dataset}2D.csv').drop(columns=['Unnamed: 0'])
    tsne['Dataset'] = dataset
    tsne['Data_Perspective'] = algorithm
    return tsne

tsne_dfs = [pull_tsne(algorithm, 'cars')
 for algorithm 
 in ['BASE', 'ICA', 'PCA', 'RP', 'RF']]+ [pull_tsne(algorithm, 'madelon')
 for algorithm 
 in ['BASE', 'ICA', 'PCA', 'RP', 'RF']]



pd.concat(tsne_dfs).groupby(by=['Dataset', 'Data_Perspective']).size()

Dataset  Data_Perspective
cars     BASE                1728
         ICA                 1728
         PCA                 1728
         RF                  1728
         RP                  1728
madelon  BASE                1820
         ICA                 1820
         PCA                 1820
         RF                  1820
         RP                  1820
dtype: int64

In [72]:
tsne_df = pd.concat(tsne_dfs).rename(columns={'x':'X', 'y':'Y', 'target':'Target'})
tsne_df.head()

Unnamed: 0,X,Y,Target,Dataset,Data_Perspective
0,-31.615511,-25.296206,0.0,cars,BASE
1,-17.212791,-12.54337,0.0,cars,BASE
2,-27.641699,-27.702856,0.0,cars,BASE
3,-19.924063,-29.720591,0.0,cars,BASE
4,-17.961168,-15.16667,0.0,cars,BASE


In [73]:
tsne_df.to_hdf('results/results.hdf', key='tsne', complib='blosc',complevel=9)

## Compile GridSearch Results to one table

In [152]:
def pull_grid_search(algorithm, dataset, clustering=True):
    """Given a data unsupervised learning algorithm, dataset name and whether it's 
    cluster related or not, pull grid search results"""
    # Load the clustering data if necesary
    if clustering:
        # Load in Grid Searches from Clustering efforts
        cluster_alg = 'GMM'
        cluster1_df = pd.read_csv(f'{algorithm}/{dataset} cluster {cluster_alg}.csv').drop(columns=['Unnamed: 0'])
        cluster1_df['Data_Perspective'] = algorithm        
        cluster1_df['Dataset'] = dataset
        cluster1_df['Clustering_Algorithm'] = cluster_alg
        cluster1_df['Clustered_Data'] = 1
        # Rename N components so that the concatenation works with columns aligned
        n_components_colname = cluster1_df.filter(regex='(_n_|filt)').columns.values.tolist()[0]
        cluster1_df.rename(columns={n_components_colname:'N_Components/Clusters/Features'}, inplace=True)
        # Remove Individual Split columns
        split_columns = cluster1_df.filter(regex='split').columns.values.tolist()
        cluster1_df = cluster1_df.drop(columns=split_columns)

        cluster_alg = 'Kmeans'
        cluster2_df = pd.read_csv(f'{algorithm}/{dataset} cluster {cluster_alg}.csv').drop(columns=['Unnamed: 0'])
        cluster2_df['Data_Perspective'] = algorithm        
        cluster2_df['Dataset'] = dataset        
        cluster2_df['Clustering_Algorithm'] = cluster_alg
        cluster2_df['Clustered_Data'] = 1    
        n_components_colname = cluster2_df.filter(regex='(_n_|filt)').columns.values.tolist()[0]
        cluster2_df.rename(columns={n_components_colname:'N_Components/Clusters/Features'}, inplace=True)
        # Remove Individual Split columns
        split_columns = cluster2_df.filter(regex='split').columns.values.tolist()
        cluster2_df = cluster2_df.drop(columns=split_columns)
        
        
    # There's no dimension reduction for BASE data
    if algorithm != 'BASE':
        grid_search_df = pd.read_csv(f'{algorithm}/{dataset} dim red.csv').drop(columns=['Unnamed: 0'])
        grid_search_df['Data_Perspective'] = algorithm        
        grid_search_df['Dataset'] = dataset
        grid_search_df['Clustering_Algorithm'] = 'None'
        grid_search_df['Clustered_Data'] = 0 
        n_components_colname = grid_search_df.filter(regex='(_n_|filt)').columns.values.tolist()[0]
        grid_search_df.rename(columns={n_components_colname:'N_Components/Clusters/Features'}, inplace=True) 
        # Remove Individual Split columns
        split_columns = grid_search_df.filter(regex='split').columns.values.tolist()
        grid_search_df = grid_search_df.drop(columns=split_columns)        
    
  
    if clustering & (algorithm != 'BASE'):         
        clean_grid_search =  pd.concat([cluster1_df, cluster2_df, grid_search_df])
        return clean_grid_search
    elif clustering & (algorithm == 'BASE'):
        clean_grid_search =  pd.concat([cluster1_df, cluster2_df])
        return clean_grid_search
    else:
        return 'Either clustering or grid search not found'
    
    

    
pull_grid_search('BASE', 'Cars').groupby(by=['Data_Perspective',
 'Dataset',
 'Clustering_Algorithm',
 'Clustered_Data']).size()

Data_Perspective  Dataset  Clustering_Algorithm  Clustered_Data
BASE              Cars     GMM                   1                 180
                           Kmeans                1                 180
dtype: int64

In [153]:
pull_grid_search('BASE', 'Cars').shape

(360, 17)

In [154]:
pull_grid_search('ICA', 'Cars').shape

(560, 17)

In [155]:
pull_grid_search('ICA', 'Cars').groupby(by=['Data_Perspective',
 'Dataset',
 'Clustering_Algorithm',
 'Clustered_Data']).size()

Data_Perspective  Dataset  Clustering_Algorithm  Clustered_Data
ICA               Cars     GMM                   1                 180
                           Kmeans                1                 180
                           None                  0                 200
dtype: int64

In [156]:
grid_search_dfs = [pull_grid_search(algorithm, 'Cars')
 for algorithm 
 in ['BASE', 'ICA', 'PCA', 'RP', 'RF']] + [pull_grid_search(algorithm, 'Madelon')
 for algorithm 
 in ['BASE', 'ICA', 'PCA', 'RP', 'RF']]



pd.concat(grid_search_dfs).groupby(by=['Dataset', 'Data_Perspective']).size()

Dataset  Data_Perspective
Cars     BASE                360
         ICA                 560
         PCA                 540
         RF                  560
         RP                  560
Madelon  BASE                360
         ICA                 620
         PCA                 620
         RF                  620
         RP                  620
dtype: int64

## Export grid search columns

In [158]:
clean_grid_search_df = pd.concat(grid_search_dfs)
clean_grid_search_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_NN__alpha,param_NN__hidden_layer_sizes,N_Components/Clusters/Features,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score,Data_Perspective,Dataset,Clustering_Algorithm,Clustered_Data
0,0.084124,0.010926,0.001003,8.064048e-07,0.1,"(50, 50)",2,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.700231,0.001624,1,0.700232,0.000405,BASE,Cars,GMM,1
1,0.104676,0.010379,0.001203,0.0002456904,0.1,"(50, 50)",5,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.700231,0.001624,1,0.682722,0.033508,BASE,Cars,GMM,1
2,0.152606,0.038785,0.001604,0.0004908651,0.1,"(50, 50)",10,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.700231,0.001624,1,0.700232,0.000405,BASE,Cars,GMM,1
3,0.216791,0.041077,0.002212,0.0005207766,0.1,"(50, 50)",15,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.575231,0.249102,164,0.704856,0.008844,BASE,Cars,GMM,1
4,0.238133,0.028133,0.002708,0.0004008064,0.1,"(50, 50)",20,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.633681,0.13342,145,0.709349,0.01834,BASE,Cars,GMM,1


In [159]:
clean_grid_search_df.to_hdf('results/results.hdf', key='grid_search', complib='blosc',complevel=9)