In [1]:
import pandas as pd
import ruamel.yaml as yaml
import os
import numpy as np
import sys
import sklearn.model_selection as ms

# ## Change to Root

# In[15]:


NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

# Locate the project root: starting from the current working directory, try to
# open "config.yaml"; if it is missing, step up one directory level and retry.
# The directory where the file is found becomes the working directory.
original_wd = os.getcwd()

# Maximum number of directory levels to try (the starting directory included)
num_retries = 10
for attempt in range(num_retries):
    try:
        with open("config.yaml", 'r') as stream:
            # NOTE(review): recent ruamel.yaml releases reportedly dropped the
            # module-level safe_load in favour of yaml.YAML(typ='safe').load(stream)
            # -- confirm against the installed version before upgrading.
            cfg = yaml.safe_load(stream)
    except FileNotFoundError:
        # Not in this directory: move up one level and try again
        os.chdir('../')
    else:
        # Config loaded: stay here and stop searching, so the file is not
        # re-opened and re-parsed on every remaining iteration.
        break
else:
    # Every attempt failed: restore the original working directory and report.
    # `cfg` is left undefined in this case.
    os.chdir(original_wd)
    print(NO_CONFIG_ERR_MSG)

# Make the resulting working directory importable by adding it to sys.path
# (the module search path, not the PATH environment variable)
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)

## Load in CV results

In [2]:
# Read the CV results stored in 'RF/cars dim red.csv'
cv_scores = pd.read_csv('RF/cars dim red.csv')

# Order rows from highest to lowest mean test score and display the top five
cv_scores_ranked = cv_scores.sort_values(by='mean_test_score', ascending=False)
cv_scores_ranked.head()

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_NN__alpha,param_NN__hidden_layer_sizes,param_filter__n,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
97,97,0.61403,0.12356,0.00762,0.000491,0.01,"(100, 25, 100)",16,"{'NN__alpha': 0.01, 'NN__hidden_layer_sizes': ...",0.65896,...,0.815972,0.08293,1,0.999276,0.999276,1.0,0.938495,0.992775,0.985964,0.023879
147,147,0.600995,0.113285,0.006818,0.000401,0.001,"(100, 25, 100)",16,"{'NN__alpha': 0.001, 'NN__hidden_layer_sizes':...",0.656069,...,0.815394,0.08309,2,0.999276,0.997829,1.0,0.938495,0.992775,0.985675,0.023725
197,197,0.598591,0.270181,0.005814,0.001603,0.0001,"(100, 25, 100)",16,"{'NN__alpha': 0.0001, 'NN__hidden_layer_sizes'...",0.644509,...,0.8125,0.087693,3,0.986252,0.998553,1.0,0.938495,0.992775,0.983215,0.02288
47,47,0.637091,0.131684,0.00742,0.000491,0.1,"(100, 25, 100)",16,"{'NN__alpha': 0.1, 'NN__hidden_layer_sizes': (...",0.66185,...,0.802662,0.076585,4,0.999276,1.0,1.0,0.968162,0.992052,0.991898,0.01224
111,111,0.274529,0.015997,0.006216,0.000401,0.001,"(50,)",4,"{'NN__alpha': 0.001, 'NN__hidden_layer_sizes':...",0.768786,...,0.802083,0.050735,5,0.855282,0.829233,0.853835,0.797395,0.846821,0.836513,0.021642


In [3]:
# Build one summary row per (algorithm, dataset) pair from the row with the
# highest mean_test_score in that pair's CV results file: the value of the
# first column whose name contains "components" or "filter", and the score.
records = []
for algorithm in ['ICA', 'PCA', 'RP', 'RF']:
    for dataset in ['cars', 'madelon']:
        tmp_csv_path = f'{algorithm}/{dataset} dim red.csv'
        tmp_csv_scores = pd.read_csv(tmp_csv_path)

        # Sort once and reuse the ranking for both lookups below
        tmp_ranked_scores = tmp_csv_scores.sort_values(by='mean_test_score', ascending=False)

        # Columns matching the regex; the first match is used.
        # NOTE(review): presumably this is the number-of-components
        # hyper-parameter column -- confirm against the grid-search setup.
        tmp_param_cols = tmp_ranked_scores.filter(regex='components|filter')
        if tmp_param_cols.shape[1] == 0:
            # Same exception type the original indexing raised, with context
            raise IndexError(
                f"No column matching 'components|filter' found in {tmp_csv_path!r}"
            )

        # .iloc reads the single cell directly instead of converting the whole
        # filtered frame to an array (which could upcast mixed dtypes)
        best_n_components = tmp_param_cols.iloc[0, 0]
        best_acc = tmp_ranked_scores['mean_test_score'].iloc[0]
        records.append((algorithm, dataset, best_n_components, best_acc))


best_N_components = pd.DataFrame(records, columns=['Algorithm', 'Dataset', 'N_Components_Maximizing_Test_Accuracy', 'Best_Test_Acc'])
# Persist the summary table next to the notebook, then display it
best_N_components.to_csv('best_N_components.csv', index=False)
best_N_components

Unnamed: 0,Algorithm,Dataset,N_Components_Maximizing_Test_Accuracy,Best_Test_Acc
0,ICA,cars,14,0.710069
1,ICA,madelon,15,0.599451
2,PCA,cars,12,0.855324
3,PCA,madelon,5,0.789011
4,RP,cars,35,0.824653
5,RP,madelon,45,0.545604
6,RF,cars,16,0.815972
7,RF,madelon,20,0.863736
