In [1]:
import pandas as pd
import ruamel.yaml as yaml
import os
import numpy as np
import sys
import sklearn.model_selection as ms

# ## Change to Root

# In[15]:


NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""        

original_wd = os.getcwd()

# Number of times to move back in directory
num_retries = 10
for x in range(0, num_retries):
    # try to load config file    
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
    # If not found move back one directory level
    except FileNotFoundError:
        os.chdir('../')
        # If reached the max number of directory levels change to original wd and print error msg
        if x+1 == num_retries:
            os.chdir(original_wd)
            print(NO_CONFIG_ERR_MSG)
            
# Add directory to PATH
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)

## Load in CV results

In [2]:
records = []
for algorithm in ['ICA', 'PCA', 'RP', 'RF']:
    for dataset in ['cars', 'madelon']:
        tmp_csv_scores = pd.read_csv(f'{algorithm}/{dataset} dim red.csv')
        best_n_components = tmp_csv_scores.sort_values(by='mean_test_score', ascending=False).filter(regex='components|filter').values[0][0]
        best_acc = tmp_csv_scores.sort_values(by='mean_test_score', ascending=False).mean_test_score.values[0]
        records.append((algorithm, dataset, best_n_components, best_acc))
        
        
best_N_components = pd.DataFrame(records, columns=['Algorithm', 'Dataset', 'N_Components_Maximizing_Test_Accuracy', 'Best_Test_Acc'])        
best_N_components.to_csv('results/best_N_components.csv', index=False)
best_N_components

Unnamed: 0,Algorithm,Dataset,N_Components_Maximizing_Test_Accuracy,Best_Test_Acc
0,ICA,cars,14,0.710069
1,ICA,madelon,15,0.599451
2,PCA,cars,12,0.855324
3,PCA,madelon,5,0.789011
4,RP,cars,14,0.803819
5,RP,madelon,45,0.545604
6,RF,cars,16,0.815972
7,RF,madelon,20,0.863736


In [3]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from time import clock
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from collections import defaultdict
from helpers import cluster_acc, myGMM,nn_arch,nn_reg
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import sys

## Script Form

In [62]:
# out = '{}/'.format(sys.argv[1])
out = '{}/'.format(sys.argv[1])

np.random.seed(0)
cars = pd.read_hdf(out+'datasets.hdf','cars')
carsX = cars.drop('Class',1).copy().values
carsY = cars['Class'].copy().values

madelon = pd.read_hdf(out+'datasets.hdf','madelon')        
madelonX = madelon.drop('Class',1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
carsX= StandardScaler().fit_transform(carsX)

def fit_tsne_output_record_kl_divergence(dataset, algorithm, dataset_name, perplexity):
    tsne_transformer = TSNE(verbose=0, perplexity = perplexity, random_state=5)
    data_2D = tsne_transformer.fit(dataX) 
    record = (algorithm, dataset_name, tsne_transformer.perplexity, tsne_transformer.kl_divergence_)
    record = pd.DataFrame([record], 
                          columns=['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])
    
    return record

cars_perplexities =  pd.concat([fit_tsne_output_record_kl_divergence(carsX, out[:-1], 'Cars', perplexity) for perplexity in np.arange(0, 70, 5)])
madelon_perplexities =  pd.concat([fit_tsne_output_record_kl_divergence(madelonX, out[:-1], 'Madelon', perplexity) for perplexity in np.arange(0, 70, 5)])

try:
    perplexity_results = pd.read_csv('results/tsne_perplexity_search/perplexity_kl_divergence_results.csv')
except FileNotFoundError:
    perplexity_results = pd.DataFrame(columns = ['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])

perplexity_results = pd.concat([perplexity_results, cars_perplexities, madelon_perplexities])
perplexity_results.to_csv('results/tsne_perplexity_search/perplexity_kl_divergence_results.csv', index=False)

FileNotFoundError: File -f/datasets.hdf does not exist

## Script for Outputting the best TSNE projections

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from time import clock
from sklearn.preprocessing import StandardScaler
import sys

out = '{}/'.format(sys.argv[1])

np.random.seed(0)
cars = pd.read_hdf(out+'datasets.hdf','cars')
carsX = cars.drop('Class',1).copy().values
carsY = cars['Class'].copy().values

madelon = pd.read_hdf(out+'datasets.hdf','madelon')        
madelonX = madelon.drop('Class',1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
carsX= StandardScaler().fit_transform(carsX)


perplexity_results = pd.read_csv('results/tsne_perplexity_search/perplexity_kl_divergence_results.csv')
# Remove Perplexity of 0
perplexity_results = perplexity_results[perplexity_results['Perplexity'] != 0]

min_kl = perplexity_results.sort_values(by='KL_Divergence').drop_duplicates(subset=['Data_Perspective', 'Dataset'], keep='first')
min_kl_perplexity_car = min_kl.query('Data_Perspective == @out'
                                             '& Dataset == "Cars"').Perplexity.values[0]
min_kl_perplexity_madelon = min_kl.query('Data_Perspective == @out'
                                             '& Dataset == "Madelon"').Perplexity.values[0]


# %% For chart 4/5
# Madelon perplexity set to 50 b/c it's high dimensional and points likely not dense like Cars
madelonX2D = TSNE(verbose=10, perplexity = min_kl_perplexity_madelon, random_state=5).fit_transform(madelonX)
carsX2D = TSNE(verbose=10, perplexity = min_kl_perplexity_car, random_state=5).fit_transform(carsX)

madelon2D = pd.DataFrame(np.hstack((madelonX2D,np.atleast_2d(madelonY).T)),columns=['x','y','target'])
cars2D = pd.DataFrame(np.hstack((carsX2D,np.atleast_2d(carsY).T)),columns=['x','y','target'])

madelon2D.to_csv(out+'madelon2D.csv')
cars2D.to_csv(out+'cars2D.csv')

In [66]:
perplexity_results = pd.read_csv('results/tsne_perplexity_search/perplexity_kl_divergence_results.csv')

## Finding the minium perplexities

I think this would find the minimum

In [67]:
perplexity_results = pd.read_csv('results/tsne_perplexity_search/perplexity_kl_divergence_results.csv')
perplexity_results = perplexity_results[perplexity_results['Perplexity'] != 0]
perplexity_results.sort_values(by='KL_Divergence').drop_duplicates(subset=['Data_Perspective', 'Dataset'], keep='first')

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence
5,PCA/,Cars,25,-0.008926
145,PCA,Cars,25,-0.008926
57,ICA/,Cars,5,0.106277
261,ICA,Cars,5,0.106277
29,RF/,Cars,5,0.516804
201,RF,Cars,5,0.516804
381,BASE,Cars,5,0.519568
113,BASE/,Cars,5,0.519568
85,RP/,Cars,5,0.556801
321,RP,Cars,5,0.556801


In [63]:
perplexity_results

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence
0,PCA/,Cars,0,0.755179
1,PCA/,Cars,5,0.108601
2,PCA/,Cars,10,0.024544
3,PCA/,Cars,15,-0.002569
4,PCA/,Cars,20,0.000970
5,PCA/,Cars,25,-0.008926
6,PCA/,Cars,30,-0.004170
7,PCA/,Cars,35,0.037688
8,PCA/,Cars,40,0.064255
9,PCA/,Cars,45,0.084534


## Dev Area

In [37]:
pd.DataFrame(columns = ['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence


In [None]:
def fit_tsne_output_record_kl_divergence(dataset, algorithm, dataset_name, perplexity):
    tsne_transformer = TSNE(verbose=0, perplexity = perplexity, random_state=5)
    data_2D = tsne_transformer.fit(dataX) 
    record = (algorithm, dataset_name, tsne_transformer.perplexity, tsne_transformer.kl_divergence_)
    record = pd.DataFrame([record], 
                          columns=['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])
    
    return record

In [27]:
def fit_tsne_output_kl_divergence(algorithm, dataset, perplexity):
    data =  pd.read_hdf(out+'/datasets.hdf', dataset)
    dataX = data.drop('Class',1).copy().values
    tsne_transformer = TSNE(verbose=0, perplexity = perplexity, random_state=5)
    data_2D = tsne_transformer.fit_transform(dataX) 
    record = (algorithm, dataset, tsne_transformer.perplexity, tsne_transformer.kl_divergence_)
    record = pd.DataFrame([record], 
                          columns=['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])
    
    return record

In [28]:
tsne_record_one = fit_tsne_output_kl_divergence('PCA', 'cars', 30)

In [32]:
%%time
pca_perplexities = [fit_tsne_output_kl_divergence('PCA', 'cars', perpleixity) for perpleixity in np.arange(0, 70, 5)]

Wall time: 4min 57s


In [33]:
pca_perplexities_df = pd.concat(pca_perplexities)

In [35]:
pca_perplexities_df.sort_values(by='KL_Divergence')

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence
0,PCA,cars,25,-0.015928
0,PCA,cars,30,-0.013913
0,PCA,cars,20,-0.01238
0,PCA,cars,15,-0.005159
0,PCA,cars,35,0.017502
0,PCA,cars,10,0.021492
0,PCA,cars,40,0.052163
0,PCA,cars,45,0.067785
0,PCA,cars,50,0.07934
0,PCA,cars,5,0.108223


In [44]:
min_kl_divergence = pca_perplexities_df.groupby(by=['Data_Perspective', 'Dataset']).min().KL_Divergence.values[0]
pca_perplexities_df.query('KL_Divergence == @min_kl_divergence')

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence
0,PCA,cars,25,-0.015928


In [51]:
pca_perplexities_df.sort_values("KL_Divergence").groupby(['Data_Perspective', 'Dataset'], as_index=False)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000002517E830518>

In [47]:
pca_perplexities_df.groupby(by=['Data_Perspective', 'Dataset']).min()

Unnamed: 0_level_0,Unnamed: 1_level_0,Perplexity,KL_Divergence
Data_Perspective,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1
PCA,cars,0,-0.015928


In [26]:
pd.DataFrame([tsne_record_one], columns=['Data_Perspective', 'Dataset', 'Perplexity', 'KL_Divergence'])

Unnamed: 0,Data_Perspective,Dataset,Perplexity,KL_Divergence
0,PCA,cars,30,-0.013913


In [8]:
tsne_test.kl_divergence_

-0.004170433152467012

In [9]:
tsne_test.perplexity

30

In [7]:
# %% For chart 4/5
# Madelon perplexity set to 50 b/c it's high dimensional and points likely not dense like Cars
madelonX2D = TSNE(verbose=10, perplexity = 30, random_state=5).fit_transform(madelonX)
carsX2D = TSNE(verbose=10, perplexity = 30, random_state=5).fit_transform(carsX)

# madelon2D = pd.DataFrame(np.hstack((madelonX2D,np.atleast_2d(madelonY).T)),columns=['x','y','target'])
# cars2D = pd.DataFrame(np.hstack((carsX2D,np.atleast_2d(carsY).T)),columns=['x','y','target'])

# madelon2D.to_csv(out+'madelon2D.csv')
# cars2D.to_csv(out+'cars2D.csv')

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1820 samples in 0.002s...
[t-SNE] Computed neighbors for 1820 samples in 0.104s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1820
[t-SNE] Computed conditional probabilities for sample 1820 / 1820
[t-SNE] Mean sigma: 0.603542
[t-SNE] Computed conditional probabilities in 0.141s
[t-SNE] Iteration 50: error = 77.9336243, gradient norm = 0.0002723 (50 iterations in 1.801s)
[t-SNE] Iteration 100: error = 77.8725586, gradient norm = 0.0237675 (50 iterations in 2.098s)
[t-SNE] Iteration 150: error = 76.3171997, gradient norm = 0.0146989 (50 iterations in 1.446s)
[t-SNE] Iteration 200: error = 76.1424561, gradient norm = 0.0023276 (50 iterations in 1.323s)
[t-SNE] Iteration 250: error = 76.1413879, gradient norm = 0.0002232 (50 iterations in 1.186s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.141388
[t-SNE] Iteration 300: error = 1.5900210, gradient norm = 0.0010523 (50 iterations in 1.142s)