## I. Extract recorded data and describe status

In [None]:
import os
import json
import numpy as np

# Directory scanned for experiment records; every regular file in it is
# parsed as JSON.
experiment_dir = "../experiments/random_architectures_with_loss"
dir_depth      = 4

# List the regular files of the experiment directory. Sorting makes the
# order of `data` (and of everything printed below) reproducible, since
# os.listdir returns entries in arbitrary order.
filenames = sorted(
    f for f in os.listdir(experiment_dir)
    if os.path.isfile(os.path.join(experiment_dir, f))
)
data = []

# Load each record and flatten it into a single dict:
#   filename + FPT section + (optional) Results section.
for filename in filenames:
    full_path = os.path.join(experiment_dir, filename)
    # Context manager so the file handle is closed even if parsing fails.
    with open(full_path) as json_file:
        config = json.load(json_file)
    # Fix for old files where there was a typo
    if 'FPT' in config:
        FPT_key = 'FPT'
    elif 'FTP' in config:
        FPT_key = 'FTP'
    else:
        # Explicit error instead of `assert False`, which is stripped under -O
        # and would let a stale FPT_key from the previous file be reused.
        raise KeyError(f"{filename}: neither 'FPT' nor 'FTP' section found")
    if 'Results' not in config:
        # Run not trained yet: no learning curves at all.
        data_element = { 'filename': filename, **config[FPT_key], 'learning_curve_train': [], 'learning_curve_acc': [] }
    elif 'learning_curve_acc' not in config['Results']:
        # Results recorded without an accuracy curve.
        data_element = { 'filename': filename, **config[FPT_key], **config['Results'], 'learning_curve_acc': [] }
    else:
        data_element = { 'filename': filename, **config[FPT_key], **config['Results'] }
    data.append( data_element )

# Status report, one entry per file.
for e in data:
    # Clean up: drop the bulky fields that are not used afterwards.
    # pop(..., None) tolerates records that never had them.
    e.pop('cumulative', None)
    e.pop('density', None)
    # Format
    print(e['filename'], ':')
    quantile_count = len(e['quantiles'])
    print( f'  |- {quantile_count} quantiles available')
    samples_count  = len(e['learning_curve_train'])
    print( f'  |- {samples_count} training samples available' )
    rounded_losses = np.round( e['learning_curve_train'], decimals=4 )
    print( '  |- Losses ', rounded_losses )
    rounded_acc = np.round( e['learning_curve_acc'], decimals=4 )
    print( '  |- Accuracies ', rounded_acc )

In [None]:
# Build, for every quantile index, the list of (quantile value, loss sample,
# source record) triples used by the loss plots below.

# Loss samples that are not strictly below this value are dropped
# ("frozen" outliers).
FROZEN_LOSS_THRESHOLD = 0.5
# Quantile indices 0..10 are read from each record's 'quantiles' list.
QUANTILE_COUNT = 11

scatter_data = {i: [] for i in range(QUANTILE_COUNT)}

for e in data:
    for s in e['learning_curve_train']:
        # The outlier test does not depend on the quantile index, so it is
        # evaluated once per sample. `not (s < t)` keeps the original
        # behaviour for NaN samples (they are dropped).
        if not (s < FROZEN_LOSS_THRESHOLD): # Remove frozen outliers
            continue
        for i in range(QUANTILE_COUNT):
            scatter_data[i].append( (e['quantiles'][i], s, e))

# Data points for each quantile
for i in range(QUANTILE_COUNT):
    print( i, ':', len(scatter_data[i]) )

## II. Scatter plots Loss vs quantile

In [None]:
import matplotlib.pyplot as plt
import scipy
from scipy import stats

# Means 

# Quantiles
# One scatter plot per quantile index 1..9: loss sample against
# log10(quantile + 0.01), with correlation measures printed first.
for i in range(1,10):
    points = scatter_data[i]
    # Bivariate analysis of quantiles and losses.
    # The 0.01 offset keeps log10 finite when a quantile is 0.
    x = [np.log10(q+0.01) for (q, _, _) in points]
    y = [loss for (_, loss, _) in points]
    spearman = scipy.stats.spearmanr(x, y)
    pearson  = scipy.stats.pearsonr(x, y)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print("Correlation measures: ")
    print("Spearman:", spearman )
    print("Pearson :", pearson  )
    # linregress returns the correlation coefficient r; square it so the
    # printed value matches its "R2" label.
    print("R2      :", r_value**2)
    # Plot
    plt.scatter( x, y , alpha=0.4)
    # i/10 instead of i*0.1: the latter renders as 0.30000000000000004 etc.
    plt.title(f'''FPT quantile at {i/10} vs Loss''')
    plt.xlabel("FPT quantile")
    plt.ylabel("Loss")
    plt.savefig(f'''FTP_quantile{i*10}.png''')
    plt.show()


In [None]:
!pip install scipy


## III. Interactive plot using Plotly

In [None]:
!pip install plotly pandas

In [None]:
import plotly.express as px
import pandas

# Quantiles
# Interactive scatter of loss against log10(quantile + 0.01) for quantile
# index 9, coloured by the mean accuracy of the run each point comes from.
for i in range(9,10):
    points = scatter_data[i]
    # Bivariate analysis of quantiles and losses
    x = [np.log10(q+0.01) for (q, _, _) in points]
    y = [loss for (_, loss, _) in points]
    # Additional data shown on hover
    more = [rec for (_, _, rec) in points]
    filenames  = [ rec['filename'] for rec in more]
    widths     = [ str(rec['width_ratios']) for rec in more]
    # Records without an accuracy curve carry an empty list; give them NaN
    # explicitly instead of triggering numpy's "mean of empty slice" warning.
    accuracies = np.array( [
        np.mean(rec['learning_curve_acc']) if len(rec['learning_curve_acc']) else np.nan
        for rec in more
    ] )
    # Correlations
    spearman = scipy.stats.spearmanr(x, y)
    pearson  = scipy.stats.pearsonr(x, y)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print("Correlation measures: ")
    print("Spearman:", spearman )
    print("Pearson :", pearson  )
    # linregress returns r; square it so the value matches the "R2" label.
    print("R2      :", r_value**2)
    data_dict = {
        'log_quantile' : x,
        'loss'         : y,
        'filename'     : filenames,
        'width_profile': widths,
        'accuracy'     : np.round( accuracies*100, decimals=2 )
    }
    df = pandas.DataFrame.from_dict( data_dict )
    # Plot from the DataFrame that was just built (it was previously unused).
    fig = px.scatter( df, x='log_quantile', y='loss',
                      color='accuracy',
                      hover_data=['filename', 'width_profile', 'accuracy']
                    )
    fig.show()

## II. Bis. Accuracies

In [None]:
# Rebuild scatter_data for accuracies: for each quantile index 0..10, the
# list of (quantile value, accuracy sample, source record) triples.
# No outlier filtering is applied here.
scatter_data = {
    quantile_index: [
        (record['quantiles'][quantile_index], accuracy, record)
        for record in data
        for accuracy in record['learning_curve_acc']
    ]
    for quantile_index in range(11)
}

# Data points for each quantile
for quantile_index, triples in scatter_data.items():
    print( quantile_index, ':', len(triples) )

In [None]:
import matplotlib.pyplot as plt
import scipy
from scipy import stats

# Means 

# Quantiles
# Scatter plots for quantile indices 8 and 9: accuracy sample (in %)
# against log10(quantile + 0.01), with correlation measures printed first.
for i in range(8,10):
    points = scatter_data[i]
    # Bivariate analysis of quantiles and accuracies.
    # The 0.01 offset keeps log10 finite when a quantile is 0.
    x = [np.log10(q+0.01) for (q, _, _) in points]
    y = [acc*100 for (_, acc, _) in points]
    spearman = scipy.stats.spearmanr(x, y)
    pearson  = scipy.stats.pearsonr(x, y)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print("Correlation measures: ")
    print("Spearman:", spearman )
    print("Pearson :", pearson  )
    # linregress returns r; square it so the value matches the "R2" label.
    print("R2      :", r_value**2)
    # Plot
    plt.scatter( x, y , alpha=0.4)
    # The y axis is accuracy here, so the title must not say "Loss".
    plt.title(f'''FPT quantile at {i/10} vs Accuracy''')
    plt.xlabel("FPT quantile")
    plt.ylabel("Accuracy in %")
    # Only the 90-100 % band is shown; points below it are off-screen.
    plt.ylim(90,100)
    plt.savefig(f'''FTP_accuraries{i*10}.png''')
    plt.show()

In [None]:
import plotly.express as px
from plotly.graph_objs import *
import pandas

# Quantiles
# Interactive scatter of accuracy (in %) against log10(quantile + 0.01) for
# quantile index 9, coloured by log10 of a parameter-count estimate.
for i in range(9,10):
    # Keep only accuracy samples above 0.9.
    full_data = [ (q, acc, rec) for (q, acc, rec) in scatter_data[i] if acc>0.9]
    # Bivariate analysis of quantiles and accuracies
    x = [np.log10(q+0.01) for (q, _, _) in full_data]
    y = [acc*100 for (_, acc, _) in full_data]
    # Additional data shown on hover / used for colouring
    more = [rec for (_, _, rec) in full_data]
    filenames  = [ rec['filename'] for rec in more]
    widths     = [ str(rec['width_ratios']) for rec in more]
    # Layer widths: cumulative product of the width ratios scaled by 784
    # (presumably the input dimension -- TODO confirm).
    layer_sizes = [ np.cumprod(rec['width_ratios'])*784 for rec in more]
    # Sum of products of consecutive layer widths, on a log10 scale.
    sizes       = [ np.log10(np.sum( w[:-1]*w[1:] )) for w in layer_sizes]
    mean_accuracies = np.array( [ np.mean(rec['learning_curve_acc']) for rec in more] )
    # Correlations
    spearman = scipy.stats.spearmanr(x, y)
    pearson  = scipy.stats.pearsonr(x, y)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print("Correlation measures: ")
    print("Spearman:", spearman )
    print("Pearson :", pearson  )
    # linregress returns r; square it so the value matches the "R2" label.
    print("R2      :", r_value**2)
    # The dict used to define 'accuracy' twice, so the per-run mean silently
    # replaced the per-sample values that the correlations above are computed
    # on. The mean now lives under its own key.
    data_dict = {
        'dot_size'     : 1,
        'log_quantile' : x,
        'accuracy'     : y,
        'filename'     : filenames,
        'width_profile': widths,
        'log10_parameter_count': sizes,
        'mean_accuracy': np.round( mean_accuracies*100, decimals=2 )
    }
    df = pandas.DataFrame.from_dict( data_dict )
    fig = px.scatter( df,
                      size='dot_size',
                      size_max=10,
                      x='log_quantile', y='accuracy',
                      color='log10_parameter_count',
                      hover_data=['filename', 'width_profile', 'accuracy', 'mean_accuracy'],
                      color_continuous_scale=["green", "orange", "red"]
                    )
    # Transparent backgrounds for both the plotting area and the paper.
    fig.update_layout({
        'plot_bgcolor' : 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    fig.show()

## III. Regression on all quantiles

In [None]:
!pip install statsmodels

In [None]:
import statsmodels.api as sm

# Ordinary least squares of the response stored in scatter_data (second
# tuple element) on the log10 quantiles of indices 1..9.
# NOTE(review): the response is whatever the most recently executed
# scatter_data cell stored (loss or accuracy) -- confirm which is intended.

# Design matrix: one column per quantile index, one row per sample.
# The 0.01 offset keeps log10 finite when a quantile is 0.
Xtrain = np.stack( [
    np.array( [np.log10(q+0.01) for (q, _, _) in scatter_data[i]] )
    for i in range(1,10)
] ).T
# The response used to be rebuilt on every loop iteration and only the last
# one (index 9) was kept; read it once from that same index.
Ytrain = np.array( [y for (_, y, _) in scatter_data[9]] )

print(Xtrain.shape)
print(Ytrain.shape)

# Add the intercept column, then fit and report.
X1train = sm.add_constant(Xtrain)
print(X1train.shape)
reg = sm.OLS(Ytrain,X1train)
resReg = reg.fit()
print(resReg.summary())

## IV. Attempt at plot for all quantiles

In [None]:
import matplotlib.pyplot as plt
import scipy
from scipy import stats

# Means 

# Quantiles
# Single scatter plot overlaying quantile indices 1..9, coloured by the
# quantile level, with correlation measures printed per quantile.
# NOTE(review): the y values are whatever the most recently executed
# scatter_data cell stored; the labels below assume losses -- confirm.
fig = plt.figure( figsize = (10,6) )
ax  = fig.add_subplot( 111 )
final_x = []
final_y = []
final_c = []
print("Correlation measures: (correlation, p_value)")
for i in range(1,10):
    # Bivariate analysis of quantiles and losses.
    # scatter_data holds (quantile, sample, record) triples; unpacking them
    # as pairs raised "too many values to unpack".
    x = [np.log10(q+0.001) for (q, _, _) in scatter_data[i]]
    y = [v for (_, v, _) in scatter_data[i]]
    spearman = scipy.stats.spearmanr(x, y)
    pearson  = scipy.stats.pearsonr(x, y)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    print("Spearman:", spearman[0], spearman[1] )
    print("Pearson :", pearson  )
    # linregress returns r; square it so the value matches the "R2" label.
    print("R2      :", r_value**2)
    # Accumulate the points of this quantile; extend() avoids re-copying the
    # growing lists on every iteration.
    final_x.extend(x)
    final_y.extend(y)
    final_c.extend([i/10]*len(x))
#
scale = 50
# Draw on the axes created above instead of the implicit pyplot state.
points = ax.scatter( final_x, final_y, alpha=0.4, c=final_c, s=scale, cmap='plasma')
fig.colorbar(points, ax=ax)
ax.set_title('FPT quantiles vs Loss')
ax.set_xlabel("($log_{10}$ of) FPT quantile")
ax.set_ylabel("Loss")
# NOTE(review): `i` is the leaked loop index (9), so this writes
# FTP_quantile90.png -- the same name the per-quantile loss plot uses for
# index 9. Kept as-is; confirm whether a distinct file name is wanted.
fig.savefig(f'''FTP_quantile{i*10}.png''')
plt.show()
