### 1. Regional OLS Results

In [7]:
import numpy as np
import pandas as pd

# loading simulation results
df_results = pd.read_csv('results/regional-results/ROLS - all - n_res=100 - 2019-07-10.csv')
df_results.head()

Unnamed: 0,dataset_name,random_state,alpha0,sigma0,nEpochs,cm_tr,cm_ts
0,vc2c,127815836,0.1,3.0,100,[156 13 15 64],[32 9 7 14]
1,vc2c,127815836,0.1,10.0,100,[153 16 16 63],[32 9 6 15]
2,vc2c,127815836,0.1,6.5,100,[162 7 21 58],[34 7 9 12]
3,vc2c,127815836,0.1,10.0,300,[157 12 21 58],[32 9 8 13]
4,vc2c,127815836,0.1,6.5,300,[155 14 19 60],[33 8 7 14]


Results for each data set:

In [8]:
from load_dataset import datasets

test_size     = 0.2
scaleType     = 'min-max'
n_resamplings = 100

num = 3

alphas = [0.1, 0.3, 0.5]     # np.linspace(0.1, 0.5,  num=num).tolist()
sigmas = [3.0, 6.5, 10.0]    # np.linspace(3,    10,   num=num).tolist()
epochs = [100]               # np.linspace(100,  500, num=num, dtype='int').tolist()
    
som_params = [
    {
     "alpha0"  : alpha0
    ,"sigma0"  : sigma0
    ,"nEpochs" : nEpochs
    }
    for alpha0       in alphas
    for sigma0       in sigmas
    for nEpochs      in epochs
]

header = list(som_params[0].keys()) + ['Minimum', 'Maximum', 'Median', 'Mean', 'Std. Deviation']

df_ds = {}
for dataset_name in datasets: # For this specific dataset
    print(dataset_name)
    df = df_results.loc[df_results['dataset_name'] == dataset_name] # get simulation results

    count = 0
    df_data   = np.zeros((len(som_params), len(header))) # matriz que guardará resultados numéricos
    for params in som_params:
        df_case = df.loc[(df['alpha0']  == params['alpha0']) & 
                         (df['sigma0']  == params['sigma0']) &
                         (df['nEpochs'] == params['nEpochs'])]

        # converting confusion matrix from string to numpy array
        cm_ts = np.array([[int(x) for x in result[1:-1].split()] for result in df_case['cm_ts'].values])

        #data = cm_ts
        length = cm_ts.shape[1]
        cm_side = int(np.sqrt(length))

        acc   = [0]*len(cm_ts)
        for i in range(len(cm_ts)):
            cm = np.reshape(cm_ts[i], (cm_side,cm_side))
            acc[i] = np.trace(cm)/np.sum(cm)

        df_data[count,:] = np.matrix([
            params['alpha0'], params['sigma0'], params['nEpochs'],
            min(acc), max(acc), np.median(acc), np.mean(acc), np.std(acc)])
        count+=1


    df_ds[dataset_name] = pd.DataFrame(df_data, columns=header)
    print(df_ds[dataset_name].head()) # TODO: display
    print('-'*100,'\n'*2)

vc2c
   alpha0  sigma0  nEpochs   Minimum   Maximum    Median      Mean  \
0     0.1     3.0    100.0  0.677419  0.919355  0.838710  0.827097   
1     0.1     6.5    100.0  0.693548  0.903226  0.838710  0.828871   
2     0.1    10.0    100.0  0.725806  0.951613  0.838710  0.832419   
3     0.3     3.0    100.0  0.709677  0.935484  0.830645  0.829839   
4     0.3     6.5    100.0  0.725806  0.935484  0.838710  0.832097   

   Std. Deviation  
0        0.050134  
1        0.046573  
2        0.045040  
3        0.045240  
4        0.047470  
---------------------------------------------------------------------------------------------------- 


vc3c
   alpha0  sigma0  nEpochs   Minimum   Maximum    Median      Mean  \
0     0.1     3.0    100.0  0.645161  0.919355  0.822581  0.811129   
1     0.1     6.5    100.0  0.693548  0.919355  0.814516  0.810806   
2     0.1    10.0    100.0  0.709677  0.903226  0.822581  0.814032   
3     0.3     3.0    100.0  0.693548  0.919355  0.822581  0.81564

Taking the best values by higher mean in accuracy.

In [9]:
data = np.array([df.sort_values('Mean', ascending=False).iloc[0,:].values for df in df_ds.values()])
idx_label = list(df_ds.keys())
df_rols = pd.DataFrame(data, columns=header, index=[idx_label])
df_rols

Unnamed: 0,alpha0,sigma0,nEpochs,Minimum,Maximum,Median,Mean,Std. Deviation
vc2c,0.1,10.0,100.0,0.725806,0.951613,0.83871,0.832419,0.04504
vc3c,0.3,10.0,100.0,0.677419,0.919355,0.822581,0.816452,0.04875
wf24f,0.5,10.0,100.0,0.812271,0.888278,0.858516,0.858178,0.01381
wf4f,0.5,10.0,100.0,0.836081,0.938645,0.879579,0.882894,0.023216
wf2f,0.1,3.0,100.0,0.769231,0.960623,0.908883,0.911016,0.023783
pk,0.3,10.0,100.0,0.692308,0.974359,0.871795,0.868974,0.055576


# Globlal OLS

In [None]:
from multiprocessing import Pool
import tqdm

data = [None]*len(cases)

pool = Pool()
data =[result for result in tqdm.tqdm(pool.imap_unordered(evalGOLS,cases), total=len(cases))]
pool.close()
pool.join()

results = np.vstack(data)
header  = ["dataset_name", "random_state", "cm_tr", "cm_ts"]
results_df = pd.DataFrame(results, columns=header)

  0%|                                                                                          | 0/600 [00:00<?, ?it/s]

In [24]:
results_df.head()

NameError: name 'results_df' is not defined

Processing results (taking the best values by higher mean in accuracy):

In [25]:
header = ['Minimum', 'Maximum', 'Median', 'Mean', 'Std. Deviation']

data      = np.zeros(( len(datasets.keys()), len(header) ))
idx_label = [' ']*len(datasets.keys())
count=0
for dataset_name in datasets: # For this specific dataset
    df = results_df.loc[results_df['dataset_name'] == dataset_name] # get simulation results
    
    # converting confusion matrices to numpy matrix
    cm_ts = np.array([array for array in df['cm_ts'].values])
       
    length = cm_ts.shape[1]
    cm_side = int(np.sqrt(length))
    acc   = [0]*len(cm_ts)
    for i in range(len(cm_ts)):
        cm = np.reshape(cm_ts[i], (cm_side,cm_side))
        acc[i] = np.trace(cm)/np.sum(cm)

    data[count,:] = np.array([min(acc), max(acc), np.median(acc), np.mean(acc), np.std(acc)])
    idx_label[count] = dataset_name
    count+=1
    
df_ols = pd.DataFrame(data, columns=header, index=[idx_label])
df_ols

NameError: name 'results_df' is not defined

# Comparing results:

In [26]:
header = ['Dataset', 'Model']+list(df_ols.columns)

temp_rols = df_rols.rename_axis('Dataset').reset_index().loc[:,[x for x in header if x!='Model']]
temp_rols.insert(1,'Model',['ROLS']*len(datasets.keys()))

temp_ols = df_ols.rename_axis('Dataset').reset_index()
temp_ols.insert(1,'Model',['OLS']*len(datasets.keys()))

print(
    pd.concat([temp_ols,temp_rols]).sort_index()
) # TODO: display

NameError: name 'df_ols' is not defined