# K-SSHIBA

In this notebook we analyse the performance of the kernel version of the previously presented SSHIBA algorithm on different datasets and compare it against several baselines.

## Loading datasets

First of all we will load the specified dataset to analyse:

In [7]:
import numpy as np
import os
import pickle
from time import time
import scipy.io as sio

# Make the project's `lib` folder importable. It is expected to sit next to
# the folder this notebook runs from (i.e. inside the parent of the working
# directory).
import sys

dirpath = os.getcwd()
# `prv_fold` is the parent of the working directory, `foldername` its last component.
(prv_fold, foldername) = os.path.split(dirpath)
# Build the path with os.path.join so the correct separator is used on every
# platform, instead of appending one POSIX-style and one Windows-style string.
lib_path = os.path.join(prv_fold, 'lib')
if lib_path not in sys.path:  # keeps the cell idempotent when it is re-run
    sys.path.append(lib_path)
import sshiba
import sshiba_areas

database = 'Satellite' #Here we specify the desired database

file = 'data_'+database

# The databases are looked up two levels above the working directory:
#   <prv_2_fold>/Databases/data_<database>/data.txt
#   <prv_2_fold>/Databases/data_<database>/labels.txt
(prv_2_fold,foldername) = os.path.split(prv_fold)
data_dir = os.path.join(prv_2_fold, 'Databases', file)
X = np.loadtxt(os.path.join(data_dir, 'data.txt'))
# A trailing axis is added so the labels become a column.
# NOTE(review): assumes labels.txt holds one label per line - confirm.
Y = np.loadtxt(os.path.join(data_dir, 'labels.txt'))[:,np.newaxis]

# Report success only once both files have actually been read.
print('Loaded database: '+database)

# =================================================== #
# Don't run, just to generate folds and save in a file
# =================================================== #
# The snippet below builds 10 stratified outer (train/test) folds and, for
# each outer training set, 10 stratified inner (train/validation) folds.
# The splits are shuffled without a fixed random_state, so re-running it
# produces different partitions.

# from sklearn.model_selection import StratifiedKFold

# skf_tst = StratifiedKFold(n_splits=10, shuffle = True)
# fold_tst =[f for  i, f in enumerate(skf_tst.split(X, Y))]
# dict_fold_val = {}
# for ii, f_tst in enumerate(fold_tst):
#     pos_tr = f_tst[0]
#     skf_val = StratifiedKFold(n_splits=10, shuffle = True)
#     fold_val =[f for  i, f in enumerate(skf_val.split(X[pos_tr], Y[pos_tr]))]
#     dict_fold_val[ii]=fold_val

# pickle.dump([fold_tst, dict_fold_val], open( 'folds_'+database+'.p', "wb" ))

# =================================================== #

# Load the stored partitions. If the file was produced by the snippet above,
# fold_tst is a list of (train_indices, test_indices) pairs and dict_fold_val
# maps each outer fold number to its list of inner validation splits.
# NOTE(review): unpickling can execute arbitrary code - only load fold files
# that were generated locally.
with open('folds_'+database+'.p','rb') as folds_file:
    [fold_tst, dict_fold_val] = pickle.load(folds_file)

Loaded database: satellite


Once the database is loaded and the partitions are defined, we can start analysing the performance of the algorithm in different scenarios.

## K-PCA

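In this section we will calculate the performance of the kernel version of PCA on this database. In particular, we will not cross-validate the parameters associated with this algorithm ($\gamma$ and $K_c$); instead, they are estimated from the training data: the RBF kernel parameter $\gamma$ is derived from the mean pairwise squared distance between samples, and $K_c$ is the number of latent factors needed to explain 95% of the variance.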

In [12]:
def rbf_kernel_sig(X1, X2, sig=0):
    """Compute the RBF (Gaussian) kernel between the rows of X1 and X2.

    The kernel is ``K[i, j] = exp(-||X1[i] - X2[j]||**2 / sig**2)``.

    Parameters
    ----------
    X1 : array-like of shape (n1, d) or (n1,)
        First set of samples, one per row.
    X2 : array-like of shape (n2, d) or (n2,)
        Second set of samples, one per row.
        A 1-D input is treated as n samples with a single feature.
    sig : float, optional
        Kernel width. When it is 0 (the default) or None it is estimated as
        ``sqrt(0.5 * mean(d))`` where ``d`` are the positive squared
        distances: those above the main diagonal when the distance matrix is
        square (each pair counted once when X1 is X2), every entry otherwise.
        If there is no positive distance at all (every sample coincides) the
        width cannot be estimated and 1.0 is used; the kernel is then all
        ones whatever the width.

    Returns
    -------
    K : ndarray of shape (n1, n2)
        Kernel matrix.
    sig : float
        The width that was used (given or estimated), so it can be reused to
        build a test kernel consistent with the training kernel.
    """
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    # Promote each input independently; the shape of one must not decide
    # how the other is reshaped.
    if X1.ndim == 1:
        X1 = X1[:, np.newaxis]
    if X2.ndim == 1:
        X2 = X2[:, np.newaxis]

    # Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 a.b (broadcast).
    sq_norms_1 = (X1 * X1).sum(axis=1)
    sq_norms_2 = (X2 * X2).sum(axis=1)
    dist = sq_norms_1[:, np.newaxis] + sq_norms_2[np.newaxis, :] - 2 * np.dot(X1, X2.T)
    # Round-off can leave tiny negative values, which would give K > 1.
    dist = np.maximum(dist, 0)

    if sig is None or sig == 0:  # Then, we estimate its value
        if dist.shape[0] == dist.shape[1]:
            # Strict upper triangle: skips the diagonal and mirrored pairs.
            candidates = np.triu(dist, k=1)
        else:
            # Rectangular cross-kernel: every entry is a distinct pair.
            candidates = dist
        positive = candidates[candidates > 0]
        if positive.size > 0:
            sig = np.sqrt(0.5 * np.mean(positive))
        else:
            # Nothing to estimate from; avoid a NaN width and a NaN kernel.
            sig = 1.0

    K = np.exp(-dist / sig**2)
    return K, sig

In [11]:
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

filename = 'Results/Baselines_'+database+'.pkl'

# Restore previously computed accuracies so finished folds are not retrained.
# The cache is a dict whose 'models' entry maps a baseline name to an array
# with one accuracy per test fold; it is written back in that same layout.
if os.path.exists(filename):
    print ("Loading existing model...")
    # NOTE(review): unpickling can execute arbitrary code - only load result
    # files that were produced locally.
    with open(filename, "rb") as cached:
        my_dict = pickle.load(cached)
    if 'models' not in my_dict:
        # Cache written as a bare results dict, without the 'models' wrapper.
        my_dict = {'models': my_dict}
    results = my_dict['models']
    if 'KPCA' in results:
        print ("... Model loaded")
    else:
        results['KPCA'] = np.zeros((len(fold_tst),))
else:
    results = {}
    results['KPCA'] = np.zeros((len(fold_tst),))
    my_dict = {'models': results}

# Number of latent factors kept in each fold trained during this run.
Kc_PCA = np.zeros((len(fold_tst),), dtype=int)

for i in range(len(fold_tst)):

    print('---------> Fold '+str(i)+' <---------')

    # A stored accuracy of exactly 0 marks a fold that has not been trained yet.
    if results['KPCA'][i] == 0:
        # Splitting the data into training and test sets.
        pos_tr = fold_tst[i][0]
        pos_tst = fold_tst[i][1]
        Y_tr = Y[pos_tr,:]
        Y_tst = Y[pos_tst,:]
        X_tr = X[pos_tr,:]
        X_tst = X[pos_tst,:]

        # Generating the RBF kernels. The width is estimated on the training
        # data and reused for the test kernel so both share the same scale.
        K_tr, sig = rbf_kernel_sig(X_tr, X_tr)
        K_tst, sig = rbf_kernel_sig(X_tst, X_tr, sig = sig)

        # Defining the feature extracting algorithm, PCA. It is fitted on the
        # training kernel only; the test kernel is merely projected, so no
        # test information leaks into the extracted features.
        pca = PCA()
        P_tr = pca.fit_transform(K_tr)
        P_tst = pca.transform(K_tst)

        # Selecting the latent factors that explain 95% of the variance:
        # the smallest Kc whose cumulative explained variance reaches 0.95.
        cum_var = np.cumsum(pca.explained_variance_ratio_)
        Kc = min(int(np.searchsorted(cum_var, 0.95)) + 1, cum_var.size)
        Kc_PCA[i] = Kc
        P_tr = P_tr[:,:Kc]
        P_tst = P_tst[:,:Kc]

        # Training the linear classifier. The regularisation constant C is
        # selected by 10-fold grid-search cross-validation.
        grid = {"C": np.logspace(-5,5,11)}
        clf = SVC(kernel = 'linear')
        clf_cv = GridSearchCV(clf, grid, cv=10)
        # scikit-learn expects 1-D label vectors, Y is stored as a column.
        clf_cv.fit(P_tr, Y_tr.ravel())
        results['KPCA'][i] = clf_cv.score(P_tst, Y_tst.ravel())
        print('KPCA accuracy: %0.2f%%' %(results['KPCA'][i]*100))

        # Checkpoint after every fold so an interrupted run can be resumed.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as output:
            pickle.dump(my_dict, output, pickle.HIGHEST_PROTOCOL)
    else:
        print('Fold previously trained. KPCA accuracy: %0.2f%%' %(results['KPCA'][i]*100))
print('KPCA mean accuracy: %0.2f +/- %0.2f%%' %(np.mean(results['KPCA']*100) , np.std(results['KPCA']*100)))

---------> Fold 0 <---------
---------> Fold 1 <---------


KeyboardInterrupt: 

In [None]:
# Scratch cell: repeats the PCA + linear-SVM step for a single fold.
# It relies on K_tr, K_tst, Y_tr and Y_tst having been left in the kernel by
# the K-PCA loop cell (they hold the values of the last fold trained there).
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Fit PCA on the training kernel only and project the test kernel with it.
pca = PCA()
P_tr = pca.fit_transform(K_tr)
P_tst = pca.transform(K_tst)

# Keep the smallest number of latent factors explaining 95% of the variance.
cum_var = np.cumsum(pca.explained_variance_ratio_)
Kc = min(int(np.searchsorted(cum_var, 0.95)) + 1, cum_var.size)
P_tr = P_tr[:,:Kc]
P_tst = P_tst[:,:Kc]

# Grid search cross validation over the SVM regularisation constant C.
grid = {"C": np.logspace(-5,5,11)}
clf = SVC(kernel = 'linear')
clf_cv = GridSearchCV(clf, grid, cv=10)
# scikit-learn expects 1-D label vectors.
clf_cv.fit(P_tr, np.ravel(Y_tr))
# The result is kept in plain variables: Kc_PCA, fold and ACC, which this cell
# used to index, are not defined by any earlier cell of the notebook.
acc_single_fold = clf_cv.score(P_tst, np.ravel(Y_tst))
print('KPCA accuracy: %0.2f%%' %(acc_single_fold*100))