In [1]:
import CodaPCA
import CodaCl 
import PCACl
import numpy as np
from runpca import read_csv
import os
import sklearn
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression
#change module for newer sklearn versions
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection  import KFold
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import torch
import dill
import importlib
importlib.reload(CodaCl)

<module 'CodaCl' from 'C:\\Users\\u5801283\\Documents\\Honours\\honours_final\\honours\\Code\\coda-pca-orig\\coda\\codes\\CodaCl.py'>

In [2]:
def enhanced_cross_val(features, targets, folds):
    """Score a logistic-regression classifier on pre-computed CV folds.

    Parameters
    ----------
    features : array-like indexable by the fold index arrays
    targets : class labels; must have the same length as ``features``
    folds : iterable of ``(train_indices, test_indices)`` pairs

    Returns
    -------
    list
        One accuracy score per fold, in the order the folds were given.
    """
    assert len(features) == len(targets), "Mismatch in length of features and targets"

    def _fold_accuracy(train_idx, test_idx):
        # A fresh classifier per fold so no fitted state carries over.
        clf = LogisticRegression(multi_class='auto', solver='lbfgs')
        clf.fit(features[train_idx], targets[train_idx])
        predicted = clf.predict(features[test_idx])
        return sklearn.metrics.accuracy_score(targets[test_idx], predicted)

    return [_fold_accuracy(train_idx, test_idx) for train_idx, test_idx in folds]

In [3]:
def coda_val(features, targets, n_components, folds, nn_shape,  lr,lam, epochs):
    """Cross-validate the CoDA-Cl classifier on pre-computed folds.

    Parameters
    ----------
    features : 2-D array indexable by the fold index arrays; its second
        dimension is passed to the model as the input size
    targets : integer class labels aligned with ``features``
    n_components : size of the low-dimensional representation passed to the model
    folds : iterable of ``(train_indices, test_indices)`` pairs
    nn_shape : pair whose two entries are forwarded to ``CodaCl.CoDA_Cl``
        (the experiment grid describes them as encoder and decoder shapes)
    lr, lam, epochs : forwarded to ``model.fit``

    Returns
    -------
    (kfold_scores, val_arr, train_arr)
        ``kfold_scores`` holds one accuracy per fold. ``val_arr`` and
        ``train_arr`` are whatever ``model.fit`` returned for the LAST fold
        only (earlier folds' values are overwritten); both are empty lists
        when ``folds`` is empty.
    """
    kfold_scores = []
    # Bound up front so an empty `folds` returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    val_arr, train_arr = [], []
    # Loop-invariant: the class count is taken over all targets, not per fold.
    n_classes = len(np.unique(targets))

    for train, test in folds:
        X_train, Y_train = features[train], targets[train]
        X_test, Y_test = features[test], targets[test]

        model = CodaCl.CoDA_Cl(features.shape[1], n_components, n_classes, nn_shape[0], nn_shape[1])

        # 3/4 of the training fold is passed as train_size
        # (presumably the rest is held out for validation inside fit -- TODO confirm in CodaCl).
        val_arr, train_arr = model.fit(X_train, Y_train, lam, lr, train_size = int(len(X_train)*(3/4)), epochs=epochs)

        # NOTE(review): the exp() suggests predict returns log-probabilities -- confirm in CodaCl.
        pred = model.predict(torch.FloatTensor(X_test)).exp().detach()
        # Predicted class = column index of the largest score in each row.
        _, index = torch.max(pred, 1)

        kfold_scores.append(sklearn.metrics.accuracy_score(Y_test, index.numpy()))

    return kfold_scores, val_arr, train_arr


In [4]:
def PCA_Classification(data, targets, param_list, filename, verbose=False, normalised=False):
    """Run CoDA-Cl against PCA-based baselines over a parameter grid and log to CSV.

    For each of 3 trials and each entry of ``param_list`` this fits CoDA-PCA,
    CLR-PCA and standard-scaled PCA embeddings, scores each embedding (and the
    raw features) with ``enhanced_cross_val``, scores CoDA-Cl with
    ``coda_val``, and appends one row to ``<cwd>/<filename>.csv``.

    Parameters
    ----------
    data : 2-D numeric array; the first ``n_features`` columns are used per run
    targets : class labels, one per row of ``data``; label-encoded internally
    param_list : iterable of ``(nn_dims, lr, lam, n_epochs)`` where
        ``nn_dims`` is ``[n_features, n_components, nn_shape]``
    filename : stem of the results CSV, created (and overwritten) in the
        current working directory
    verbose : when True, print every score list after each run
    normalised : when False, each row of the selected features is divided by
        its sum before use

    Returns
    -------
    None. Results are only written to the CSV file.

    Notes
    -----
    NOTE(review): the three baseline embeddings are fit on ALL rows before
    cross-validation, so each embedding has seen the test rows of every fold.
    """
    results_path = os.path.join(os.getcwd(), str(filename) + str(".csv"))

    #set up pandas dataframe for result storage
    df = pd.DataFrame(
        {'Parameters':[],
         'CoDACl Scores':[],
         'CoDAPCA Scores':[],
         'CLRPCA Scores':[],
         'PCA Scores':[],
         'Naive Scores':[],
         'CoDACl Val Loss':[],
         'CoDACl Train Loss':[]
        }).set_index('Parameters')

    #TODO add percentage update!
    df.to_csv(results_path)

    #encode targets once: the encoding does not depend on the parameters
    targets = LabelEncoder().fit_transform(targets)

    #repeat experiments with reshuffled folds for fairer results
    trials = 3
    for trial in range(trials):
        #shuffle=True gives each trial its own split (an unshuffled KFold would
        #reproduce identical folds every trial); seeding with the trial number
        #keeps the splits reproducible. Folds are held constant across every
        #algorithm and parameter set within a trial for fair comparison.
        kf = KFold(n_splits=4, shuffle=True, random_state=trial)
        folds = list(kf.split(data))

        for params in param_list:

            #TODO: run several times then average
            nn_dims, lr, lam, n_epochs = params[0], params[1], params[2], params[3]
            n_features, n_components, nn_shape = nn_dims[0], nn_dims[1], nn_dims[2]

            features = np.asarray(data[:, :n_features], dtype=np.float64)

            if not normalised:
                #normalise the compositional features so each row sums to 1
                #(a row summing to zero yields NaN here)
                features = features / features.sum(axis=1, keepdims=True)

            #set up baseline algorithms; each gets its own float32 copy so an
            #in-place modification by one cannot leak into another

            #CoDA-PCA:
            pca_coda = CodaPCA.NonParametricCodaPCA(n_components)
            Y_coda = pca_coda.fit_transform(features.astype(np.float32))

            #CLR-PCA:
            pca_clr = CodaPCA.CLRPCA(n_components)
            Y_clr = pca_clr.fit_transform(features.astype(np.float32))

            #PCA:
            sc = StandardScaler()
            features_ = sc.fit_transform(features.astype(np.float32))
            pca_reg = decomposition.PCA(n_components)
            pca_reg.fit(features_)
            Y_pca = pca_reg.transform(features_)

            coda_score = enhanced_cross_val(Y_coda, targets, folds)
            clr_score = enhanced_cross_val(Y_clr, targets, folds)
            pca_score = enhanced_cross_val(Y_pca, targets, folds)
            naive_score = enhanced_cross_val(features, targets, folds)

            codacl_score, val_arr, train_arr = coda_val(features.astype(np.float32), targets, n_components, folds, nn_shape, lr, lam, n_epochs)

            #read/write each time to keep results if it crashes
            df = pd.read_csv(results_path).set_index('Parameters')
            df_row  = "trial{} nn {} lr {} lam {} epochs {} num_features {} low_dimension {}".format(trial, nn_shape, lr, lam, n_epochs, n_features, n_components)

            df.loc[df_row] = [codacl_score, coda_score, clr_score, pca_score, naive_score, val_arr, train_arr]

            df.to_csv(results_path)

            if verbose:
                print("CoDA-PCA:")
                print(coda_score)
                print("CLR-PCA:")
                print(clr_score)
                print("PCA Classification:")
                print(pca_score)
                print ("Naive Classification:")
                print (naive_score)
                print ("Coda Cl:")
                print (codacl_score)

    return


In [5]:
# Load the dataset and derive the module-level `data`, `targets` and
# `features` arrays that the later experiment cells reuse.
import pandas as pd
import numpy as np
# Reads the object stored under key 'atlas' from 'atlas.h5' (path is relative
# to the current working directory).
diet_data = pd.read_hdf('atlas.h5','atlas')

data = diet_data.to_numpy()

# Shuffles the rows of `data` in place. No seed is set, so the row order (and
# everything derived from it) differs between runs.
np.random.shuffle(data)

# Column -7 is the one the "location as target" experiment cell uses.
targets = data[:,-7]

# First 130 columns (presumably the compositional measurements -- TODO confirm
# against the dataset's column layout).
features = data[:,:130]
# NOTE(review): a bare `targets` displays the whole array as cell output.
targets

array(['CentralEurope', 'CentralEurope', 'CentralEurope', 'SouthEurope',
       'CentralEurope', 'CentralEurope', 'Scandinavia', 'CentralEurope',
       'Scandinavia', 'CentralEurope', 'Scandinavia', 'CentralEurope',
       'Scandinavia', 'CentralEurope', 'CentralEurope', 'Scandinavia',
       'CentralEurope', 'CentralEurope', 'CentralEurope', 'CentralEurope',
       'Scandinavia', 'Scandinavia', 'Scandinavia', 'SouthEurope',
       'CentralEurope', 'Scandinavia', 'CentralEurope', 'CentralEurope',
       'Scandinavia', 'CentralEurope', 'Scandinavia', 'CentralEurope',
       'CentralEurope', 'Scandinavia', 'SouthEurope', 'UKIE',
       'Scandinavia', 'Scandinavia', 'Scandinavia', 'CentralEurope',
       'Scandinavia', 'CentralEurope', 'CentralEurope', 'CentralEurope',
       'CentralEurope', 'CentralEurope', 'CentralEurope', 'CentralEurope',
       'CentralEurope', 'Scandinavia', 'Scandinavia', 'CentralEurope',
       'CentralEurope', 'CentralEurope', 'Scandinavia', 'CentralEurope',
   

In [6]:
#experiments: should only need to specify the data and the params, then get a result file back
import itertools

#each nn_dims entry: [original size, low level dimension, [[encoder_shape], [decoder_shape]]]
nn_dims = [
    [10, 5, [[7], [7]]],
    [15, 5, [[10], [10]]],
    [20, 5, [[15], [15]]],
    [20, 10, [[15], [15]]],
    [50, 10, [[30], [30]]],
    [50, 5, [[20], [20]]],
    [100, 20, [[40], [40]]],
    [100, 10, [[20], [20]]],
]
lr_vals = [1e-3, 1e-4]
lam_vals = [0, 1e-12, 1e-11, 1e-10, 1e-8, 1e-5, 1e-2, 1]
epoch_vals = [100, 200, 300, 400, 500, 1000]

#full Cartesian grid: one (nn_dims, lr, lam, epochs) tuple per experiment
param_list = list(itertools.product(nn_dims, lr_vals, lam_vals, epoch_vals))
len(param_list)

768

In [25]:
#run experiments with location as target
# Re-shuffles the rows of the shared global `data` in place; no seed is set,
# so the row order differs on every execution of this cell.
np.random.shuffle(data)

# Rebind the globals: first 130 columns as features, column -7 as the labels.
features = data[:,:130]
targets = data[:,-7]
# Runs the full grid in `param_list` (3 trials per entry) and writes the
# results to "atlas_test_location.csv" in the current working directory.
# verbose/normalised keep their defaults (False), so rows are sum-normalised.
PCA_Classification(features, targets, param_list,filename="atlas_test_location")

  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, ke























































































































  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, 























































































































  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, ke

























































































































In [None]:
#run experiments with weight as target

# Re-shuffles the rows of the shared global `data` in place; no seed is set,
# so the row order differs on every execution of this cell.
np.random.shuffle(data)

# Rebind the globals: first 130 columns as features, column -3 as the labels.
# NOTE(review): the targets are label-encoded and classified downstream, so
# column -3 presumably holds categorical weight groups rather than a
# continuous weight -- confirm against the dataset.
features = data[:,:130]
targets = data[:,-3]
# Runs the full grid in `param_list` (3 trials per entry) and writes the
# results to "atlas_test_weight.csv" in the current working directory.
PCA_Classification(features, targets, param_list,filename="atlas_test_weight")

In [7]:
#run experiments with gender as target

# Re-shuffles the rows of the shared global `data` in place; no seed is set,
# so the row order differs on every execution of this cell.
np.random.shuffle(data)


# Rebind the globals: first 130 columns as features, column -8 as the labels.
features = data[:,:130]
targets = data[:,-8]
# Runs the full grid in `param_list` (3 trials per entry) and writes the
# results to "atlas_test_gender.csv" in the current working directory.
PCA_Classification(features, targets, param_list,filename="atlas_test_gender")

  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )

  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  gradU -= gradU.mean( 1, keepdims=True )
  gradU -= gradU.mean( 1, keepdims=True )
