In [1]:
import numpy as np
import scipy.io as io
import pandas as pd
import sys
import os
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
os.environ['NUMEXPR_NUM_THREADS'] = '1'
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold, StratifiedKFold
from IPython.display import clear_output

In [3]:
directory = '/home/data/Project_ASD/' + os.environ["USER"]+'/' #feel free to upload data from /Project_ASD/seyda
baseMethod='baseline50'
filename = '/X_all-baseline50-floating2.npz'
data = np.load(directory+filename, allow_pickle=True)
X_all = data['X_all'] # n_trtype, n_conds,n_subj, n_time, n_elec 
subjects = data['subjects']
n_trtype, n_conds,n_subj, n_time, n_elec = X_all.shape
subj2exclude=['10594'] #subj2exclude=['10056','12144','12360','1160','12005']
ind_ex = [i for i in range(subjects.shape[0]) if subjects[i,2] in subj2exclude]
X_all = np.delete(X_all, ind_ex, axis=2)
subjects = np.delete(subjects, ind_ex, axis=0)
print('Removed '+str(len(ind_ex))+' subjects')
#check for nan values. do not delte this. change indices to find nans in dfr dimensions
idx = pd.isnull(X_all.mean((0,1,3,4)))== False #RuntimeWarning should be ok
X_all = X_all[:,:,idx]
subjects = subjects[idx]
assert subjects.shape[0] == X_all.shape[2]
n_subj = subjects.shape[0]

In [52]:
test_pca = np.arange(1,n_subj,15) #   
test_C=np.logspace(-3, 0, 3)  #-3, 0, 41) np.logspace(-5, 0, 6)
pca = PCA()
param_grid = {'pca__n_components': test_pca,'clf__C': test_C}
print('Testing '+ str(param_grid['pca__n_components'].shape[0]) +' different n_pca and '+str(param_grid['clf__C'].shape[0]) +' regularization parameters')

Testing 3 different n_pca and 3 regularization parameters


In [53]:
clfs={}
clfs['LR-L2']= LogisticRegression(max_iter=5000, tol=1.0) #default=’l2’
clfs['LR-L1']= LogisticRegression(penalty='l1', solver='liblinear', max_iter=5000, tol=1.0)
clfs['LDA']=  LinearDiscriminantAnalysis(tol=1e-3)
clfs['QDA'] = QuadraticDiscriminantAnalysis(tol=1e-3)
clf_key ='LR-L1' #this is the only line you are likely to edit 
pipe = Pipeline(steps=[('pca', pca), ('clf', clfs[clf_key])])

In [54]:
n_trtype, n_conds,n_subj, n_time, n_elec = X_all.shape
y=subjects[...,0]
search = np.zeros((n_trtype,5), dtype=object)
X_all_40elec=np.take(X_all, range(160)[::4], 4)

for i in range(n_trtype):
    for j in range(n_conds):
        clear_output(wait=True)
        print(i, j)
        X1 = X_all[i,j].reshape(n_subj,n_time*n_elec)**2        
        #X -= X.mean(0, keepdims=True)
        #X /= X.std(0, keepdims=True)
        search[i,j] = GridSearchCV(pipe, param_grid, cv=5, n_jobs=15, verbose=0)
        search[i,j].fit(X1, y);
    X2 = X_all_40elec[i].reshape(n_subj,n_time*40*n_conds)**2
    search[i,4] = GridSearchCV(pipe, param_grid, cv=5, n_jobs=15, verbose=0)
    search[i,4].fit(X2, y);
    assert X1.shape == X2.shape
    
exp_identifier=clf_key+'-'+str(y.shape[0])+'x'+str(X1.shape[1])+'features'+'-pca'+str(param_grid['pca__n_components'].shape[0])+'-C'+str(param_grid['clf__C'].shape[0])


7 3


In [55]:
#calculate and write best scores 
import time
year, month, day, hour, min = map(str, time.strftime("%Y %m %d %H %M").split())

best = np.zeros((n_trtype,5,3), dtype=object)
for i in range(n_trtype):
    for j in range(5):
        best[i,j,0]=search[i,j].best_score_*100
        best[i,j,1]=search[i,j].best_params_['clf__C']
        best[i,j,2]=search[i,j].best_params_['pca__n_components']
with open('best.csv','a') as fd:
    fd.write("\n"+ year+'-'+month+'-'+day+'-'+hour+min+"\n")
    fd.write(exp_identifier+"\n")   
    fd.write(str(best)+"\n")

best

array([[[58.21428571428571, 0.03162277660168379, 16],
        [63.21428571428571, 0.03162277660168379, 16],
        [54.64285714285714, 0.001, 1],
        [63.21428571428571, 0.03162277660168379, 16],
        [49.999999999999986, 0.001, 1]],

       [[55.35714285714286, 0.001, 1],
        [57.857142857142854, 0.001, 1],
        [55.35714285714286, 0.001, 1],
        [57.857142857142854, 0.001, 1],
        [55.00000000000001, 0.03162277660168379, 16]],

       [[62.857142857142854, 0.001, 16],
        [70.71428571428571, 0.001, 16],
        [65.35714285714285, 0.03162277660168379, 16],
        [65.71428571428571, 0.001, 1],
        [73.92857142857143, 0.03162277660168379, 16]],

       [[62.85714285714285, 0.001, 16],
        [62.85714285714285, 0.001, 16],
        [70.71428571428571, 0.001, 16],
        [68.21428571428572, 0.001, 16],
        [66.42857142857143, 0.001, 16]],

       [[71.07142857142857, 0.001, 1],
        [73.92857142857143, 0.001, 1],
        [68.92857142857143, 0.001

In [56]:
# Add notes if you want 
#with open('best.csv','a') as fd:
#    fd.write("Now I will starting to use the real dataset with all four conditions. recently calculated")


In [None]:
# save cv_results  
mean_score = np.zeros((n_trtype,5), dtype=object)
std_score = np.zeros((n_trtype,5), dtype=object)
for i in range(n_trtype):
    for j in range(5):
        for k in range(test_C.shape[0]):
            std_score[i,j]=search[i,j].cv_results_['std_test_score'].reshape(test_C.shape[0],test_pca.shape[0])
            mean_score[i,j]=search[i,j].cv_results_['mean_test_score'].reshape(test_C.shape[0],test_pca.shape[0])
#exp_identifier looks like this: LR-L2-pca12-C21-78080features
save_name=year+'-'+month+'-'+day+'-'+hour+min+'-'+exp_identifier+'.npz' 
np.savez(directory+'/scores/'+save_name, mean_score=mean_score, std_score=std_score,test_C=test_C,test_pca=test_pca,best=best)


In [None]:
X1 = X_all[i,:,j].reshape(n_conds,n_time*n_elec)**2        
search[i,j] = GridSearchCV(pipe, param_grid, cv=2, n_jobs=15, verbose=0) #cv is number of splits 
search[i,j].fit(X1, y);


In [80]:
# COMPARE SUBJECT SCORES, DECODING CONDITIONS 
# n_trtype, n_conds,n_subj, n_time, n_elec = X_all.shape

test_pca =np.array([1,5,10,20,30]) 
test_C=np.logspace(-3, 0, 4) 
pca = PCA()
param_grid = {'pca__n_components': test_pca,'clf__C': test_C}
clf_key ='LR-L1' #this is the only line you are likely to edit 
pipe = Pipeline(steps=[('pca', pca), ('clf', clfs[clf_key])])

y=np.arange(1,5,1)
y=np.concatenate((y,y,y,y,y,y,y,y),axis=0)
search = np.zeros((n_subj), dtype=object)

for j in range(n_subj):
    clear_output(wait=True)
    print(i, j)
    X1 = X_all[:,:,j].reshape(n_trtype*n_conds,n_time*n_elec)**2        
    search[j] = GridSearchCV(pipe, param_grid, cv=4, n_jobs=15, verbose=0) #cv is number of splits 
    search[j].fit(X1, y);

exp_identifier=clf_key+'-compareSubjScores-'+str(y.shape[0])+'x'+str(X1.shape[1])+'features'+'-pca'+str(param_grid['pca__n_components'].shape[0])+'-C'+str(param_grid['clf__C'].shape[0])


0 37


In [81]:
#calculate and write best scores 
import time
year, month, day, hour, min = map(str, time.strftime("%Y %m %d %H %M").split())

best = np.zeros((n_subj,3), dtype=object)
for j in range(n_subj):
    best[j,0]=search[j].best_score_*100
    best[j,1]=search[j].best_params_['clf__C']
    best[j,2]=search[j].best_params_['pca__n_components']
with open('best.csv','a') as fd:
    fd.write("\n"+ year+'-'+month+'-'+day+'-'+hour+min+"\n")
    fd.write(exp_identifier+"\n")   
    fd.write(str(best)+"\n")

best

array([[50.0, 0.1, 20],
       [50.0, 1.0, 20],
       [40.625, 0.01, 20],
       [37.5, 0.01, 1],
       [37.5, 0.001, 10],
       [40.625, 0.1, 10],
       [37.5, 1.0, 20],
       [37.5, 0.01, 20],
       [40.625, 0.001, 10],
       [40.625, 0.001, 5],
       [50.0, 1.0, 20],
       [37.5, 0.001, 1],
       [56.25, 0.1, 20],
       [40.625, 0.01, 10],
       [37.5, 0.01, 10],
       [37.5, 0.01, 5],
       [43.75, 1.0, 10],
       [34.375, 1.0, 5],
       [43.75, 0.001, 5],
       [40.625, 0.01, 20],
       [43.75, 1.0, 20],
       [46.875, 0.01, 20],
       [50.0, 1.0, 20],
       [37.5, 0.01, 10],
       [40.625, 0.1, 20],
       [37.5, 0.01, 20],
       [43.75, 1.0, 20],
       [43.75, 0.01, 20],
       [43.75, 0.01, 10],
       [50.0, 0.1, 20],
       [40.625, 0.01, 10],
       [31.25, 0.001, 5],
       [43.75, 0.001, 20],
       [40.625, 0.01, 20],
       [56.25, 1.0, 20],
       [53.125, 0.1, 20],
       [53.125, 1.0, 20],
       [53.125, 1.0, 5]], dtype=object)

In [82]:
# save cv_results  
mean_score = np.zeros((n_subj), dtype=object)
std_score = np.zeros((n_subj), dtype=object)
for j in range(n_subj):
    for k in range(test_C.shape[0]):
        std_score[j]=search[j].cv_results_['std_test_score'].reshape(test_C.shape[0],test_pca.shape[0])
        mean_score[j]=search[j].cv_results_['mean_test_score'].reshape(test_C.shape[0],test_pca.shape[0])
#exp_identifier looks like this: LR-L2-pca12-C21-78080features
save_name=year+'-'+month+'-'+day+'-'+hour+min+'-'+exp_identifier+'.npz' 
np.savez(directory+'/scores/'+save_name, mean_score=mean_score, std_score=std_score,test_C=test_C,test_pca=test_pca,best=best)


In [None]:
# COMPARE SUCCESS RATES  ACROSS TIME INTERVALS 
# IN PROGRESS
# n_trtype, n_conds,n_subj, n_time, n_elec = X_all.shape

test_pca =np.array([1,4]) 
test_C=np.logspace(-3, 0, 10) 
pca = PCA()
param_grid = {'pca__n_components': test_pca,'clf__C': test_C}
clf_key ='LR-L1' #this is the only line you are likely to edit 
pipe = Pipeline(steps=[('pca', pca), ('clf', clfs[clf_key])])

y=np.arange(0,8,1)
y=np.concatenate((y,y,y,y),axis=0)
search = np.zeros((n_subj), dtype=object)

from ms2time import ms2time
for w in range(9):
    ind_time=ms2time(w*100+50, w*100+150)
    
 