In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import scipy.io as io

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold, StratifiedKFold

import sys
from IPython.display import clear_output

# Cap every common numerical backend at one thread per process.
# NOTE(review): numpy is imported earlier in this cell, so these settings may
# only affect processes started afterwards (e.g. parallel workers) -- confirm.
os.environ.update(dict.fromkeys(
    ('OMP_NUM_THREADS',
     'OPENBLAS_NUM_THREADS',
     'MKL_NUM_THREADS',
     'VECLIB_MAXIMUM_THREADS',
     'NUMEXPR_NUM_THREADS'),
    '1'))

def verbose(text):
    """Replace the current cell output with `text`.

    The previous output is cleared lazily (``wait=True``), i.e. only once the
    new text is ready to be shown, and stdout is flushed right after printing.
    """
    clear_output(wait=True)
    print(text, flush=True)
    
def errorfill(x, y, yerr, color=None, label=None, alpha_fill=0.3, ax=None):
    """Draw a curve together with a shaded error band.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the curve.
    yerr : scalar, array-like or pair
        Either a symmetric error (a scalar, or one value per point of `y`),
        or a pair ``(ymin, ymax)`` giving the lower and upper band edges.
        When ``len(y) == 2`` a length-2 `yerr` is read as a symmetric error.
    color : optional
        Colour shared by the curve and the band; when None, the next colour
        of the axes' colour cycle is used.
    label : str, optional
        Legend label. It is attached to the curve only, so that a legend
        shows a single entry per call.
    alpha_fill : float
        Opacity of the shaded band.
    ax : optional
        Axes to draw on; defaults to the current axes.

    Returns
    -------
    The axes that were drawn on.

    Raises
    ------
    ValueError
        If `yerr` is neither a scalar, nor of the same length as `y`,
        nor of length 2. Raised before anything is drawn.
    """
    ax = ax if ax is not None else plt.gca()
    if color is None:
        color = ax._get_lines.get_next_color()
    # Accept plain sequences for y as well as arrays.
    y = np.asarray(y)
    if np.isscalar(yerr) or len(yerr) == len(y):
        ymin = y - yerr
        ymax = y + yerr
    elif len(yerr) == 2:
        ymin, ymax = yerr
    else:
        raise ValueError(
            'yerr must be a scalar, have the same length as y, '
            'or be a (ymin, ymax) pair')
    ax.plot(x, y, color=color, label=label)
    # No label here: labelling the band too would duplicate the legend entry.
    ax.fill_between(x, ymax, ymin, color=color, alpha=alpha_fill)
    return ax

In [None]:
# Archive holding the ERP arrays used below. The commented-out lines point to
# alternative archives (TODO confirm what the b50 / b100 suffixes stand for).
filename = '../shareddata/Project_ASD/ERPs.npz'
# filename = '../shareddata/Project_ASD/ERPs-b50.npz'
# filename = '../shareddata/Project_ASD/ERPs-b100.npz'
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it with trusted files.
data = np.load(filename, allow_pickle=True)

In [None]:
# Per-group ERP arrays, indexed as [participants, catch/target, C1/C2/C3/C4]
X_asd_, X_typ_ = (data[key] for key in ('ERPs_ASD', 'ERPs_TYP'))

In [None]:
# Group sizes, and the per-recording layout (n_elec, n_t, <third axis>) read
# from the first ASD entry.
n_asd, n_typ = X_asd_.shape[0], X_typ_.shape[0]
n_elec, n_t, _ = X_asd_[0,0,0].shape

# Labels: 0 for the ASD group, 1 for the TYP group (ASD participants first).
y = np.concatenate((np.zeros(n_asd), np.ones(n_typ)))

# catch/target | C1/C2/C3/C4 | participant | time | electrode
X_all = np.zeros((2, 4, n_asd + n_typ, n_t, n_elec))

for _tc in range(2):
    for _cond in range(4):
        # Average each participant's entry over its third axis, keeping that
        # axis so the per-participant results can be stacked along it.
        _averaged = [X_asd_[p, _tc, _cond].mean(2, keepdims=True)
                     for p in range(n_asd)]
        _averaged += [X_typ_[p, _tc, _cond].mean(2, keepdims=True)
                      for p in range(n_typ)]
        # (n_elec, n_t, participants) -> (participants, n_t, n_elec)
        X_all[_tc, _cond] = np.concatenate(_averaged, 2).T

# Keep only the participants whose data contain no NaN anywhere.
idx = ~np.isnan(X_all.mean((0,1,3,4)))
X_all = X_all[:,:,idx]
y = y[idx]

In [None]:
# Unpack the dimensions of X_all (catch/target, conditions, participants, time,
# electrodes). This rebinds n_t and n_elec, and n_participants is the count
# left in X_all at this point.
n_tc, n_conds, n_participants, n_t, n_elec = X_all.shape

In [None]:
# Model-selection setup: PCA followed by a linear classifier, with the
# hyper-parameters listed in `param_grid` tuned by grid search.
pca = PCA()
lr = 1  # truthy: logistic regression; falsy: linear discriminant analysis
# NOTE(review): the upper bound (37 components) presumably tracks the size of
# the training folds -- confirm if the data set changes.
test_pca = np.arange(1,38,1)    # numbers of PCA components to try
test_C = np.logspace(-5, 0, 6)  # values of the logistic-regression C to try
n_C = test_C.shape[0]
n_pca = test_pca.shape[0]

if lr:
    # NOTE(review): tol=1.0 is a very loose stopping tolerance -- confirm it
    # is intentional.
    clf = LogisticRegression(max_iter=5000, tol=1.0)
    param_grid = {
        'pca__n_components': test_pca,
        'clf__C': test_C,
    }
else:
    clf = LinearDiscriminantAnalysis(tol=1e-3)
    param_grid = {
        'pca__n_components': test_pca,
    }

# clf = QuadraticDiscriminantAnalysis(tol=1e-3)
pipe = Pipeline(steps=[('pca', pca), ('clf', clf)])
# search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=1, verbose=10)

In [None]:
# Run one grid search per (catch/target, condition) pair and keep the fitted
# GridSearchCV objects in a 2-D object array.
search_all = np.zeros((n_tc,n_conds), dtype=object)
for i in range(n_tc):
    for j in range(n_conds):
        # One row per participant, all (time, electrode) values flattened.
        X_flat = X_all[i,j].reshape(n_participants,n_t*n_elec)

        # Using the squared signal as well improves performance:
        #X_flat = np.concatenate([X_all[i,j].reshape(n_participants,n_t*n_elec),
        #                         X_all[i,j].reshape(n_participants,n_t*n_elec)**2],
        #                        axis=1)

        # Standardise each feature into a NEW array: the reshape above can
        # return a view of X_all, so in-place `-=` / `/=` would silently
        # overwrite X_all and make this cell non-idempotent.
        # NOTE(review): the statistics are computed on all participants before
        # cross-validation, so held-out folds influence the scaling; moving the
        # scaling into the pipeline would avoid that.
        X = X_flat - X_flat.mean(0, keepdims=True)
        scale = X.std(0, keepdims=True)
        scale[scale == 0] = 1.0  # constant features stay at zero instead of NaN
        X = X / scale

        search_all[i,j] = GridSearchCV(pipe, param_grid,
                                       cv=5, n_jobs=15, verbose=0)
        search_all[i,j].fit(X, y)
        verbose('%i,%i'%(i,j))

In [None]:
# One panel per (catch/target, condition) pair showing the mean CV score as a
# function of the number of PCA components, with a +/- standard-error band.
# squeeze=False keeps `ax` 2-D even if there is a single row or column.
fig, ax = plt.subplots(n_tc,n_conds,figsize=(12,6),squeeze=False)
for i in range(n_tc):
    for j in range(n_conds):
        gs = search_all[i,j]
        print("Best parameter (CV score=%0.3f):" % gs.best_score_)
        print(gs.best_params_)

        mean_score = gs.cv_results_['mean_test_score']
        # Standard error over folds; the fold count is read from the fitted
        # search rather than hard-coded.
        sem_score = gs.cv_results_['std_test_score']/np.sqrt(gs.n_splits_)

        if lr:
            # One curve per value of C. The reshape relies on the grid being
            # enumerated with clf__C as the slow axis and pca__n_components
            # as the fast axis.
            mean_score = mean_score.reshape(n_C,n_pca)
            sem_score = sem_score.reshape(n_C,n_pca)
            for k in range(n_C):
                errorfill(test_pca, mean_score[k], sem_score[k], ax=ax[i,j])
        else:
            errorfill(test_pca, mean_score, sem_score, ax=ax[i,j])
        ax[i,j].set_ylim(0.4,0.95)
        


## Things to try
- Concatenate the conditions C1/C2/C3/C4
- Use non-linear methods like kernel methods (but we are already overfitting)
- Craft our own Quadratic Discriminant Analysis with diagonal cov matrices
- Use L1 penalty + logistic regression (well suited to finding a relevant
low-dimensional space)
- Use brute-force PCA component selection (trying all intervals [i,j])