# Classifier Selection 

Comparing multiple dimensionality reduction techniques on the full dataset, it seems that it might be possible to find a linear classifier that seperates these classes.  I'm going to give two basic classifiers a shot on both the full data and the DMN subset. However, since I have only $200$ samples I'm going to need to do some unsupervised dimensionality reduction on the full set of autocorrelation values. 

In [1]:

import sys
sys.path.append('..') #workaround to deal with directory issues in notebooks

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)

from xgboost import XGBClassifier

from src.models import train_model 
from src.features import load_features


In [2]:
data_dir='../data/'
class_labels,two_class_labels,pos_str,neg_str,clus_co,ar_array,num_regions,num_subjs=load_features(data_dir)

In [12]:
x=np.concatenate((ar_array,pos_str,neg_str,clus_co),axis=0).transpose() # sets up correctly for scikit learn input later


#do model selection based on a test train split
x_train, x_test, y_train, y_test = train_test_split(x, two_class_labels, test_size=.20, random_state=42,stratify=two_class_labels)

cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
scale=StandardScaler()
x_train_sc=scale.fit_transform(x_train)
x_test_sc=scale.transform(x_test)

#pipelines=[pipe_svc,pipe_XGB,pipe_pca_svc,pipe_pca_XGB,pipe_MI_svc,pipe_MI_XGB]
pipelines=["svc","xgb"]
subsets={"Auto":np.array([*range(num_regions,2*num_regions+1)])}

feat_names=['Auto','Pos','Neg','CC']



In [13]:
fit_models,score,estimator=train_model.multi_subset_pipeline(x_train_sc,y_train,cv,subsets,pipelines,save_flag=False)

In [15]:
print(score)

[0.5618181818181818, 0.5336363636363636]


In [None]:
All_feat_test=list()
All_feat_test_jack=list()
for pipe, param, name in zip(pipelines,params,names):
    print(f'Classifier Evaluation for {name}')
    search=GridSearchCV(estimator=pipe,param_grid=param,scoring=scoring,refit='Bal_Acc',cv=CV)
    search.fit(X_train_sc, y_train)
    All_feat_test.append(search.score(X_test_sc, y_test))
    All_feat_test_jack.append(jackknife_variance(X_test_sc,y_test,search))
    print(f'For all features')
    print(f'Best score on training data is {search.best_score_}')
    print(f'Using a classifier with the following parameters {search.best_estimator_} \n')
    print(f'On the test set the classifier has an accuracy of {All_feat_test[-1]} with variance {All_feat_test_jack[-1]} and the following report \n')
    print(classification_report(y_test,search.predict(X_test_sc)))
