In [5]:
import dill
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, balanced_accuracy_score

In [22]:
main_dir = '../models/'

In [23]:
fldrs = [os.path.join(main_dir, x) for x in os.listdir(main_dir) if os.path.isdir(os.path.join(main_dir, x))]

In [24]:
# Get best models based on the pseudometric csv file to be used as selected models
top_rfe_ml_per_model_ = {}
for fldr in fldrs:
    df_psudo_scores = pd.read_csv(os.path.join(fldr,'pseudo_metrics.csv'), index_col=[0,1])
    max_loc = df_psudo_scores['acc'].argmax()
    rfe, ml = df_psudo_scores.index[max_loc]
    top_rfe_ml_per_model_[fldr] = (rfe, ml)

In [25]:
top_rfe_ml_per_model_

{'../models/20210817_134614': ('lsvm', 'gradboost'),
 '../models/20210815_213208_perc_ubuntu_comm': ('lsvm', 'gradboost'),
 '../models/20210816_230313_perc_ubuntu_cog': ('lsvm', 'gradboost'),
 '../models/20210815_170500_perc_ubuntu_mot': ('lsvm', 'gradboost'),
 '../models/20210712_122841_ubuntu_std_mot': ('lsvm', 'gradboost'),
 '../models/20210711_175827_ubuntu_std_awareness': ('lr', 'gradboost')}

In [26]:
production_models = {}
for fldr in fldrs:
    behav_name = fldr.split('_')[-1]
    df = pd.read_csv(os.path.join(fldr, 'group_df_afterFixation.csv'), index_col=0)
    cols_2_del = ['DX_GROUP','AGE_AT_SCAN ', 'SEX']
    for col in df.columns:
        if 'categories_' in col:
            cols_2_del.append(col)
        elif 'SRS_' in col:
            cols_2_del.append(col)
    df.drop(cols_2_del, axis=1, inplace=True)
    
    with open(os.path.join(fldr,'normalizer.p'), 'rb') as f:
        normalizer = dill.load(f)
    with open(os.path.join(fldr,'ML_obj.p'), 'rb') as f:
        ml_obj = dill.load(f)
    with open(os.path.join(fldr, 'FS_obj.p'), 'rb') as f:
        fs_obj = dill.load(f)
    X = df.drop('my_labels', axis=1)
    y = df['my_labels'].values
    Xs = normalizer.transform(X)
    
    selected_rfe = top_rfe_ml_per_model_[fldr][0]
    selected_ml = top_rfe_ml_per_model_[fldr][1]
    
    Xselected = fs_obj[selected_rfe].transform(Xs)
    trained_obj = ml_obj[selected_rfe][selected_ml].best_estimator_.fit(Xselected, y)
    yhat = trained_obj.predict(Xselected)
    score = balanced_accuracy_score(y, yhat)
    print(f'{behav_name}: {confusion_matrix(y,yhat)}')

    production_models[behav_name] = {
        'normalizer': normalizer,
        'rfe': fs_obj[selected_rfe],
        'ml': trained_obj,
        'score': score
    }

134614: [[ 13 176]
 [  5 180]]
comm: [[166   0]
 [166   0]]
cog: [[125   2]
 [126   1]]
mot: [[  2 114]
 [  0 116]]
mot: [[  0 116]
 [  0 116]]
awareness: [[11 94]
 [ 8 97]]


In [13]:
with open('../selected_models_for_production/trained_normalizer_rfe_models.p', 'wb') as f:
    dill.dump(production_models, f)

In [27]:
production_models


{'134614': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LinearSVC(max_iter=1000000000), n_jobs=-1,
        scoring='balanced_accuracy', verbose=3),
  'ml': GradientBoostingClassifier(criterion='mse', learning_rate=4.1,
                             loss='exponential', max_depth=38, min_samples_leaf=3,
                             min_samples_split=3, n_estimators=450, subsample=0.5,
                             warm_start=True),
  'score': 0.5208780208780209},
 'comm': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LinearSVC(max_iter=1000000000), n_jobs=-1,
        scoring='balanced_accuracy', verbose=3),
  'ml': GradientBoostingClassifier(max_depth=26, min_samples_leaf=9,
                             min_samples_split=6, n_estimators=150, subsample=0.7,
                             warm_start=True),
  'score': 0.5},
 'cog': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LinearSVC(max_iter=1000000000), n_jobs=-1,
        scoring='bala

In [4]:
# Get the ubuntu models

In [12]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
MAX_ITR = 1e9
CLC_DICT = {
    'lsvm': lambda: LinearSVC(max_iter=MAX_ITR),
    'pagg': lambda: PassiveAggressiveClassifier(max_iter=MAX_ITR),
    'lr': lambda: LogisticRegression(max_iter=MAX_ITR),
    'sgd': lambda: SGDClassifier(max_iter=MAX_ITR),
    'ridge': lambda: RidgeClassifier(max_iter=MAX_ITR),
    'knn': KNeighborsClassifier,
    'xgb': XGBClassifier,
    'gnb': GaussianNB,
    'rf': RandomForestClassifier,
    'svm': lambda: SVC(max_iter=MAX_ITR),
    'nn': lambda: MLPClassifier(max_iter=MAX_ITR),
    'gradboost': GradientBoostingClassifier
}

In [16]:
def load_classifier_from_hyperparameterJson(json_fldr):
    json_files = [x for x in os.listdir(json_fldr)]
    clc_dict = {}
    for file in json_files:
        rfe, ml = file.split('_')
        clc_dict[rfe] = {}
    for file in json_files:
        full_path = os.path.join(json_fldr, file)
        print(full_path)
        with open(full_path, 'r') as f:
            data = json.load(f)
        rfe_clc, ml_clc = file.split('_')
        clc = CLC_DICT[ml_clc]()
        for x in clc.get_params().keys():
            if x not in data.keys():
                print(f'{file} missing {x}')
        hypparam_dict = {x:data[x] for x in clc.get_params().keys()}
        clc.set_params(**hypparam_dict)
        clc_dict[rfe_clc][ml_clc] = clc
    return clc_dict

In [17]:
clc_dict = load_classifier_from_hyperparameterJson('../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/')

../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_svm
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_lr
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_lsvm
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/xgb_xgb
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lr_knn
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_xgb
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lr_lr
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/xgb_svm
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_gradboost
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_rf
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/rf_svm
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/lsvm_knn
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/xgb_knn
../models/20210815_170500_perc_ubuntu_mot/ML_obj_hyperparams/rf_xgb
../models/20210815_170500_p

In [18]:
clc_dict

{'lsvm': {'svm': SVC(C=10, degree=4, kernel='sigmoid', max_iter=1000000000),
  'lr': LogisticRegression(C=0.1, max_iter=1000000000, penalty='none',
                     solver='newton-cg'),
  'lsvm': LinearSVC(C=0.1, loss='hinge', max_iter=1000000000),
  'xgb': XGBClassifier(base_score=None, booster='gblinear', colsample_bylevel=None,
                colsample_bynode=None, colsample_bytree=1.0, gamma=100,
                gpu_id=None, importance_type='gain', interaction_constraints=None,
                learning_rate=0.01, max_delta_step=None, max_depth=None,
                min_child_weight=10, missing=None, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                random_state=None, reg_alpha=0, reg_lambda=0,
                scale_pos_weight=None, subsample=None, tree_method=None,
                validate_parameters=None, verbosity=None),
  'gradboost': GradientBoostingClassifier(criterion='mse', learning_rate=0.5, max_depth=36,


In [79]:
import json
from sklearn.base import BaseEstimator, TransformerMixin
import copy
import numpy as np
class RFEFeaturesBased(BaseEstimator, TransformerMixin):
    def __init__(self, fldr):
        self.data_dir = fldr
        if not os.path.exists(os.path.join(fldr, 'selected_feats.json')):
            raise FileExistsError(f'There is no selected_feats.json inside {fldr}')
        self.selected_feats_json = os.path.join(fldr, 'selected_feats.json')
        
        if not os.path.exists(os.path.join(fldr, 'group_df_beforeFixation.csv')):
            raise FileExistsError(f'There is no group_df_beforeFixation.csv inside {fldr}')
        self.df_dir = os.path.join(fldr, 'group_df_beforeFixation.csv')

        if not os.path.exists(os.path.join(fldr, 'pseudo_metrics.csv')):
            raise FileExistsError(f'There is no pseudo_metrics.csv inside {fldr}')
        self.metric_path = os.path.join(fldr, 'pseudo_metrics.csv')


    def _load_features_from_json(self):
        with open(self.selected_feats_json, 'r') as f:
            feats_dict = json.load(f)
        return feats_dict

    def fit(self, X=None, y=None, **params):
        df = pd.read_csv(self.df_dir, index_col='subj_id')
        df_psudo_scores = pd.read_csv(self.metric_path, index_col=[0,1])
        max_loc = df_psudo_scores['acc'].argmax()
        rfe, ml = df_psudo_scores.index[max_loc]
        self.columns_ = df.columns
        self.feats_dict_ = self._load_features_from_json()
        self.feats_names_ = self.feats_dict_[rfe]
        self.feats_indices_ = []
        names_list = np.zeros(len(self.feats_dict_[rfe]), dtype=np.int32)
        for idx, feat_names in enumerate(self.feats_dict_[rfe]):
            names_list[idx] = self.columns_.get_loc(feat_names)
        self.feats_indices_ = names_list.copy()
        return self

    def transform(self, X, y=None, **params):
        if isinstance(X, pd.DataFrame):
            Xselected = X.loc[:, self.feats_names_]
        elif isinstance(X, np.ndarray):
            Xselected = X[:, self.feats_indices_]
        else:
            raise TypeError("I am only expecting dataframe or numpy.array")

        return Xselected


In [80]:
rfe = RFEFeaturesBased("../models/20210815_170500_perc_ubuntu_mot")

In [81]:
rfe.fit()

RFEFeaturesBased(fldr=None)

In [82]:
df = pd.read_csv("../models/20210815_170500_perc_ubuntu_mot/group_df_afterFixation.csv", index_col='subj_id')


In [83]:
rfe.transform(df)

Unnamed: 0_level_0,area_lbankssts_PERC20,area_lbankssts_PERC60,area_lbankssts_PERC80,area_lcaudalmiddlefrontal_PERC40,area_lcuneus_PERC80,area_linferiorparietal_PERC60,area_linferiorparietal_PERC80,area_linferiortemporal_PERC60,area_listhmuscingulate_PERC20,area_listhmuscingulate_PERC40,...,thickness_rmedialorbitofrontal_PERC80,thickness_rparahippocampal_PERC60,thickness_rparsorbitalis_PERC40,thickness_rsupramarginal_PERC20,volume_lparstriangularis_PERC80,volume_lrostralanteriorcingulate_PERC20,volume_lrostralanteriorcingulate_PERC80,volume_rfrontalpole_PERC80,volume_rinferiortemporal_PERC80,volume_rrostralanteriorcingulate_PERC20
subj_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NYU_1_29195,0.445095,0.777230,0.926085,0.612405,0.981610,0.792177,0.989764,0.785807,0.335082,0.522797,...,3.189817,3.026245,2.786597,2.283281,3.600507,0.683663,2.732946,6.472891,4.166169,0.804925
NYU_1_29194,0.442522,0.806857,0.949495,0.620782,1.008944,0.767726,0.962077,0.786538,0.356588,0.571181,...,3.481484,2.466715,3.080608,2.439013,3.673253,0.832683,3.353953,4.999566,4.356665,0.765148
OHSU_1_28987,0.451809,0.794181,0.922682,0.591217,0.940065,0.735272,0.923287,0.723912,0.312253,0.500315,...,3.226116,3.288098,2.528215,2.366347,3.225922,0.841263,3.205217,4.288923,3.558860,0.765378
KKI_29393,0.385124,0.769144,0.942048,0.624207,1.009778,0.767182,0.967276,0.754234,0.357468,0.580646,...,3.424543,2.594172,2.495086,1.953830,3.311451,0.931516,3.973969,5.404278,3.847251,0.928768
TCD_1_29096,0.447873,0.792640,0.936872,0.622678,1.012486,0.780137,0.984089,0.793882,0.363551,0.563091,...,3.516568,2.863656,2.412265,1.996041,3.094408,0.815096,3.411021,5.245967,4.013608,0.772198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KKI_29442,0.427423,0.799410,0.944582,0.576907,0.980006,0.747623,0.947135,0.753068,0.323458,0.514181,...,3.265309,3.269646,2.641657,2.348416,3.233477,0.855393,3.304828,4.182753,3.301636,0.829448
KKI_29357,0.395082,0.765502,0.930529,0.606829,0.935128,0.754487,0.931458,0.741934,0.341402,0.537614,...,3.551058,3.102281,2.610434,2.454821,3.038855,0.884094,3.983093,4.583903,3.250320,0.872899
GU_1_28786,0.412694,0.820341,0.958457,0.601765,0.911506,0.742665,0.942035,0.738024,0.358867,0.555426,...,3.169122,3.037010,2.420666,2.352403,3.305272,0.856567,3.577724,4.003357,3.722314,0.802226
TCD_1_29121,0.449504,0.812920,0.971209,0.593670,1.032790,0.776601,0.978138,0.806237,0.373159,0.581450,...,3.245388,2.532937,2.708239,2.012580,3.389018,0.849250,3.523644,4.282467,3.327824,1.079282


In [85]:
with open("../models/20210815_170500_perc_ubuntu_mot/selected_feats.json", 'r') as f:
    data = json.load(f)

In [88]:
len(data['xgb'])

77

In [89]:
data['xgb']==rfe.transform(df).columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])