In [16]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

In [2]:
# Root directory containing the per-model sub-folders processed by this notebook.
main_dir = '../selected_models_for_production/'

In [6]:
# Keep only the entries of main_dir that are directories, as paths joined onto main_dir.
fldrs = list(
    filter(
        os.path.isdir,
        (os.path.join(main_dir, entry) for entry in os.listdir(main_dir)),
    )
)

In [11]:
# For every model folder, pick the (rfe, ml) pair with the highest 'acc' value in
# that folder's pseudo_metrics.csv; these pairs are the selected models.
top_rfe_ml_per_model_ = {}
for fldr in fldrs:
    # The first two CSV columns form the MultiIndex whose labels are unpacked as (rfe, ml).
    df_psudo_scores = pd.read_csv(os.path.join(fldr, 'pseudo_metrics.csv'), index_col=[0, 1])
    # idxmax() returns the index label of the best row directly, so the result does not
    # depend on argmax() being positional and needs no round-trip through .index[...].
    rfe, ml = df_psudo_scores['acc'].idxmax()
    top_rfe_ml_per_model_[fldr] = (rfe, ml)

In [12]:
# Display the (rfe, ml) pair chosen for each model folder.
top_rfe_ml_per_model_

{'../selected_models_for_production/20210711_143358_std_comm': ('lr', 'svm'),
 '../selected_models_for_production/20210711_175029_std_tot': ('lsvm', 'lsvm'),
 '../selected_models_for_production/20210711_185316_std_cog': ('lr', 'xgb'),
 '../selected_models_for_production/20210711_232703_std_man': ('lsvm', 'lsvm')}

In [23]:
# Build one bundle per behaviour: load the folder's data and pickled objects, refit the
# selected estimator on all rows of that folder's dataset, and store it together with
# the normalizer and the selected feature selector.
production_models = {}
for fldr in fldrs:
    # The behaviour name is whatever follows the last underscore of the folder path.
    behav_name = fldr.rsplit('_', 1)[-1]

    df = pd.read_csv(os.path.join(fldr, 'group_df_afterFixation.csv'), index_col=0)
    # Drop the three fixed columns plus every column whose name contains
    # 'categories_' or 'SRS_'.
    # NOTE(review): 'AGE_AT_SCAN ' carries a trailing space; presumably this matches
    # the CSV header exactly -- confirm against the data file.
    cols_2_del = ['DX_GROUP', 'AGE_AT_SCAN ', 'SEX'] + [
        col for col in df.columns if 'categories_' in col or 'SRS_' in col
    ]
    df = df.drop(columns=cols_2_del)

    # Load the three pickled objects stored next to the data.
    # dill.load can execute arbitrary code from the file; only load trusted artefacts.
    artefacts = {}
    for pickle_name in ('normalizer.p', 'ML_obj.p', 'FS_obj.p'):
        with open(os.path.join(fldr, pickle_name), 'rb') as f:
            artefacts[pickle_name] = dill.load(f)
    normalizer = artefacts['normalizer.p']
    ml_obj = artefacts['ML_obj.p']
    fs_obj = artefacts['FS_obj.p']

    # Split features from labels, then apply the loaded normalizer to the features.
    X = df.drop(columns='my_labels')
    y = df['my_labels'].values
    Xs = normalizer.transform(X)

    # Look up the pair selected for this folder and apply its feature selector.
    selected_rfe, selected_ml = top_rfe_ml_per_model_[fldr]
    feature_selector = fs_obj[selected_rfe]
    Xselected = feature_selector.transform(Xs)

    # Refit the search's best estimator on every row; the value returned by fit()
    # is kept as the trained model.
    trained_obj = ml_obj[selected_rfe][selected_ml].best_estimator_.fit(Xselected, y)

    # This confusion matrix is computed on the same samples the model was just fit on,
    # so it reflects training fit only, not held-out performance.
    yhat = trained_obj.predict(Xselected)
    print(f'{behav_name}: {confusion_matrix(y,yhat)}')

    production_models[behav_name] = {
        'normalizer': normalizer,
        'rfe': feature_selector,
        'ml': trained_obj,
    }

comm: [[166   0]
 [  0 166]]
tot: [[147  42]
 [ 41 144]]




Parameters: { "colsample_bytree", "gamma", "min_child_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


cog: [[  0 127]
 [  0 127]]
man: [[166  20]
 [ 18 168]]


In [24]:
# Serialize the assembled production_models dict to a single dill file.
models_pickle_path = '../selected_models_for_production/trained_normalizer_rfe_models.p'
with open(models_pickle_path, 'wb') as f:
    dill.dump(production_models, f)

In [25]:
# Display the assembled bundles ('normalizer', 'rfe', 'ml') keyed by behaviour name.
production_models


{'comm': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LogisticRegression(max_iter=1000000000), n_jobs=-1,
        scoring='balanced_accuracy', verbose=3),
  'ml': SVC(C=0.1, coef0=100, degree=4, kernel='poly', max_iter=1000000000)},
 'tot': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LinearSVC(max_iter=1000000000), n_jobs=-1,
        scoring='balanced_accuracy', verbose=3),
  'ml': LinearSVC(C=0.1, max_iter=1000000000)},
 'cog': {'normalizer': StandardScaler(),
  'rfe': RFECV(cv=5, estimator=LogisticRegression(max_iter=1000000000), n_jobs=-1,
        scoring='balanced_accuracy', verbose=3),
  'ml': XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
                colsample_bynode=None, colsample_bytree=0.8, gamma=100, gpu_id=-1,
                importance_type='gain', interaction_constraints=None,
                learning_rate=1, max_delta_step=None, max_depth=None,
                min_child_weight=0.01, missing=nan, monoto