In [1]:
# Execute the companion notebook inside this kernel's namespace. The helpers, wrappers and
# library names used by the cells below (and `random_state`) are not defined in this
# notebook, so they are presumably provided by it -- NOTE(review): confirm.
%run functions.ipynb
# Load the autoreload extension; mode 2 reloads all imported modules before each cell runs.
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Fetch the train/test parts, then report the shape of each feature matrix and how many
# labels equal to 1 and to 0 each part contains.
X_train, X_test, y_train, y_test = load_train_and_test_parts()
print("Train and test sizes: {} {}".format(*(features.shape for features in (X_train, X_test))))
print(
    "(1, 0) labels count in train test: {} {}".format(
        *(
            tuple(np.count_nonzero(labels == label) for label in (1, 0))
            for labels in (y_train, y_test)
        )
    )
)

Train and test sizes: (726, 12179) (243, 12179)
(1, 0) labels count in train test: (289, 437) (104, 139)


In [3]:
# Settings shared by the hyper-parameter search and the nested cross-validation below.
scoring = 'roc_auc'  # metric optimised by RandomizedSearchCV and monitored by EarlyStopping
cv_out = 5           # `cv` of the outer cross_validate call
cv_in = 2            # `cv` of every RandomizedSearchCV (inner loop)
n_iter = 1           # parameter settings sampled per RandomizedSearchCV fit

In [None]:
# Dimensionality-reduction pipelines. get_dim_reduction_pipeline returns a pair that is
# unpacked here into the pipeline and its parameter candidates.
pca_pipeline, pca_params = get_dim_reduction_pipeline(
    PCA(random_state=random_state),
    dict(n_components=[50, 150, 200, 250]),
)
# MLCC is given no candidates to search over.
mlcc_pipeline, mlcc_params = get_dim_reduction_pipeline(MLCCWrapper(), dict())
spca_pipeline, spca_params = get_dim_reduction_pipeline(
    SPCWrapper(),
    dict(
        n_components=[50, 150, 300, 450, 600, 750, 1000],
        threshold_val=[0.01, 0.1, 0.5, 1, 3, 5, 10],
        threshold_mode=['soft', 'hard', 'garrote'],
        max_iter=[25, 50, 100, 250],
    ),
)
# PLS passes stubTransformer as the standardizer (presumably a no-op stand-in for the
# pipeline's own scaling step -- confirm in functions.ipynb).
pls_pipeline, pls_params = get_dim_reduction_pipeline(
    PLSRegressionWrapper(),
    dict(n_components=[50, 150, 300, 450, 600, 750, 1000]),
    standardizer=stubTransformer,
)

# Candidate hyper-parameters for the neural-network classifier. Keys carrying a
# `module__` / `optimizer__` prefix use the double-underscore nested-parameter naming.
clf_params = dict(
    lr=np.linspace(0.0001, 0.1, 50),
    module__dropout=np.linspace(0.0, 0.4, 30),
    module__num_hidden0=[5, 10, 20, 50],
    module__num_hidden1=[5, 10, 20, 50],
    optimizer__weight_decay=[0.001, 0.01, 0.1, 1, 5, 10],
)
# Candidates for the representation step: how many components to keep and which
# estimator is plugged in as its `transformer`.
dim_params = dict(
    n_components=[100, 250, 500],
    transformer=[
        RandomForestClassifier(max_depth=5, min_samples_leaf=20, random_state=random_state),
        RandomLogisticsRegressions(penalty='l1', C=0.5, n_variables=500),
    ],
)

# Stop training once the monitored score has failed to improve by at least 1e-3 (absolute)
# for 10 consecutive epochs. The monitored score is `scoring` ('roc_auc'), for which larger
# values are better, so lower_is_better must be False. skorch's EarlyStopping defaults to
# lower_is_better=True, which would count every rise in AUC as "no improvement" and halt
# training while the network is still learning.
# NOTE(review): assumes EarlyStopping is skorch.callbacks.EarlyStopping (the keyword names
# match its signature) -- confirm in functions.ipynb.
early_stopping = EarlyStopping(
    scoring,
    patience=10,
    threshold_mode='abs',
    threshold=1e-3,
    lower_is_better=False,
)
# Neural-network classifier optimised with Adam for at most 1000 epochs, silent output.
# train_split=None is passed, so the score that early stopping monitors presumably comes
# from the skorch_scoring[scoring] callback evaluated on training data -- TODO confirm.
clf = NeuralNetClassifierWrapper(
    ClassifierModule,
    train_split=None,
    optimizer=torch.optim.Adam,
    callbacks=[skorch_scoring[scoring], early_stopping],
    max_epochs=1000,
    verbose=0,
)

# FDNN: a representation step followed by the neural-network classifier `clf`.
# The RandomForestClassifier() given here is only the initial `transformer`; dim_params
# lists the `transformer` candidates handed to the search.
fdnn_pipeline, fdnn_params = get_dim_reduction_pipeline(
    RepresentationTransformer(transformer=RandomForestClassifier()),
    dim_params,
    clf_params=clf_params,
    clf=clf,
    standardizer=stubTransformer,
)


In [None]:
# Metrics requested from the outer cross-validation loop.
all_scores_names = ['roc_auc', 'precision', 'recall', 'f1']
# Containers filled by later cells: score summaries and search objects, keyed by model name.
estimated_scores = dict()
randomized_cvs = dict()
# Registry of everything that gets benchmarked: display name -> the estimator ('model')
# and the hyper-parameter candidates to sample from ('params').
models = {
    'PCA': dict(model=pca_pipeline, params=pca_params),
    'SPCA': dict(model=spca_pipeline, params=spca_params),
    'MLCC': dict(model=mlcc_pipeline, params=mlcc_params),
    'PLS': dict(model=pls_pipeline, params=pls_params),
    'FDNN': dict(model=fdnn_pipeline, params=fdnn_params),
    'Nearest shrunken centroid': dict(
        model=NearestCentroidWrapper(),
        params=dict(shrink_threshold=np.linspace(0, 5, 50)),
    ),
    'Logistic regression': dict(
        model=LogisticRegression(random_state=random_state),
        params=dict(C=np.linspace(0.01, 0.6, 50), penalty=['l1'], solver=['liblinear']),
    ),
    'Regularized discriminant analysis': dict(
        model=LinearDiscriminantAnalysis(shrinkage='auto', solver='lsqr'),
        params=dict(),
    ),
    'Random forest': dict(
        model=RandomForestClassifier(random_state=random_state),
        params=dict(
            n_estimators=[1000, 2500, 5000],
            max_depth=[4, 6, 8],
            min_samples_leaf=[20, 30],
            bootstrap=[True, False],
            criterion=["gini", "entropy"],
        ),
    ),
    'Elastic net': dict(
        model=SGDClassifier(
            loss='log',
            penalty='elasticnet',
            max_iter=5000,
            tol=1e-3,
            random_state=random_state,
        ),
        params=dict(
            l1_ratio=np.linspace(0.001, 1, 50),
            alpha=np.linspace(0.0001, 0.5, 100),
        ),
    ),
    'Linear SVM': dict(
        model=LinearSVC(penalty='l1', dual=False, max_iter=10000, random_state=random_state),
        params=dict(C=np.linspace(0.01, 0.6, 50)),
    ),
    'Ada Boost': dict(
        model=AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(min_samples_leaf=10),
            random_state=random_state,
        ),
        # `base_estimator__*` keys address parameters of the wrapped decision tree.
        params=dict(
            n_estimators=[100, 250, 500],
            learning_rate=[0.01, 0.1, 1, 10],
            base_estimator__criterion=["gini", "entropy"],
            base_estimator__splitter=["best", "random"],
            base_estimator__max_depth=[1, 2, 4, 8, None],
            base_estimator__min_samples_split=[25, 50, 100],
        ),
    ),
    'Random logistic regression': dict(
        model=RandomLogisticsRegressions(penalty='l1'),
        params=dict(
            C=np.linspace(0.1, 5, 50),
            n_estimators=[100, 250, 500, 1000],
            n_variables=[100, 250, 500, 1000],
        ),
    ),
}

In [None]:
# Wrap every registered model in its own randomized hyper-parameter search (the inner
# cross-validation loop) and store it in randomized_cvs under the model's display name.
randomized_cvs.update(
    (
        model_name,
        RandomizedSearchCV(
            estimator=model_specification['model'],
            param_distributions=model_specification['params'],
            n_iter=n_iter,
            cv=cv_in,
            scoring=scoring,
            iid=False,
            random_state=random_state,
        ),
    )
    for model_name, model_specification in models.items()
)

In [None]:
# Outer cross-validation loop: each randomized search is itself cross-validated on the
# training part, and the per-fold test scores of every metric are reduced to mean and std.
for name, rcv in randomized_cvs.items():
    nested_scores = cross_validate(rcv, X_train, y_train, cv=cv_out, scoring=all_scores_names)
    # cross_validate reports each requested metric under the key 'test_<metric>'.
    estimated_scores[name] = {
        score_name: dict(
            mean=np.mean(nested_scores['test_' + score_name]),
            std=np.std(nested_scores['test_' + score_name]),
        )
        for score_name in all_scores_names
    }
    print('Done for ' + name)

Done for PCA
Done for SPCA
Done for MLCC




Done for PLS


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Done for FDNN
Done for Nearest shrunken centroid
Done for Logistic regression


In [None]:
# Report the nested-CV score summaries collected in estimated_scores for the entries of
# `models`. print_summarized_scores is not defined in this notebook; latex=True presumably
# selects LaTeX-formatted output -- TODO confirm in functions.ipynb.
print_summarized_scores(estimated_scores, models, latex=True)