# Base Leaners - Grid Search

Performing grid search on base learners to find best parameters for each base learner

In [1]:
# import libraries
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings(action='ignore')

# import custom class
from tadpole import Tadpole

# base learners
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, make_scorer

# import visualization library
from matplotlib import pyplot as plt
import seaborn as sns


# set seed (for reproducibility)
np.random.seed(43)

%matplotlib inline
sns.set(style='whitegrid', palette="deep", font_scale=0.7, rc={"figure.figsize": [8, 5]})

### Data Loading + Pre-processing

In [2]:
# initialize class
tp = Tadpole(debug=True)

In [3]:
# load and pre-process tadpole dataset
tp.load()
print("Labels:", tp.label_dict)

loading tadpole dataset
pre-processing dataset
Labels: {0: 'AD', 1: 'CN', 2: 'MCI'}


In [4]:
# train-test split
tp.split()

splitting dataset to train and test datasets


In [5]:
# import variables from class instance (notebook compatibility)
label_dict = tp.label_dict
X, y, X_train, X_test, y_train, y_test = tp.X, tp.y, tp.X_train, tp.X_test, tp.y_train, tp.y_test

In [17]:
# define scoring parameter
scoring = {'BA': make_scorer(balanced_accuracy_score), 
           'AUC': make_scorer(roc_auc_score)}

Helper Functions

In [7]:
def showConfusionMatrix(clf, X, y):
    cm = confusion_matrix(y, clf.predict(X))
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm, cmap='GnBu')
    ax.grid(False)

    # For 3 categories
    ax.xaxis.set(ticks=(0, 1, 2), ticklabels=('Predicted ' + label_dict[0], 'Predicted ' + label_dict[1], 
                                              'Predicted ' + label_dict[2]))
    ax.yaxis.set(ticks=(0, 1, 2), ticklabels=('Actual ' + label_dict[0], 'Actual ' + label_dict[1], 
                                              'Actual ' + label_dict[2]))

    for i in range(3):
        for j in range(3):
            ax.text(j, i, cm[i, j], ha='center', va='center', color='black')
    plt.title('Test Data Confusion Matrix')
    plt.show()
    
def computeMetrics(model, X, y):
    """
    @description:
        function to compute performance metrics
    @arguments:
        model (sklearn) - trained model
        X (np.array) - features
        y (np.array) - target
    """
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)
    BCA = balanced_accuracy_score(y, y_pred)
    mAUC = roc_auc_score(y, y_prob, multi_class="ovr", average="macro")
    return BCA, mAUC

def report(model):
    """
    @description:
        function to report trained model performance
    @arguments:
        model (sklearn) - trained model
    """
    score_train = model.score(X_train, y_train)
    score_test = model.score(X_test, y_test)
    BCA_train, mAUC_train = computeMetrics(model, X_train, y_train)
    BCA_test, mAUC_test = computeMetrics(model, X_test, y_test)
    print("========Model Evaluation Report========")
    print("Train Score: ", score_train)
    print("Test Score: ", score_test)
    print("=======================================")
    print("Train BCA: {:.3f} | Train AUC: {:.3f}".format(BCA_train, mAUC_train))
    print("Test BCA: {:.3f} | Test AUC: {:.3f}".format(BCA_test, mAUC_test))
    print("=======================================")
    showConfusionMatrix(model, X_test, y_test)
    return [score_train, score_test, BCA_train, BCA_test, mAUC_train, mAUC_test]

def evaluate(baselearners):
    """
    @description:
        function to evaluate base learners
    @arguments:
        baselearners (sklearn) - list of trained models
    """
    metrics = []
    for name, model in baselearners.items():
        print(name)
        results = report(model)
        metrics.append([name] + results)
    return metrics

### Grid Search: Model Training

In [13]:
# define search grids
lra_param_grid = {'penalty'            : ['l1', 'l2'],
                  'dual'               : [True, False],
                  'tol'                : [1e-4, 1e-3, 1e-2, 0.1, 1],
                  'C'                  : np.logspace(-4, 4, 20),
                  'class_weight'       : [None, 'balanced'],
                  'solver'             : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

dtc_param_grid = {'criterion'          : ['gini', 'entropy'], 
                  'splitter'           : ['best', 'random'], 
                  'max_depth'          : [6, 7, 8, 9, 10], 
                  'min_samples_split'  : np.linspace(0.1, 2.0, 20, endpoint=True), 
                  'min_samples_leaf'   : np.linspace(0.1, 1, 10, endpoint=True), 
                  'max_features'       : ['auto', 'sqrt', 'log2'], 
                  'class_weight'       : [None, 'balanced']}

mlp_param_grid = {'hidden_layer_sizes' : [(50,50,50), (50,100,50), (100,)],
                  'activation'         : ['tanh', 'relu'],
                  'solver'             : ['sgd', 'adam'],
                  'alpha'              : [0.0001, 0.05], 
                  'learning_rate'      : ['constant','adaptive']}

svm_param_grid = {'kernel'             : ['rbf', 'linear'], 
                  'gamma'              : [1e-3, 1e-4], 
                  'tol'                : [1e-3, 1e-2, 0.1, 1],
                  'class_weight'       : [None, 'balanced'],
                  'C'                  : [1, 10, 100, 1000]}

#### Logistic Regression (LRA)

In [18]:
# define and train
lra = LogisticRegression(verbose=2, multi_class='auto')
lra_cv = tp.gridsearch(lra_param_grid, lra, scoring)

performing grid search
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: multi_class must be in ('ovo', 'ovr')

#### Decision Trees (DT)

In [16]:
# define and train
dtc = DecisionTreeClassifier()
dtc_cv = tp.gridsearch(dtc_param_grid, dtc, scoring)

performing grid search
Fitting 5 folds for each of 24000 candidates, totalling 120000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: multi_class must be in ('ovo', 'ovr')

#### Multi-layer Perceptron (ANN)

In [None]:
# define and train
mlp = MLPClassifier(verbose=True)
mlp_cv = tp.gridsearch(mlp_param_grid, mlp, scoring)

#### SVM

In [None]:
# define and train
svm = SVC(verbose=True, probability=True)
svm_cv = tp.gridsearch(svm_param_grid, svm, scoring)

### Grid Search: Evaluation

In [None]:
# define dictionary of all baselearners
baselearners = {'LRA': lra_cv.best_estimator_, 'DT': dtc_cv.best_estimator_, 
                'ANN': mlp_cvbest_estimator_, 'SVM': svm_cvbest_estimator_}

In [None]:
# evaluate
metrics = evaluate(baselearners)