Imports and constants

In [1]:
%pylab inline
import itertools
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    accuracy_score,
    auc,
    precision_recall_fscore_support, 
    recall_score,
    make_scorer,
    precision_recall_curve
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
import os
import pandas as pd
from IPython.display import display, HTML
pd.options.display.float_format = '{:,.3f}'.format
pd.options.display.width = 200
pd.options.display.column_space = 36

from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
import seaborn as sns
sns.set(style="ticks")
import matplotlib.gridspec as gridspec
np.set_printoptions(precision=3)
random_state=42

import torch
from torch import nn
import torch.nn.functional as F
torch.manual_seed(0);
from skorch.classifier import NeuralNetClassifier, NeuralNetBinaryClassifier
from skorch.callbacks import EpochScoring, EarlyStopping, Callback


class_names = np.array(["No event", "Met event"])

Populating the interactive namespace from numpy and matplotlib


Helper functions

In [2]:
#Scoring 

# skorch EpochScoring callbacks for ROC AUC and recall; higher values are
# better (lower_is_better=False) and on_train=True requests scoring on the
# training data.
skorch_auc_scoring = EpochScoring(scoring='roc_auc', lower_is_better=False, on_train=True)
skorch_recall_scoring = EpochScoring(scoring='recall', lower_is_better=False, on_train=True)

# Lookup of the callbacks above by metric name.
skorch_scoring = {'roc_auc': skorch_auc_scoring, 'recall' : skorch_recall_scoring}

class Scores():
    """Evaluation metrics of one classifier on one dataset.

    Attributes: ``roc`` (ROC AUC), ``accuracy``, ``precision``, ``recall``,
    ``f1`` and ``cm`` (confusion matrix). When any constructor argument is
    missing, the numeric metrics start at 0 and ``cm`` is None, so the object
    can serve as an accumulator for ``+=`` and ``/=``.
    """

    # Numeric metrics touched by the in-place arithmetic (``cm`` is excluded).
    _NUMERIC_FIELDS = ('roc', 'accuracy', 'precision', 'recall', 'f1')

    def __init__(self, dataset=None, labels=None, clf=None):
        inputs_missing = dataset is None or labels is None or clf is None
        if inputs_missing:
            # Empty accumulator.
            self.roc = 0
            self.accuracy = 0
            self.precision = 0
            self.recall = 0
            self.f1 = 0
            self.cm = None
            return

        # Only the AUC (third element) of the ROC computation is kept.
        self.roc = calculate_roc_score(clf, dataset, labels)[2]

        y_pred = np.array(clf.predict(dataset), dtype=np.int32)
        y_true = np.array(labels, dtype=np.int32)
        prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
        self.accuracy = accuracy_score(y_true, y_pred)
        self.precision = prf[0]
        self.recall = prf[1]
        self.f1 = prf[2]
        self.cm = confusion_matrix(y_true, y_pred)

    def __str__(self):
        template = 'ROC AUC: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\nF1: {:.3f}'
        return template.format(self.roc, self.precision, self.recall, self.f1)

    def __iadd__(self, other):
        """Add the numeric metrics of ``other`` to this object in place."""
        for field in self._NUMERIC_FIELDS:
            setattr(self, field, getattr(self, field) + getattr(other, field))
        return self

    def __itruediv__(self, other):
        """Divide every numeric metric by the scalar ``other`` in place."""
        for field in self._NUMERIC_FIELDS:
            setattr(self, field, getattr(self, field) / other)
        return self
    
def print_scores(train_scores, test_scores, latex=False):
    """Display a two-row table (train / test) of ROC AUC, precision, recall and F1.

    Both arguments must expose ``roc``, ``precision``, ``recall`` and ``f1``
    attributes. When ``latex`` is true the LaTeX form of the table is printed too.
    """
    columns = [("ROC AUC", "roc"), ("Precision", "precision"), ("Recall", "recall"), ("F1", "f1")]
    rows = [
        [getattr(scores, attr) for _, attr in columns]
        for scores in (train_scores, test_scores)
    ]
    frame = pd.DataFrame(rows, columns=[header for header, _ in columns], index=['train', 'test'])
    display(frame)
    if latex:
        print(frame.to_latex())
        
        
def print_summarized_scores(estimated_scores, models, latex=False):
    """Display one row per model with each metric rendered as ``mean $\\pm$ std``.

    Parameters
    ----------
    estimated_scores : dict
        Maps a model name to a dict of metrics; each metric value is itself a
        dict with ``'mean'`` and ``'std'`` entries. Four metrics per model are
        expected, in the order ROC AUC, precision, recall, F1.
    models : dict
        Its keys, in iteration order, label the rows.
        NOTE(review): rows are labelled by position from ``models``, not by the
        keys of ``estimated_scores`` - both must be in the same order.
    latex : bool
        When true, additionally print the LaTeX source of the table.
    """
    # Raw string: '\p' is not a valid escape sequence in a normal literal.
    cell_template = r'{:.3f} $\pm$ {:.3f}'
    summarized_scores = [
        [cell_template.format(score['mean'], score['std']) for score in scores.values()]
        for scores in estimated_scores.values()
    ]

    frame = pd.DataFrame(summarized_scores, columns=["ROC AUC", "Precision", "Recall", "F1"])
    frame = frame.rename(dict(enumerate(models.keys())), axis='index')
    display(frame)
    if latex:
        print(frame.to_latex())

In [3]:
def load_file(name):
    """Read the comma-separated file ``name`` into a NumPy array, skipping its first (header) line."""
    return np.genfromtxt(fname=name, skip_header=1, delimiter=",")


def load_train_and_test_parts():
    """Load the pre-split microarray data from the ``data/`` directory.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``, each element
    read with ``load_file``.
    """
    part_paths = (
        "data/microarray_train.csv",
        "data/microarray_test.csv",
        "data/labels_train.csv",
        "data/labels_test.csv",
    )
    return tuple(load_file(path) for path in part_paths)


def flatten(nested_list):
    """Concatenate the items of every sub-iterable of ``nested_list`` into one flat list (one level deep)."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat

In [4]:
# General

def fit_models(
    train_set, train_labels, test_set, test_labels, plot_logit_weigths=False
):
    """Fit an L1 logistic regression and a random forest, plotting train/test ROC for each.

    When ``plot_logit_weigths`` is true the logistic-regression coefficients
    are plotted as well. Returns ``(clf_logit, clf_forest)``.
    """
    splits = (train_set, train_labels, test_set, test_labels)

    logit = LogisticRegression(solver="liblinear", penalty="l1", C=0.3)
    clf_logit = fit_clf(logit, *splits, "Logistics regression")
    if plot_logit_weigths:
        plot_logit_weights(clf_logit, "Logistics regression coefficients")

    forest = RandomForestClassifier(max_depth=4, n_estimators=2000, min_samples_leaf=10)
    clf_forest = fit_clf(forest, *splits, "Random forest")

    return (clf_logit, clf_forest)

def fit_clf_print_scores(clf, train_set, train_labels, test_set, test_labels):
    """Fit ``clf``, score it on both splits, display the score table and return the classifier."""
    fitted_and_scores = fit_clf_scores(clf, train_set, train_labels, test_set, test_labels)
    return print_after_fit(*fitted_and_scores)

def clf_print_scores(clf, train_set, train_labels, test_set, test_labels):
    """Score ``clf`` on both splits without fitting it, display the score table and return the classifier."""
    clf_and_scores = clf_scores(clf, train_set, train_labels, test_set, test_labels)
    return print_after_fit(*clf_and_scores)

def fit_clf(clf, train_set, train_labels, test_set, test_labels, title):
    """Fit ``clf`` on the training split, plot its train/test ROC curves under ``title`` and return it."""
    fitted = clf.fit(train_set, train_labels)
    plot_clf_roc(fitted, train_set, train_labels, test_set, test_labels, title)
    return fitted

def fit_clf_scores(clf, train_set, train_labels, test_set, test_labels):
    """Fit ``clf`` on the training split and return ``(clf, train_scores, test_scores)``."""
    fitted = clf.fit(train_set, train_labels)
    return clf_scores(fitted, train_set, train_labels, test_set, test_labels)

def clf_scores(clf, train_set, train_labels, test_set, test_labels):
    """Return ``(clf, train_scores, test_scores)``, the latter two being ``Scores`` of ``clf`` on each split."""
    on_train = Scores(train_set, train_labels, clf)
    on_test = Scores(test_set, test_labels, clf)
    return clf, on_train, on_test

def print_after_fit(clf, train_scores, test_scores):
    """Print ``clf.best_params_`` when the attribute exists, display the score table and return ``clf``."""
    has_search_result = hasattr(clf, 'best_params_')
    if has_search_result:
        print(clf.best_params_)
    print_scores(train_scores, test_scores)
    return clf
    
# Cross validation

def fit_clf_with_cross_val(clf, train_set, train_labels, test_set, test_labels, title):
    """Plot cross-validated ROC curves on the training data next to the test ROC curve.

    The left panel shows the per-fold and mean ROC curves produced by
    ``fit_clf_cv``. The right panel shows the ROC curve on the test split of
    ``clf`` fitted on the complete training split. ``clf`` is fitted in place.
    """
    fig, (ax1, ax2) = subplots(
        nrows=1, ncols=2, sharex=True, sharey=True, figsize=(16, 8)
    )
    fig.suptitle(title)
    fit_clf_cv(ax1, clf, train_set, train_labels, title="Train")
    # The cross-validation loop fits the estimator fold by fold, so at this
    # point it only reflects the last fold's training subset. Refit on all
    # training data so the test curve does not depend on that leftover state.
    clf.fit(train_set, train_labels)
    plot_roc_curve(ax2, clf, test_set, test_labels, title="Test")
    plt.show()


def fit_clf_cv(ax, clf, X, y, title):
    """Draw 5-fold stratified cross-validation ROC curves of ``clf`` on ``ax``.

    For every fold ``clf`` itself is refitted (in place) on the fold's
    training part and evaluated on its held-out part. The axes receive one
    faint curve per fold, the chance diagonal, the mean curve with a +/- 1
    standard deviation band, and a title reporting mean +/- std accuracy.
    """
    fpr_grid = np.linspace(0, 1, 100)
    fold_tprs, fold_aucs, fold_accs = [], [], []

    splitter = StratifiedKFold(n_splits=5)
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        fitted = clf.fit(X[train_idx], y[train_idx])
        fold_probas = fitted.predict_proba(X[test_idx])
        fold_acc = fitted.score(X[test_idx], y[test_idx])

        # ROC curve and area under it for this fold (positive class = column 1).
        fpr, tpr, _ = roc_curve(y[test_idx], fold_probas[:, 1])
        # Resample onto the common FPR grid so the folds can be averaged,
        # pinning the curve to start at the origin.
        resampled = interp(fpr_grid, fpr, tpr)
        resampled[0] = 0.0
        fold_tprs.append(resampled)

        fold_auc = auc(fpr, tpr)
        fold_aucs.append(fold_auc)
        fold_accs.append(fold_acc)
        ax.plot(
            fpr, tpr, lw=1, alpha=0.3, label="ROC fold %d (AUC = %0.3f)" % (fold, fold_auc)
        )

    ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)

    # Mean curve, pinned to end at (1, 1).
    mean_tpr = np.mean(fold_tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(fpr_grid, mean_tpr)
    std_auc = np.std(fold_aucs)
    ax.plot(
        fpr_grid,
        mean_tpr,
        color="b",
        label=r"Mean ROC (AUC = %0.3f $\pm$ %0.3f)" % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

    # Band of one standard deviation around the mean, clipped to [0, 1].
    std_tpr = np.std(fold_tprs, axis=0)
    band_upper = np.minimum(mean_tpr + std_tpr, 1)
    band_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(
        fpr_grid,
        band_lower,
        band_upper,
        color="grey",
        alpha=0.2,
        label=r"$\pm$ 1 std. dev.",
    )

    axis_limits = [-0.05, 1.05]
    ax.set_xlim(axis_limits)
    ax.set_ylim(axis_limits)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    accuracy_note = r" ( Accuracy = %0.3f $\pm$ %0.3f )" % (np.mean(fold_accs), np.std(fold_accs))
    ax.set_title(title + accuracy_note)
    ax.legend(loc="lower right")

# ROC


def calculate_roc_score(clf, dataset, labels):
    """Return ``(fpr, tpr, roc_auc)`` of ``clf`` on ``dataset`` against ``labels``.

    Scores come from ``predict_proba`` when the classifier has it (column 1 of
    a multi-dimensional result), otherwise from ``decision_function``.
    """
    if not hasattr(clf, 'predict_proba'):
        scores = clf.decision_function(dataset)
    else:
        scores = clf.predict_proba(dataset)
        if len(scores.shape) > 1:
            scores = scores[:, 1]

    fpr, tpr, _ = roc_curve(labels, scores)
    return fpr, tpr, auc(fpr, tpr)

def plot_roc_curve(ax, clf, dataset, labels, title):
    """Compute the ROC curve of ``clf`` on ``dataset`` and draw it on ``ax`` under ``title``."""
    roc_parts = calculate_roc_score(clf, dataset, labels)
    plot_roc(ax, *roc_parts, title)
    
def plot_roc(ax, fpr, tpr, roc_auc, title):
    """Draw the chance diagonal and the given ROC curve on ``ax``, then label the axes and add a legend."""
    chance_style = dict(linestyle="--", lw=2, color="r", label="Chance", alpha=0.8)
    ax.plot([0, 1], [0, 1], **chance_style)

    curve_label = "ROC curve (AUC = {:.3f})".format(roc_auc)
    ax.plot(fpr, tpr, label=curve_label)

    ax.set(xlabel="False Positive Rate", ylabel="True Positive Rate", title=title)
    ax.legend(loc="lower right")


def plot_clf_roc(clf, train_set, train_labels, test_set, test_labels, title):
    """Show a two-panel figure with the ROC curve of ``clf`` on the train (left) and test (right) split."""
    fig, axes = subplots(
        nrows=1, ncols=2, sharex=True, sharey=True, figsize=(12, 6)
    )
    ax_train, ax_test = axes
    fig.suptitle(title)
    plot_roc_curve(ax_train, clf, train_set, train_labels, title="Train")
    plot_roc_curve(ax_test, clf, test_set, test_labels, title="Test")
    plt.show()

# Logit weights

def plot_logit_weights(clf_logit, title):
    """Plot the first row of ``clf_logit.coef_`` against the coefficient index in a new figure."""
    coefficients = clf_logit.coef_
    positions = np.arange(coefficients.shape[1])

    plt.figure()
    plt.title(title)
    plt.plot(positions, coefficients[0])
    plt.show()


def plot_logit_weights_ax(ax, clf_logit, title):
    """Draw the first row of ``clf_logit.coef_`` against the coefficient index on ``ax``.

    The data is passed to ``sns.lineplot`` by keyword: positional ``x``/``y``
    arguments are deprecated in seaborn 0.11 and rejected from 0.12 on.
    """
    ax.set_title(title)
    coefficients = clf_logit.coef_
    sns.lineplot(x=np.arange(coefficients.shape[1]), y=coefficients[0], ax=ax)

In [5]:
# Identity transformer: the wrapped function returns its input unchanged, so
# it can fill a pipeline step without altering the data.
stubTransformer = FunctionTransformer(lambda x : x)

In [None]:
#Adjusting threshold

def adjusted_classes(scores, t):
    """Binarise ``scores`` at threshold ``t``: 1 where the score is >= ``t``, otherwise 0."""
    classes = []
    for score in scores:
        classes.append(1 if score >= t else 0)
    return classes

def plot_precision_recall_vs_threshold(ax, labels, scores, clf_name, requsted=(0.9, 0.8, 0.7)):
    """Plot precision and recall against the decision threshold and report operating points.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    labels, scores : true labels and classifier scores, passed to
        ``precision_recall_curve``.
    clf_name : name used in the printed header and the plot title.
    requsted : iterable of recall targets. For each one the last threshold
        whose recall is still at least the target is printed with its recall
        and precision. A target no threshold can satisfy is reported
        explicitly instead of silently wrapping around to the last entry.
    """
    precisions, recalls, thresholds = precision_recall_curve(labels, scores)
    print(clf_name + ':')

    # precisions/recalls carry one trailing entry without a matching threshold
    # (they are sliced with [:-1] for plotting below); drop it so that every
    # index found here is valid for all three arrays.
    recalls_at_thresholds = recalls[:-1]
    for thresh in requsted:
        satisfying = np.flatnonzero(recalls_at_thresholds >= thresh)
        if satisfying.size == 0:
            print('No threshold reaches recall {:.3f}'.format(thresh))
            continue
        index = satisfying[-1]
        print('For threshold {:.3f} recall is {:.3f} and precision is {:.3f}'.format(thresholds[index], recalls[index], precisions[index]))

    ax.set_title('Precision and recall for ' + clf_name)
    ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
    ax.plot(thresholds, recalls[:-1], "g-", label="Recall")
    ax.set_ylabel("Score")
    ax.set_xlabel("Decision Threshold")
    ax.legend(loc='best')

MLCC functions

In [6]:
def get_mlcc_result_files():
    """Return the names of all ``.RData`` files found in ``./mlcc_results/``."""
    return [
        entry
        for entry in os.listdir('./mlcc_results/')
        if entry.endswith('.RData')
    ]
    


class MLCCWrapper(BaseEstimator, TransformerMixin):
    """Dimensionality reduction driven by a precomputed MLCC clustering result.

    The clustering is read from an ``.RData`` file in ``./mlcc_results/``.
    Each cluster of columns is reduced with its own PCA, and the per-cluster
    components are concatenated.

    Parameters
    ----------
    size : int
        Forwarded to ``read_mlcc_result`` as ``train_size``; the length of each
        stored basis is integer-divided by it to obtain the cluster
        dimensionality.
    filename : str
        Name of the result file inside ``./mlcc_results/``.

    Attributes set by ``fit``
    -------------------------
    segmentation : zero-based cluster index for every column.
    dimensionalities : number of PCA components for every cluster.
    PCAs : one entry per cluster, either a fitted ``PCA`` or ``None`` for a
        cluster that was skipped because it has fewer columns than components.
    mBIC, mlcc_result : values read from the result file.
    """

    def __init__(self, size=726, filename=''):
        self.size = size
        self.filename = filename
        self.PCAs = []

    def fit(self, X, y=None):
        """Load the clustering result and fit one PCA per cluster of columns of ``X``."""
        self.segmentation, self.dimensionalities = self.get_result()
        numb_clust = self.dimensionalities.shape[0]

        # Rebuild the list on every fit; appending to the previous one would
        # keep stale models around when the transformer is fitted again.
        self.PCAs = []
        for i in range(numb_clust):
            cluster = X[:, self.segmentation == i]
            n_components = self.dimensionalities[i]
            if cluster.shape[1] < n_components:  # TODO - maybe mlcc shouldn't allow it
                print(
                    "WARNING! Dimensionality of a cluster was greater than the number of variables. Ignoring this cluster."
                )
                # Placeholder keeps PCAs[i] aligned with cluster i in transform().
                self.PCAs.append(None)
            else:
                self.PCAs.append(PCA(n_components=n_components).fit(cluster))

        return self

    def transform(self, X):
        """Project every retained cluster of ``X`` with its PCA and concatenate the results column-wise."""
        # The leading empty block reproduces the result dtype of the original
        # incremental concatenation, also when no cluster is retained.
        blocks = [np.empty((X.shape[0], 0))]
        for i, pca in enumerate(self.PCAs):
            if pca is None:
                continue  # cluster skipped during fit
            blocks.append(pca.transform(X[:, self.segmentation == i]))
        return np.concatenate(blocks, axis=1)

    def get_result(self):
        """Read ``self.filename`` and return ``(segmentation, dimensionalities)``; also stores ``mlcc_result`` and ``mBIC``."""
        self.mlcc_result = tuple(self.read_mlcc_result(self.filename, self.size))

        self.mBIC = self.mlcc_result[1][0]

        segmentation = self.mlcc_result[0]
        dimensionalities = self.mlcc_result[2]
        return segmentation, dimensionalities

    def summary(self, best_index=None):
        """Scatter-plot the mBIC of every result file and print which file has the highest one.

        When ``best_index`` is given that point is drawn in red, and the
        fitted segmentation and dimensionalities are printed if available.
        """
        filenames = get_mlcc_result_files()
        print(filenames)
        if not filenames:
            # np.argmax would raise on an empty list.
            print('No MLCC result files found.')
            return

        mlcc_results = [
            tuple(self.read_mlcc_result(filename, self.size)) for filename in filenames
        ]
        mBICs = [result[1][0] for result in mlcc_results]
        max_index = np.argmax(mBICs)
        print('Highest mBIC is from {}'.format(filenames[max_index]))

        figure()
        positions = np.arange(len(mBICs))
        if best_index is not None:
            scatter(positions[0:best_index], mBICs[0:best_index], c='b')
            scatter(positions[best_index], mBICs[best_index], c='r')
            # Slicing past the end yields an empty selection, so no bounds check is needed.
            scatter(positions[(best_index+1):], mBICs[(best_index+1):], c='b')
            if hasattr(self, 'segmentation') and hasattr(self, 'dimensionalities'):
                print(self.segmentation, self.dimensionalities)
        else:
            scatter(positions, mBICs)
        show()

    def read_mlcc_result(self, filename, train_size):
        """Load ``./mlcc_results/<filename>`` through R and return ``(segmentation, mBIC, dimensionalities)``.

        The R object ``res`` is unpacked into three parts. The segmentation is
        shifted to zero-based indices.
        NOTE(review): the dimensionality of a cluster is taken as
        ``len(basis) // train_size``, which presumably assumes every basis is a
        flattened matrix with ``train_size`` rows - confirm against the R side.
        """
        robjects.r["load"]("./mlcc_results/{}".format(filename))
        s, m, b = robjects.r["res"]
        segmentation = np.asarray(s)
        numb_clust = np.max(s)
        mBIC = np.asarray(m)
        # Name the bases "0".."numb_clust-1" so they can be looked up by cluster index.
        b.names = robjects.r("0:{}".format(numb_clust - 1))
        bases = dict(zip(b.names, map(list, list(b))))
        dimensionalities = np.empty(numb_clust, dtype=np.int32)
        for i in range(numb_clust):
            dimensionalities[i] = len(bases[str(i)]) // train_size
        return segmentation - 1, mBIC, dimensionalities

    def apply_mlcc_dim_reduction(self, X, segmentation, dimensionalities):
        """Fit-and-transform a fresh PCA on every cluster of ``X`` and concatenate the results.

        Unlike ``fit``/``transform`` this keeps no state on the instance.
        Clusters with fewer columns than requested components are skipped
        with a warning.
        """
        numb_clust = dimensionalities.shape[0]
        X_reduced = np.empty((X.shape[0], 0))
        for i in range(numb_clust):
            cluster = X[:, segmentation == i]
            n_components = dimensionalities[i]
            if cluster.shape[1] < n_components:  # TODO - maybe mlcc shouldn't allow it
                print(
                    "WARNING! Dimensionality of a cluster was greater than the number of variables. Ignoring this cluster."
                )
            else:
                X_reduced = np.concatenate(
                    (X_reduced, PCA(n_components=n_components).fit_transform(cluster)),
                    axis=1,
                )
        return X_reduced

Random Logistic Regression (like Random Forest)

In [7]:
class RandomLogisticsRegressions(BaseEstimator, ClassifierMixin):
    """Ensemble of logistic regressions, each trained on a random subset of the features.

    Parameters
    ----------
    n_estimators : int
        Number of logistic-regression models.
    penalty, tol, C, solver
        Forwarded to every ``LogisticRegression``.
    n_variables : int
        Number of features drawn (without replacement) for each model. Values
        larger than the number of available features are capped to it.
    random_state : int or None
        Seed of the feature sampling. ``None`` (default) draws from NumPy's
        global random state, exactly as before this parameter existed.

    Attributes set by ``fit``
    -------------------------
    estimators_ : list of fitted ``LogisticRegression`` models.
    indices : array of shape (n_estimators, n_selected) with the feature
        indices used by each model.
    classes_ : sorted unique labels seen during ``fit``.
    """

    def __init__(
        self,
        n_estimators=11,
        penalty="l2",
        tol=1e-4,
        C=1.0,
        solver="liblinear",
        n_variables=1000,
        random_state=None
    ):
        self.penalty = penalty
        self.tol = tol
        self.C = C
        self.solver = solver

        self.n_estimators = n_estimators
        self.n_variables = n_variables
        self.random_state = random_state

        self.estimators_ = []
        self.indices = []

    def fit(self, X, y):
        """Draw a feature subset for every model and fit it on those columns of ``X``."""
        # Unseeded: keep using the global NumPy stream; seeded: use a private
        # generator so results are reproducible.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        n_features = X.shape[1]
        # Sampling without replacement cannot draw more columns than exist.
        n_selected = min(self.n_variables, n_features)

        self.classes_ = np.unique(y)
        self.estimators_ = [
            LogisticRegression(penalty=self.penalty, tol=self.tol, C=self.C, solver=self.solver)
            for _ in range(self.n_estimators)
        ]
        self.indices = np.array(
            [
                rng.choice(np.arange(n_features), size=n_selected, replace=False)
                for _ in range(self.n_estimators)
            ]
        )
        for model, columns in zip(self.estimators_, self.indices):
            model.fit(X[:, columns], y)
        return self

    def predict(self, X):
        """Return the rounded mean of the models' predicted labels (labels are averaged numerically)."""
        models_predictions = np.array(
            [
                model.predict(X[:, self.indices[i, :]])
                for i, model in enumerate(self.estimators_)
            ]
        )
        mean_predictions = np.mean(models_predictions, axis=0)
        return np.round(mean_predictions)

    def predict_proba(self, X):
        """Return the class probabilities averaged over all models."""
        models_probs = np.array(
            [
                model.predict_proba(X[:, self.indices[i, :]])
                for i, model in enumerate(self.estimators_)
            ]
        )
        return np.mean(models_probs, axis=0)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

In [8]:
#only for RLR and RandomForest

class RepresentationTransformer(BaseEstimator, TransformerMixin):
    """Turn the per-model outputs of an ensemble into a new feature representation.

    The wrapped ensemble must expose ``estimators_`` after fitting and accept
    an ``n_estimators`` parameter. Supported are ``RandomForestClassifier``
    (every tree sees all columns) and ensembles exposing a 2-D ``indices``
    attribute with the columns used by each member.

    Parameters
    ----------
    transformer : ensemble estimator to wrap.
    n_components : int
        Number of ensemble members, i.e. the number of output features.
    probabilistic : bool
        If true, each feature is the member's probability of class 1;
        otherwise it is the member's predicted label.
    """

    def __init__(self, transformer, n_components = 2, probabilistic=True):
        self.transformer = transformer
        self.probabilistic = probabilistic
        self.n_components = n_components

    def fit(self, X, y):
        """Fit a fresh clone of the wrapped ensemble with ``n_components`` members."""
        # Clone first so the estimator object supplied by the caller is not
        # modified; the clone also resets any state left by a previous fit.
        self.transformer = clone(self.transformer)
        self.transformer.set_params(**{'n_estimators' : self.n_components})
        self.transformer.fit(X,y)
        return self

    def transform(self, X):
        """Return the probabilistic or hard-label representation of ``X``, depending on ``probabilistic``."""
        if self.probabilistic:
            return self.get_probabilistic_representation(X)
        else:
            return self.get_representation(X)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the representation of ``X``."""
        self.fit(X,y)
        return self.transform(X)

    def get_representation(self, data):
        """Return an int32 array with one column of predicted labels per ensemble member."""
        result = np.zeros((data.shape[0], len(self.transformer.estimators_)), dtype=np.int32)
        for i, estimator in enumerate(self.transformer.estimators_):
            result[:, i] = estimator.predict(data[:, self.get_indices(i, data)])
        return result

    def get_probabilistic_representation(self, data):
        """Return a float32 array with one column of class-1 probabilities per ensemble member."""
        result = np.zeros((data.shape[0], len(self.transformer.estimators_)), dtype=np.float32)
        for i, estimator in enumerate(self.transformer.estimators_):
            result[:, i] = estimator.predict_proba(data[:, self.get_indices(i, data)])[:, 1]
        return result

    def get_indices(self, i, data):
        """Return the column indices of ``data`` that ensemble member ``i`` was trained on.

        Raises ``NotImplementedError`` for ensembles that are neither a random
        forest nor expose a usable ``indices`` attribute.
        """
        if isinstance(self.transformer, RandomForestClassifier):
            return np.arange(data.shape[1])
        try:
            # assuming RLR type (hack because isinstance is not working here properly due to different modules)
            return self.transformer.indices[i, :]
        except (AttributeError, IndexError, TypeError) as err:
            # Only the failures that signal "no per-member indices"; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            raise NotImplementedError(
                "For this type it is not yet implemented: " +  str(type(self.transformer))
            ) from err

In [None]:
# SAM transformer

def get_gene_names():
    """Return the index column (first column) of ``data/microarray_data.csv`` as a list of gene names."""
    microarray = pd.read_csv('data/microarray_data.csv', delimiter=",", index_col=0)
    return list(microarray.index.values)

def get_sam_result_files():
    """Return the names of all ``.csv`` files found in ``./sam_results/``."""
    return [
        entry
        for entry in os.listdir('./sam_results/')
        if entry.endswith('.csv')
    ]
    

class SAMSelection(BaseEstimator, TransformerMixin):
    """Select the columns of X that correspond to genes listed in a SAM result file.

    Parameters
    ----------
    filename : str
        Name of a file in ``./sam_results/``. Its first line is skipped and
        every following entry is read as a gene name (double quotes removed).

    The full list of gene names is read once at construction time via
    ``get_gene_names()``; the position of a gene in that list is used as its
    column index in X.
    """

    def __init__(self, filename=''):
        self.filename = filename
        self.gene_names = get_gene_names()

    def fit(self, X, y=None):
        """Read the significant genes and resolve them to column indices.

        Raises ``ValueError`` when a listed gene is not among the known gene names.
        """
        # Builtin ``str`` instead of the ``np.str`` alias removed from recent NumPy.
        raw_names = np.genfromtxt("./sam_results/{}".format(self.filename), skip_header=1, delimiter=',', dtype=str)
        # A file with a single gene is parsed as a 0-d array; make it iterable.
        significant_genes = [name.replace('"', '') for name in np.atleast_1d(raw_names)]

        # First-occurrence lookup table (same result as list.index, without
        # rescanning the whole list for every gene).
        first_position = {}
        for position, name in enumerate(self.gene_names):
            first_position.setdefault(name, position)

        unknown = [name for name in significant_genes if name not in first_position]
        if unknown:
            raise ValueError("Genes not found in the gene name list: {}".format(unknown))

        self.significant_genes_indices = [first_position[name] for name in significant_genes]
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``."""
        return X[:, self.significant_genes_indices]

    def fit_transform(self, X, y=None):
        """Fit and return the selected columns of ``X``."""
        self.fit(X,y)
        return self.transform(X)
    

In [9]:
# Neural net

class NeuralNetClassifierWrapper(NeuralNetClassifier):
    """``NeuralNetClassifier`` whose ``fit`` converts the targets to an int64 array first."""

    def fit(self, X, y):
        """Cast ``y`` to a NumPy int64 array and delegate to the parent ``fit``."""
        int_targets = np.array(y, dtype=np.int64)
        return super().fit(X, int_targets)

class ClassifierModule(nn.Module):
    """Two-hidden-layer feed-forward network producing probabilities for two classes.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear -> softmax.

    Parameters
    ----------
    num_hidden0, num_hidden1 : int
        Sizes of the first and second hidden layer.
    dropout : float
        Dropout probability applied after the first hidden layer.
    num_inputs : int or None
        Number of input features. When given, the first layer is created in
        the constructor like every other layer. When ``None`` (default, the
        original behaviour) it is created on the first forward pass from the
        last dimension of the input.
        Caveat of the lazy mode: the first layer's weights do not exist until
        the first forward pass, so anything that collected
        ``module.parameters()`` earlier (for example an optimizer) does not
        include them. Pass ``num_inputs`` to avoid this.
    """

    def __init__(
            self,
            num_hidden0=10,
            num_hidden1=10,
            dropout=0.0,
            num_inputs=None
    ):
        super().__init__()
        self.num_hidden0 = num_hidden0
        self.num_hidden1 = num_hidden1
        self.num_inputs = num_inputs

        if num_inputs is None:
            # Input width unknown: defer creation of the first layer to forward().
            self.input_initialized = False
            self.dense0 = None
        else:
            self.input_initialized = True
            self.dense0 = nn.Linear(num_inputs, num_hidden0)
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_hidden0, num_hidden1)
        self.output = nn.Linear(num_hidden1, 2)

    def forward(self, X, **kwargs):
        """Return class probabilities of shape ``(..., 2)`` for input ``X``."""
        if not self.input_initialized:
            # Create the layer on the same device as the data it will process.
            self.dense0 = nn.Linear(X.shape[-1], self.num_hidden0).to(X.device)
            self.input_initialized = True

        X = F.relu(self.dense0(X))
        X = self.dropout(X)
        X = F.relu(self.dense1(X))
        X = F.softmax(self.output(X), dim=-1)
        return X

In [10]:
import pywt


# Wrapper for SPC from easyspc package - function transform doesn't use k parameter anymore (n_components instead), renamed number
# of components to n_components, and changed default of threshold_val, got rid of prints

class SPCWrapper(BaseEstimator):
    """Sparse-principal-component style reducer adapted from the easyspc package.

    ``fit`` derives ``n_components`` thresholded, row-normalised vectors
    (``self.V``) from the SVD of the data; ``transform`` multiplies the data
    by ``V.T @ V``.
    """
    def __init__(self, n_components=2, max_iter=50, threshold_val=10, threshold_mode='soft'):
        """Store the hyper-parameters.

        n_components: number of rows of U / V that are used.
        max_iter: number of passes of the loop in ``fit``.
        threshold_val, threshold_mode: value and ``mode`` passed to ``pywt.threshold``.
        """
    
        self.n_components=n_components
        self.max_iter=max_iter
        self.threshold_val=threshold_val
        self.threshold_mode = threshold_mode
    
    def fit(self, X_, y=None):
        """Compute ``self.V`` (and ``self.U``, ``self.W``) from ``X_``; ``y`` is ignored. Returns self."""
        
        U, s, V = np.linalg.svd(X_, full_matrices=True)  
        # NOTE(review): ``cnt`` and ``s`` are never used afterwards.
        cnt = 0
        self.U = U
        self.W=V.T
        # Scale a vector to unit Euclidean norm; zero vectors are returned unchanged.
        def normalize(vector):
            norm=np.linalg.norm(vector)
            if norm>0:
                return vector/norm
            else:
                return vector
            
        # NOTE(review): every pass computes self.V from the local SVD factor
        # ``U`` (its first n_components rows), not from the updated ``self.U``,
        # so all passes produce the same self.V - confirm whether ``self.U``
        # was intended here.
        # NOTE(review): with max_iter == 0 the loop never assigns self.V and
        # the normalisation after the loop raises AttributeError.
        for i in range(self.max_iter):
            self.V = pywt.threshold(np.dot(U[:self.n_components],X_), self.threshold_val, mode=self.threshold_mode)
            self.U = np.dot(self.V,X_.T)
            self.U = np.array([normalize(u_i) for u_i in self.U])
        # Normalise every row of V to unit length.
        self.V = np.array([normalize(v_i) for v_i in self.V])
        return self
    
    def transform(self, X_):
        """Return ``X_ @ (V_k.T @ V_k)`` with ``V_k`` the first ``n_components`` rows of ``self.V``.

        The result has the same number of columns as ``X_``.
        """
        X_reduced_spca = np.dot(X_, np.dot(self.V[:self.n_components].T, self.V[:self.n_components]))
        return X_reduced_spca
    

In [11]:
class PLSRegressionWrapper(PLSRegression):
    """``PLSRegression`` whose ``fit_transform`` returns only the result of ``transform(X)``."""

    def transform(self, X):
        """Transform ``X`` only, delegating to the parent implementation."""
        return super().transform(X)

    def fit_transform(self, X, Y):
        """Fit on ``(X, Y)`` and return the transformed ``X``."""
        fitted = self.fit(X, Y)
        return fitted.transform(X)
    
class NearestCentroidWrapper(NearestCentroid):
    """``NearestCentroid`` with a ``decision_function`` so score-based helpers can use it."""

    def decision_function(self, X):
        """Return the predicted labels of ``X`` as decision scores."""
        predicted_labels = self.predict(X)
        return predicted_labels

In [12]:
# Names of the pipeline steps. DIM_REDUCTION and CLASSIFIER also serve as the
# '<step>__' prefixes of the hyper-parameter search keys.
DIM_REDUCTION = 'dim'
CLASSIFIER='clf'
STANDARDIZER='std'

def get_dim_reduction_pipeline(dim_reduction, dim_reduction_params, standardizer=StandardScaler(), 
                               clf=LogisticRegression(random_state=random_state),
                              clf_params={'C' : np.linspace(0.1, 1, 50), 'penalty' : ['l1'], 'solver' : ['liblinear']}):
    """Build a standardise -> reduce -> classify pipeline and its search space.

    Parameters
    ----------
    dim_reduction : transformer used for the dimensionality-reduction step.
    dim_reduction_params : dict of candidate values for that transformer.
    standardizer : transformer used for the first step (previously this
        argument was accepted but a new ``StandardScaler`` was always used).
    clf : classifier for the final step.
    clf_params : dict of candidate values for the classifier.

    Returns
    -------
    (pipeline, param_grid), where ``param_grid`` has the parameter names
    prefixed with the respective step name.

    Note: the default ``standardizer``, ``clf`` and ``clf_params`` objects are
    created once at definition time and shared by every call relying on them.
    """
    param_grid = transform_params(dim_reduction_params, clf_params)
    steps = [(STANDARDIZER, standardizer), (DIM_REDUCTION, dim_reduction), (CLASSIFIER, clf)]
    pipeline = Pipeline(steps=steps)
    return pipeline, param_grid


def fit_dim_reduction_pipeline(dim_reduction, dim_reduction_params, X, y, standardizer=StandardScaler(), 
                               clf=LogisticRegression(random_state=random_state),
                              clf_params={'C' : np.linspace(0.1, 1, 50), 'penalty' : ['l1'], 'solver' : ['liblinear']}, 
                               n_iter=50, cv=4, scoring='roc_auc', n_jobs=3):
    """Run a randomized hyper-parameter search over a dimensionality-reduction pipeline.

    The pipeline and its search space come from ``get_dim_reduction_pipeline``.
    ``n_iter``, ``cv``, ``scoring`` and ``n_jobs`` are forwarded to
    ``RandomizedSearchCV``, which is seeded with the module-level
    ``random_state``.

    Returns the fitted ``RandomizedSearchCV`` object.
    """
    pipeline, param_grid = get_dim_reduction_pipeline(dim_reduction, dim_reduction_params, standardizer, clf, clf_params)

    # fortunately, according to https://stackoverflow.com/questions/14955458/does-gridsearchcv-use-predict-or-predict-proba-when-using-auc-score-as-score-fu,
    # scoring='roc_auc implies using predict_proba'
    # The former ``iid=False`` argument is omitted: scikit-learn dropped the
    # parameter, and passing it raises a TypeError on current releases.
    search = RandomizedSearchCV(pipeline, cv=cv, scoring=scoring, n_iter=n_iter, n_jobs=n_jobs, random_state=random_state,
                                param_distributions=param_grid)

    search.fit(X, y)

    return search


def transform_params(dim_reduction_params, clf_params):
    """Merge both parameter dicts into one, prefixing every key with ``'<step name>__'``.

    Keys of ``dim_reduction_params`` get the DIM_REDUCTION prefix and come
    first; keys of ``clf_params`` get the CLASSIFIER prefix.
    """
    prefixed = {}
    for step_name, step_params in ((DIM_REDUCTION, dim_reduction_params), (CLASSIFIER, clf_params)):
        prefixed.update({step_name + '__' + key: value for key, value in step_params.items()})
    return prefixed
    
    

In [13]:
def boxplot_hist(dataset, column_idx, ax_box, ax_hist):
    """Draw a box plot and a distribution plot of one column on two x-linked axes.

    Parameters
    ----------
    dataset : DataFrame holding the column.
    column_idx : positional index of the column in ``dataset.columns``.
    ax_box : axes receiving the box plot (ticks and x label removed).
    ax_hist : axes receiving the distribution plot, labelled
        ``'<column_idx> - <column name>'``.
    """
    column_name = dataset.columns[column_idx]
    values = dataset[column_name]

    sns.despine(ax=ax_box, left=True)
    # Pass the series explicitly as ``x``: newer seaborn treats a positional
    # first argument as ``data``, which changes how the box is drawn.
    sns.boxplot(x=values, ax=ax_box)
    ax_box.set(yticks=[])
    ax_box.set(xticks=[])
    ax_box.set(xlabel='')
    # NOTE(review): joining through get_shared_x_axes() is reportedly deprecated
    # in newer Matplotlib releases - verify against the installed version.
    ax_hist.get_shared_x_axes().join(ax_box, ax_hist)
    sns.despine(ax=ax_hist)
    # NOTE(review): distplot is deprecated in recent seaborn in favour of
    # histplot/displot; kept because the replacement renders differently.
    sns.distplot(values, ax=ax_hist)
    ax_hist.set(xlabel='{} - {}'.format(column_idx, column_name))

Useful functions that may be used later

In [14]:
def plot_clf_cm(clf, train_set, train_labels, test_set, test_labels):
    """Show the row-normalised train and test confusion matrices of ``clf`` side by side with one colour bar."""
    test_predictions = clf.predict(test_set)
    train_predictions = clf.predict(train_set)
    matrices = {
        "test": confusion_matrix(test_labels, test_predictions),
        "train": confusion_matrix(train_labels, train_predictions),
    }

    fig, (ax_train, ax_test) = subplots(nrows=1, ncols=2, sharex=True, sharey=True)
    plot_confusion_matrix(
        ax_train,
        matrices["train"],
        classes=class_names,
        normalize=True,
        title="Train confusion matrix",
    )
    # The colour bar is built from the image of the test panel.
    test_image = plot_confusion_matrix(
        ax_test,
        matrices["test"],
        classes=class_names,
        normalize=True,
        title="Test confusion matrix",
    )

    # Make room on the right for a dedicated colour-bar axes.
    fig.subplots_adjust(right=0.8)
    colorbar_axes = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(test_image, cax=colorbar_axes)
    show()
    
def plot_confusion_matrix(
    axis, cm, classes, normalize=False, title="Confusion matrix", cmap=cm.Blues
):
    """
    Plot a confusion matrix on ``axis`` and return the created image.

    Parameters
    ----------
    axis : matplotlib axes to draw on.
    cm : square array of counts (rows = true label, columns = predicted label).
    classes : tick labels, one per class.
    normalize : when true, every row is divided by its sum and cells are
        printed with two decimals; otherwise raw counts are shown.
    title : axes title.
    cmap : colour map for the cells.

    The colour scale spans [0, 1] for normalised matrices. For raw counts the
    upper limit follows the data (it used to be fixed at 1, which rendered
    every non-zero cell in the darkest colour).
    """
    if normalize:
        cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    # vmax=None lets matplotlib scale the colours to the largest count.
    upper_limit = 1 if normalize else None
    im = axis.imshow(cm, interpolation="nearest", cmap=cmap, vmin=0, vmax=upper_limit)
    axis.set(title=title, xlabel="Predicted label", ylabel="True label")
    tick_marks = np.arange(len(classes))
    axis.set_xticks(tick_marks)
    axis.set_xticklabels(classes)
    axis.set_yticks(tick_marks)
    axis.set_yticklabels(classes)

    fmt = ".2f" if normalize else "d"
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        axis.text(
            j,
            i,
            format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black",
        )

    return im