In [67]:
def _positive_class_scores(estimator, X):
    """Return ``(scores, is_probability)`` for the positive class of a fitted binary classifier.

    Preference order: ``predict_proba`` (second column, i.e. ``classes_[1]``),
    then ``decision_function``, then hard ``predict`` labels as a last resort.
    ``is_probability`` is True only when the scores come from ``predict_proba``.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(X)[:, 1], True
    if hasattr(estimator, 'decision_function'):
        return estimator.decision_function(X), False
    return estimator.predict(X), False


def binaryClassification(data, location):
    """Tune, evaluate and persist a panel of binary classifiers.

    The data are split 70/30 into train and test sets (``random_state=42``), the
    features are standardised with statistics learned on the training set only,
    and each model is tuned with a 10-fold ``GridSearchCV``.

    Models fitted (one row each in the returned table, in this order):
        logistic regression : vanilla (C=1e6), lasso (l1), ridge (l2)
        discriminant analysis : linear, quadratic, nearest centroid
        naive Bayes : Gaussian
        nearest neighbours : KNN
        Gaussian process : Gaussian process classifier
        support vector machine : linear, polynomial, RBF kernel
        tree ensembles : random forest, extremely randomised trees, AdaBoost, gradient boosting
        neural network : multilayer perceptron

    Parameters
    ----------
    data : pandas.DataFrame
        Response variable in the first column, features in the remaining columns.
        The metrics are called with their default positive label, so the response
        is expected to be coded 0/1.
    location : str
        Output directory, e.g. ``'C:/Users/jihun/Documents/'``. A trailing
        separator is accepted but no longer required.

    Returns
    -------
    pandas.DataFrame
        One row per model and the columns Accuracy, AUC, Balanced Accuracy, Kappa,
        F1, Recall, Precision, Average Precision and Brier Score, all computed on
        the held-out test set. The Brier score is NaN for models that do not
        expose ``predict_proba``.

    Raises
    ------
    ValueError
        If ``data`` has fewer than two columns (no feature columns).

    Side effects
    ------------
    Writes ``<model name>.pkl`` (the refitted best estimator) for every model and
    ``classification_model_results.csv`` into ``location``.
    """
    # Fail fast on unusable input before paying for the heavy scikit-learn imports.
    if data.shape[1] < 2:
        raise ValueError('data must hold the response in column 0 and at least one feature column')

    # import necessary libraries
    import os
    import re
    import numpy as np
    import pandas as pd
    import sklearn
    try:
        import joblib
    except ImportError:  # very old scikit-learn releases only ship a bundled copy
        from sklearn.externals import joblib
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB
    from sklearn.preprocessing import StandardScaler
    from sklearn.neural_network import MLPClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
    from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
    from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, cohen_kappa_score, f1_score, recall_score, precision_score, average_precision_score, brier_score_loss

    # 'deviance' was renamed to 'log_loss' in scikit-learn 1.1 and later removed,
    # so pick whichever spelling the installed release understands.
    version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    sk_version = tuple(int(part) for part in version_match.groups()) if version_match else (0, 0)
    gb_log_loss = 'log_loss' if sk_version >= (1, 1) else 'deviance'

    # define response and feature
    y = data.iloc[:, 0]  # response variable needs to be the first column
    X = data.iloc[:, 1:]

    # split train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # standardise features using statistics learned on the training set only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Each entry keeps the display name, the estimator and its grid-search space
    # together, so the three can never drift out of alignment.
    model_specs = [
        # Logistic Regression (C=1e6 makes the default penalty negligible)
        ('Logistic Regression',
         LogisticRegression(),
         {'C': [1e6]}),
        # Lasso (l1-penalised) logistic regression
        ('Lasso Regression',
         LogisticRegression(penalty='l1'),
         {'C': np.linspace(0.1, 1, 10),  # inverse of regularization strength; smaller values regularize more
          'solver': ['saga']}),
        # Ridge (l2-penalised) logistic regression
        ('Ridge Regression',
         LogisticRegression(penalty='l2'),
         {'C': np.linspace(0.1, 1, 10),
          'solver': ['saga']}),
        # Linear Discriminant Analysis
        ('LDA',
         LinearDiscriminantAnalysis(),
         {'solver': ['lsqr', 'eigen'],
          'shrinkage': ['auto']}),
        # Quadratic Discriminant Analysis
        ('QDA',
         QuadraticDiscriminantAnalysis(),
         {'reg_param': np.linspace(0, 1, 11)}),  # covariance regularized as (1-reg_param)*Sigma + reg_param*np.eye(n_features)
        # Nearest Centroid
        ('Nearest Centroid',
         NearestCentroid(),
         {'metric': ['euclidean', 'manhattan'],  # distance between instances
          'shrink_threshold': [0.2, 0.4, 0.6, 0.8, 1, None]}),  # threshold for shrinking centroids to remove features
        # Gaussian Naive Bayes
        ('Gaussian Naive Bayes',
         GaussianNB(),
         {'var_smoothing': [1e-9]}),  # portion of the largest feature variance added for numerical stability
        # K Nearest Neighbors
        ('KNN',
         KNeighborsClassifier(),
         {'n_neighbors': [3, 5, 7, 9, 11],
          'weights': ['uniform', 'distance']}),
        # Gaussian Process
        ('Gaussian Process',
         GaussianProcessClassifier(),
         {'optimizer': ['fmin_l_bfgs_b']}),
        # Support Vector Classifier with linear kernel
        ('SVM-Linear',
         SVC(kernel="linear", probability=True),
         {'C': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1]}),  # regularization strength is inversely proportional to C
        # Support Vector Classifier with polynomial kernel
        ('SVM-Polynomial',
         SVC(kernel="poly", probability=True),
         {'C': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1],
          'degree': [1, 2, 3],  # degree of the polynomial kernel function
          'gamma': ['scale', 'auto']}),  # kernel coefficient
        # Support Vector Classifier with RBF kernel
        ('SVM-RBF',
         SVC(kernel="rbf", probability=True),
         {'C': [0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1],
          'gamma': ['scale', 'auto']}),
        # Random Forest ('sqrt' is what the removed 'auto' alias meant for classifiers)
        ('Random Forest',
         RandomForestClassifier(),
         {'n_estimators': [500, 1000, 1500],
          'max_features': ['sqrt', 'log2'],
          'criterion': ['gini', 'entropy']}),
        # Extremely Randomized Trees
        ('Extreme Random Forest',
         ExtraTreesClassifier(),
         {'n_estimators': [500, 1000, 1500],
          'max_features': ['sqrt'],
          'criterion': ['gini', 'entropy']}),
        # AdaBoost
        ('Adaboost',
         AdaBoostClassifier(),
         {'n_estimators': [500, 1000, 1500],
          'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2]}),  # shrinks the contribution of each weak learner
        # Gradient Boosting
        ('Gradient Boosting',
         GradientBoostingClassifier(),
         {'n_estimators': [500, 1000, 1500],
          'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2],  # shrinks the contribution of each tree
          'loss': [gb_log_loss, 'exponential'],
          'subsample': [0.7, 0.85, 1.0],  # fraction of samples used to fit each base learner
          'max_depth': [2, 3, 4, 5],  # maximum depth of the individual trees
          'max_features': ['sqrt']}),
        # Multilayer Perceptron
        ('MLP',
         MLPClassifier(),
         {'solver': ['lbfgs'],
          'learning_rate': ["constant", "invscaling", "adaptive"],
          'hidden_layer_sizes': [(5, 5, 5), (7, 5, 3), (10,)],
          'alpha': [0.001, 0.01, 0.1, 1],
          'activation': ["relu", "tanh", 'identity'],
          'random_state': [97]}),
    ]

    colnames = ['Accuracy',
                'AUC',
                'Balanced Accuracy',
                'Kappa',
                'F1',
                'Recall',
                'Precision',
                'Average Precision',
                'Brier Score']

    # fit models via loop
    metric_rows = []
    for name, clf, grid in model_specs:
        # gridsearch
        mod = GridSearchCV(estimator=clf,
                           param_grid=grid,
                           cv=10)
        mod.fit(X_train, y_train)
        best_model = mod.best_estimator_
        # save the tuned, fitted model (not the unfitted template estimator)
        joblib.dump(best_model, os.path.join(location, name + '.pkl'))
        # make prediction on test set
        predictions = mod.predict(X_test)
        # ranking metrics need continuous scores rather than hard labels
        scores, is_probability = _positive_class_scores(best_model, X_test)
        # calculate classification metrics
        metric_rows.append([
            accuracy_score(y_test, predictions),
            roc_auc_score(y_test, scores),
            balanced_accuracy_score(y_test, predictions),
            cohen_kappa_score(y_test, predictions),
            f1_score(y_test, predictions),
            recall_score(y_test, predictions),
            precision_score(y_test, predictions),
            average_precision_score(y_test, scores),
            # the Brier score is only defined for probabilities
            brier_score_loss(y_test, scores) if is_probability else np.nan,
        ])

    # save the final classification metric table
    result = pd.DataFrame(metric_rows,
                          columns=colnames,
                          index=[name for name, _, _ in model_specs])
    result.to_csv(os.path.join(location, 'classification_model_results.csv'))

    return result

        


In [23]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer data set and assemble one frame whose first column is
# the response ('target'), followed by the feature columns.
bc = load_breast_cancer()
column_labels = np.append(['target'], bc['feature_names'])
stacked_values = np.column_stack((bc['target'], bc['data']))
data = pd.DataFrame(data=stacked_values, columns=column_labels)

In [None]:
# Folder that receives the pickled models and the metrics CSV.
output_location = 'C:/Users/jihun/Desktop/'
binaryClassification(data, output_location)











