In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

%matplotlib inline

In [3]:
# Collapse the five-way label into a binary target: class 1 stays 1,
# classes 2-5 become 0.
replace_class = {1: 1, 2: 0, 3: 0, 4: 0, 5: 0}

# Load the table, discard the 'Unnamed' column and binarise the label column.
data = (
    pd.read_csv('data.csv')
    .drop(columns='Unnamed')
    .assign(y=lambda frame: frame['y'].replace(replace_class))
)

# Split into a feature matrix and a label vector, both as NumPy arrays.
y = data['y'].to_numpy()
X = data.drop(columns='y').to_numpy()

# Standardise the features; the scaler statistics are computed over all rows of X.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [4]:
# Grid search for hyperparameter tuning
#
# For every enabled classifier, run a 5-fold cross-validated grid search over
# its entry in `param_grid` and print the best parameters and the best mean
# cross-validation score.

# Library default parameters, as noted when this search was written
# (TODO confirm against the installed library versions):
#   xgb: learning_rate=0.1, max_depth=3, n_estimators=100
#   rfc: n_estimators=100, max_depth=None, criterion='gini'
#   dtc: max_depth=None, criterion='gini'
#   lr:  penalty='l2', C=1.0, solver='lbfgs'
#   svc: C=1.0, kernel='rbf', gamma='scale', degree=3

# Classifiers to tune, keyed by the short name that is also used in `param_grid`.
# To include another model, uncomment its entry here AND its grid below.
clf_list = {
    # "xgb": xgb.XGBClassifier(),
    # "rfc": RandomForestClassifier(),
    # "dtc": DecisionTreeClassifier(),
    # "lr": LogisticRegression(max_iter=1000),
    "svc": SVC(probability=True)
}

# Define parameter grid for each classifier (plain estimator parameter names;
# the pipeline step prefix is added in the loop below).
param_grid = {
    # "xgb": {"learning_rate": [0.03, 0.1, 0.3], "n_estimators": [30, 100, 300]},
    # "rfc": {"n_estimators": [30, 100, 300], "criterion": ["gini", "entropy", "log_loss"]},
    # "dtc": {"criterion": ["gini", "entropy", "log_loss"], "max_features": [None, 'sqrt', 'log2']},
    # "lr": {"penalty": ["l1", "l2", "elasticnet", None], "C": [0.3, 1.0, 3.0], "solver": ["newton-cg", "lbfgs", "liblinear", "newton-cholesky", "sag", "saga"]},
    "svc": {"C": [0.3, 1.0, 3.0]}
}

for clf_name, clf in clf_list.items():
    # X was standardised on the full dataset before this cell, which lets the
    # held-out fold influence the scaling. Re-fitting a StandardScaler inside
    # the pipeline on each training split removes that leakage: standardising
    # is unaffected by the earlier per-feature rescaling of X.
    pipeline = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
    step_grid = {f"clf__{name}": values for name, values in param_grid[clf_name].items()}
    grid_clf = GridSearchCV(pipeline, step_grid, cv=5, verbose=4, return_train_score=True)

    # Silence warnings only while fitting; the context manager restores the
    # previous warning filters afterwards, even if fit() raises.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        grid_clf.fit(X, y)

    # grid_clf.best_params_ uses pipeline-prefixed keys ("clf__C"); strip the
    # prefix so the report shows the estimator's own parameter names.
    best_params = {name.split("__", 1)[1]: value for name, value in grid_clf.best_params_.items()}

    print(clf.__class__.__name__, clf_name)
    print("Best parameters: ", best_params)
    print("Best cross-validation score: %.2f%%" % (grid_clf.best_score_ * 100.0))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END .........C=0.3;, score=(train=0.970, test=0.965) total time=   8.4s
[CV 2/5] END .........C=0.3;, score=(train=0.969, test=0.966) total time=   8.1s
[CV 3/5] END .........C=0.3;, score=(train=0.970, test=0.966) total time=   7.2s
[CV 4/5] END .........C=0.3;, score=(train=0.970, test=0.966) total time=   6.9s
[CV 5/5] END .........C=0.3;, score=(train=0.970, test=0.962) total time=   6.6s
[CV 1/5] END .........C=1.0;, score=(train=0.983, test=0.970) total time=   5.9s
[CV 2/5] END .........C=1.0;, score=(train=0.982, test=0.973) total time=   5.9s
[CV 3/5] END .........C=1.0;, score=(train=0.983, test=0.972) total time=   6.7s
[CV 4/5] END .........C=1.0;, score=(train=0.982, test=0.973) total time=   6.0s
[CV 5/5] END .........C=1.0;, score=(train=0.982, test=0.970) total time=   6.5s
[CV 1/5] END .........C=3.0;, score=(train=0.993, test=0.975) total time=   5.9s
[CV 2/5] END .........C=3.0;, score=(train=0.994,

XGBClassifier xgb  
Best parameters:  {'learning_rate': 0.3, 'n_estimators': 300}  
Best cross-validation score: 97.39%  

RandomForestClassifier rfc  
Best parameters:  {'criterion': 'gini', 'n_estimators': 300}  
Best cross-validation score: 97.49%  

DecisionTreeClassifier dtc  
Best parameters:  {'criterion': 'entropy', 'max_features': 'log2'}  
Best cross-validation score: 93.87%  

LogisticRegression lr  
Best parameters:  {'C': 3.0, 'penalty': 'l1', 'solver': 'liblinear'}  
Best cross-validation score: 82.00%  

SVC svc  
Best parameters:  {'C': 3.0}  
Best cross-validation score: 97.61%  