In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
import xgboost as xgb
from collections import defaultdict

warnings.filterwarnings('ignore')

SEED = 57

In [2]:
df_tv = pd.read_csv('data/train_preprocessed_fold.csv', index_col='PassengerId')

In [3]:
classifiers = {
    'lr': linear_model.LogisticRegression,
    'svc': svm.SVC,
    'lsvc': svm.LinearSVC,
    'gnb': naive_bayes.GaussianNB,
    'mnb': naive_bayes.MultinomialNB,
    'knn': neighbors.KNeighborsClassifier,
    'dt': tree.DecisionTreeClassifier,
    'rf': ensemble.RandomForestClassifier,
    'ada': ensemble.AdaBoostClassifier,
    'gb': ensemble.GradientBoostingClassifier,
    'sgd': linear_model.SGDClassifier,
    'xgb': xgb.XGBClassifier
}

classifiers_params = {
    'lr': {
        'C': np.logspace(-3, 3, 7),
        'random_state': [SEED]
    },
    'svc': {
        'C': np.logspace(-3, 3, 7),
        'gamma': np.logspace(-3, 1, 5),
        'random_state': [SEED]
    },
    'lsvc': {
        'C': np.logspace(-3, 3, 7),
        'random_state': [SEED]
    },
    'gnb': {
        'var_smoothing': np.logspace(-12, -6, 7)
    },
    'mnb': {
        'alpha': np.linspace(0, 1, 11)
    },
    'knn': {
        'n_neighbors': np.linspace(3, 7, 5, dtype=int)
    },
    'dt': {
        'criterion': ["gini", "entropy", "log_loss"],
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int),
        'random_state': [SEED]
    },
    'rf': {
        ''
        'criterion': ["gini", "entropy", "log_loss"],
        'min_samples_leaf': [0.1],
        'max_depth': np.linspace(3, 7, 5, dtype=int),
        'n_estimators': np.linspace(50, 300, 6, dtype=int),
        'random_state': [SEED]
    },
    'ada': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int),
        'random_state': [SEED]
    },
    'gb': {
        'n_estimators': np.linspace(50, 300, 6, dtype=int),
        'min_samples_leaf': [0.1],
        'random_state': [SEED]
    },
    'sgd': {
        'penalty' : ['l2', 'l1', 'elasticnet'],
        'alpha': np.logspace(-7, -1, 7),
        'random_state': [SEED]
    },
    'xgb': {
        'eta': np.arange(0, 1.1, 0.1),
        'gamma': np.logspace(0, 4, 5),
        'lambda': np.logspace(0, 4, 5),
        'seed': [SEED]
    }
}

In [4]:
def calculate_accuracy(classifier, params, fold):
    df_tr = df_tv[df_tv.kfold != fold]
    df_vl = df_tv[df_tv.kfold == fold]
    xtr, ytr = df_tr.drop('Survived', axis=1), df_tr['Survived']
    xvl, yvl = df_vl.drop('Survived', axis=1), df_vl['Survived']
    ypd = classifier(**params).fit(xtr, ytr).predict(xvl)
    return metrics.accuracy_score(yvl, ypd)

In [5]:
accuracies = defaultdict(list)

for classifier_name, classifier in classifiers.items():
    for params in model_selection.ParameterGrid(classifiers_params[classifier_name]):
        acc = np.mean([calculate_accuracy(classifier, params, fold) for fold in range(5)])
        accuracies[classifier_name].append((acc, params))

    accuracies[classifier_name].sort(key=lambda x: x[0], reverse=True)
    print(f'classifier: {classifier_name}, params: {accuracies[classifier_name][0][1]} ==> {accuracies[classifier_name][0][0]}')


classifier: lr, params: {'C': 100.0, 'random_state': 57} ==> 0.802416671897558
classifier: svc, params: {'C': 100.0, 'gamma': 0.001, 'random_state': 57} ==> 0.744090138723244
classifier: lsvc, params: {'C': 1.0, 'random_state': 57} ==> 0.7699077270730024
classifier: gnb, params: {'var_smoothing': 1e-06} ==> 0.7889711882493253
classifier: mnb, params: {'alpha': 0.0} ==> 0.7867428284476807
classifier: knn, params: {'n_neighbors': 4} ==> 0.6139099868181533
classifier: dt, params: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 0.1, 'random_state': 57} ==> 0.7755194275312285
classifier: rf, params: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 50, 'random_state': 57} ==> 0.7923294206264515
classifier: ada, params: {'n_estimators': 50, 'random_state': 57} ==> 0.8080597577051034
classifier: gb, params: {'min_samples_leaf': 0.1, 'n_estimators': 50, 'random_state': 57} ==> 0.8237838177138912
classifier: sgd, params: {'alpha': 0.01, 'penalty': 'l1',