In [1]:
import sqlite3
import numpy as np
import pandas as pd

from time import time

import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid
from sklearn import preprocessing, svm, metrics, tree, decomposition
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier

In [2]:
def precision_at_k(y_true, y_scores, k):
    threshold = np.sort(y_scores)[::-1][int(k*len(y_scores))]
    y_pred = np.asarray([1 if i >= threshold else 0 for i in y_scores])
    return metrics.precision_score(y_true, y_pred)

## Magicloop

In [3]:
def define_hyper_params():
    clfs = {
        'RF': RandomForestClassifier(n_estimators=100),
        'ET': ExtraTreesClassifier(n_estimators=10, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l2', C=1e5, solver='liblinear'),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3) 
            }

    grid = { 
        'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
        'SGD': { 'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': { 'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB' : {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear', 'rbf']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }

    return clfs, grid

In [4]:
def magic_loop(models_to_run, clfs, grid, X, y, search = 1):
    for n in range(1, 2):
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            try:
                if(search):
                    gs = GridSearchCV(clf, parameter_values, cv=5)
                    start = time()
                    y_pred_probs = gs.fit(X_train, y_train).predict_proba(X_test)[:,1]
                    y_score = gs.fit(X_train, y_train).decision_function(X_test)
                    print(precision_at_k(y_test, y_pred_probs, 0.05))
                    print("GridSearch time: " + (str)(time() - start))

                else:
                    start = time()
                    rs = RandomizedSearchCV(clf, parameter_values, cv=5)
                    y_pred_probs = rs.fit(X_train, y_train).predict_proba(X_test)[:,1]
                    print(precision_at_k(y_test, y_pred_probs, 0.05)) 
                    print("RandomizedSearch time: " + (str)(time() - start))
            except IndexError as e:
                print('Error:', e)
                continue

### Connect to Titanic DB

In [5]:
titanic = pd.read_sql_query("SELECT * from Titanic", sqlite3.connect("Titanic.db"))
titanic = titanic.fillna(value=np.nan)
lep = preprocessing.LabelEncoder()
for ftr in list(titanic):
    titanic[ftr] = lep.fit_transform(titanic[ftr].astype(str))  


### Parameters

In [8]:
clfs, grid = define_hyper_params()
models = ['KNN','RF','LR','ET','AB','GB','DT']
features = ['age', 'Class', 'Ticket', 'pounds', 'Group', 'Ship', 'Joined', 'Job', 'Boat', 'sex', 'boat_location']
X = titanic[features]
y = titanic['survived'].astype(int)

### Magic Lopp with GridSearch and RandomSearch

In [9]:
# GridSearch
#magic_loop(models, clfs, grid, X, y, search = 1)

#RandomSearch
magic_loop(models, clfs, grid, X, y, search = 0)


KNN
0.8549618320610687
RandomizedSearch time: 0.6100349426269531
RF
1.0
RandomizedSearch time: 99.84871101379395
LR
1.0
RandomizedSearch time: 1.2810733318328857
ET
1.0
RandomizedSearch time: 135.86377096176147
AB
1.0
RandomizedSearch time: 0.19901132583618164
GB
1.0
RandomizedSearch time: 13.362764358520508
DT
1.0
RandomizedSearch time: 0.16200900077819824
