In [3]:
import numpy as np 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import VotingClassifier

In [8]:
# Read the pre-computed feature matrix (X) and label vector (y) from two .npy
# files addressed relative to the current working directory.
# NOTE(review): the file names read "tatanic" -- presumably a misspelling of
# "titanic"; they are kept verbatim because they must match the files on disk.
X, y = (np.load(npy_path) for npy_path in ('./tatanic_X_train.npy',
                                           './tatanic_y_train.npy'))

In [9]:
# Preview the first five rows of the feature matrix (bare last expression, so
# the notebook renders the array itself).
X[0:5]

array([[0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
        0.125     , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.473882  , 0.13913574, 0.        , 0.        , 1.        ,
        0.125     , 0.25      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.32356257, 0.01546857, 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.    

In [10]:
# Preview the first five labels (bare last expression, so the notebook renders
# the array itself).
y[0:5]

array([0., 1., 1., 1., 0.])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation.
# A fixed random_state makes the shuffle -- and therefore every score computed
# from this split -- reproducible across kernel restarts.  The seed value
# mirrors the random_state=1 already used for the estimators in this notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

In [15]:
# Base learners.  random_state is pinned on the two estimators that are given
# one here so repeated runs build the same models.
clflog = LogisticRegression(random_state=1)
clfdt = DecisionTreeClassifier(random_state=1)
clfgn = GaussianNB()

# (name, estimator) pairs used by both ensembles.  The decision tree is
# labelled 'dt': it was previously labelled 'rf', which wrongly suggested a
# random forest and disagreed with the 'dt' label used for the same estimator
# type in the two-member ensembles later in this notebook.
base_estimators = [('lr', clflog), ('dt', clfdt), ('gnb', clfgn)]

# Each ensemble receives its own copy of the list so that editing one
# ensemble's estimator list can never affect the other.
# voting='hard' -> majority vote over the predicted class labels.
eclf_h = VotingClassifier(estimators=list(base_estimators), voting='hard')
# voting='soft' -> argmax over the averaged predicted class probabilities.
eclf_s = VotingClassifier(estimators=list(base_estimators), voting='soft')

In [16]:
# Candidates to compare, in reporting order: the three base learners first,
# then the two voting ensembles built from them.
models = [
    clflog,
    clfdt,
    clfgn,
    eclf_h,
    eclf_s,
]

In [17]:
# Fit every candidate on the training split and report its score on the
# held-out split (for scikit-learn classifiers, score() is the mean accuracy).
# score() performs the prediction itself, so the separate predict() call whose
# result was never used has been removed.
for model in models:
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print(model)
    print(score)
    print('-'*20)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.8239700374531835
--------------------
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')
0.7490636704119851
--------------------
GaussianNB(priors=None, var_smoothing=1e-09)
0.3970037453183521
--------------------
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state



In [18]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of every candidate on the training split: print the
# per-fold scores, then their mean, then a separator line.
for candidate in models:
    fold_scores = cross_val_score(candidate, x_train, y_train, cv=5)
    print(fold_scores)
    print(fold_scores.mean())
    print('-'*20)

[0.824      0.824      0.824      0.77419355 0.79674797]
0.8085883031733543
--------------------
[0.784      0.808      0.744      0.80645161 0.82113821]
0.792717964857068
--------------------
[0.744      0.384      0.392      0.72580645 0.3902439 ]
0.5272100708103855
--------------------
[0.808      0.792      0.784      0.7983871  0.77235772]
0.7909489640702859
--------------------
[0.808      0.784      0.776      0.7983871  0.75609756]
0.7844969315499608
--------------------




In [19]:
# Two-member variant: only a logistic regression and a decision tree, both
# seeded with random_state=1 (no Gaussian naive Bayes member this time).
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)

# The same (name, estimator) pairs combined once by hard voting and once by
# soft voting.
eclf1 = VotingClassifier(
    voting='hard',
    estimators=[('lr', clf1), ('dt', clf2)],
)
eclf2 = VotingClassifier(
    voting='soft',
    estimators=[('lr', clf1), ('dt', clf2)],
)

In [21]:
# Repeat the 5-fold cross-validation for the two base learners and the two
# two-member ensembles.  Note that this rebinds `models`.
models = [clf1, clf2, eclf1, eclf2]
for candidate in models:
    fold_scores = cross_val_score(candidate, x_train, y_train, cv=5)

    print(fold_scores)
    print(fold_scores.mean())
    print('-'*20)
# NOTE(review): in the recorded output the soft-voting scores equal the
# decision tree's scores fold for fold -- presumably the unpruned tree emits
# 0/1 probabilities that always dominate the average; confirm before relying
# on the soft-voting ensemble.

[0.824      0.824      0.824      0.77419355 0.79674797]
0.8085883031733543
--------------------
[0.784      0.808      0.744      0.80645161 0.82113821]
0.792717964857068
--------------------
[0.8        0.832      0.792      0.84677419 0.84552846]
0.823260529766588
--------------------
[0.784      0.808      0.744      0.80645161 0.82113821]
0.792717964857068
--------------------




In [24]:
# Hard-voting ensemble of the logistic regression and the decision tree; this
# is the estimator that the grid search in this notebook tunes.
eclf = VotingClassifier(
    voting='hard',
    estimators=[('lr', clf1), ('dt', clf2)],
)

In [26]:
# Candidate values for the logistic regression's C parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Search space for the voting ensemble.  Each key is
# "<estimator name>__<parameter>", where the estimator name is the label given
# to that member in the ensemble's `estimators` list ('lr' / 'dt').
params = {
    "lr__solver": ['liblinear'],
    "lr__penalty": ["l2"],
    "lr__C": c_params,
    "dt__criterion": ["gini", "entropy"],
    "dt__max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
    "dt__min_samples_leaf": list(range(1, 10)),
}

In [27]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold grid search over `params` for the voting ensemble.
# It is fitted on the full X / y arrays, not on the x_train / y_train split.
# fit() returns the fitted search object, so construction and fitting are chained.
grid = GridSearchCV(eclf, params, cv=5).fit(X, y)

In [30]:
grid.best_score_

0.84251968503937

In [31]:
grid.best_params_

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}

In [32]:
# Tune the logistic regression on its own, over the same C values and on the
# full X / y arrays, for comparison with the ensemble search.
# Note that this rebinds `c_params`, `params` and `grid`.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]
params = {
    "solver": ['liblinear'],
    "penalty": ["l2"],
    "C": c_params,
}
grid = GridSearchCV(clf1, param_grid=params, cv=5).fit(X, y)

In [33]:
grid.best_score_

0.8267716535433071

In [34]:
grid.best_params_

{'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}