In [1]:

import pandas as pd
import numpy as np

In [2]:
# Load the train/test CSVs and append `Sex_binario`, a 0/1 encoding of `Sex`
# (male -> 0, female -> 1). Any `Sex` value missing from the mapping becomes NaN.
train = pd.read_csv("data/train.csv").assign(
    Sex_binario=lambda df: df['Sex'].map({"male": 0, "female": 1})
)
test = pd.read_csv("data/test.csv").assign(
    Sex_binario=lambda df: df['Sex'].map({"male": 0, "female": 1})
)

In [3]:
# Feature matrix: every numeric column of `train` except the row id and the
# target, with missing values replaced by 0.
X = (
    train
    .select_dtypes(include=np.number)
    .drop(columns=["PassengerId", 'Survived'])
    .fillna(0)
)
# Target vector: the `Survived` column.
y = train['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [4]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.pipeline import make_pipeline

In [5]:
# First stacking level: for every fold, fit each base model on the training
# part and store its class-1 probability for the held-out part, so that each
# row of `second_level` ends up holding out-of-fold predictions only.
kf = KFold(n_splits=2, random_state=0, shuffle=True)

# Label -> zero-argument factory. Factories (rather than instances) guarantee
# that every fold trains a fresh, unfitted estimator. The dict order fixes the
# column order of `second_level` and the order of the printed report.
base_models = {
    "RF": lambda: RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=10),
    "ET": lambda: ExtraTreesClassifier(n_estimators=100, n_jobs=6, random_state=10),
    "LR StdScaler": lambda: make_pipeline(StandardScaler(), LogisticRegression()),
    "LR MinMax": lambda: make_pipeline(MinMaxScaler(), LogisticRegression()),
}

# One row per training sample, one column per base model.
second_level = np.zeros((X.shape[0], len(base_models)))

for tr, ts in kf.split(X, y):
    Xtr, Xval = X.iloc[tr], X.iloc[ts]
    ytr, yval = y.iloc[tr], y.iloc[ts]

    for col, (label, make_model) in enumerate(base_models.items()):
        model = make_model()
        model.fit(Xtr, ytr)
        proba = model.predict_proba(Xval)[:, 1]  # probability of class 1
        hard = (proba > 0.5).astype(int)         # hard labels at a 0.5 cut-off

        print("{} Accuracy: {} - Log Loss: {}".format(
            label, accuracy_score(yval, hard), log_loss(yval, proba)))

        # `ts` holds positional indices, so it addresses the matching rows.
        second_level[ts, col] = proba

    print()

# diversity factors
second_level.shape

RF Accuracy: 0.7869955156950673 - Log Loss: 0.9094309116007115
ET Accuracy: 0.7802690582959642 - Log Loss: 1.7851321887744416
LR StdScaler Accuracy: 0.7713004484304933 - Log Loss: 0.45488997036355516
LR MinMax Accuracy: 0.7713004484304933 - Log Loss: 0.45728829978094115

RF Accuracy: 0.8292134831460675 - Log Loss: 0.6380995261890016
ET Accuracy: 0.8089887640449438 - Log Loss: 2.631965993490573
LR StdScaler Accuracy: 0.7955056179775281 - Log Loss: 0.45792319674193815
LR MinMax Accuracy: 0.8044943820224719 - Log Loss: 0.4610908178469295



(891, 4)

In [6]:
# Second stacking level: score a logistic-regression meta-learner on the
# base-model probabilities in `second_level`, using the `kf` splitter again.
for tr, ts in kf.split(X, y):
    # `second_level` was filled by positional index, so the same integer
    # indices slice both it and `y`.
    Xtr, ytr = second_level[tr], y.iloc[tr]
    Xval, yval = second_level[ts], y.iloc[ts]
    print(Xtr.shape, Xval.shape)
    print(ytr.shape, yval.shape)

    lr_stack = LogisticRegression(C=1.).fit(Xtr, ytr)
    plr_stack = lr_stack.predict_proba(Xval)[:, 1]  # probability of class 1
    plr_stack_ = np.where(plr_stack > 0.5, 1, 0)    # hard labels at a 0.5 cut-off

    stack_acc = accuracy_score(yval, plr_stack_)
    stack_ll = log_loss(yval, plr_stack)
    print("Stack Accuracy: {}  Log loss: {}".format(stack_acc, stack_ll))
    print()

(445, 4) (446, 4)
(445,) (446,)
Stack Accuracy: 0.8004484304932735  Log loss: 0.4379016109901706

(446, 4) (445, 4)
(446,) (445,)
Stack Accuracy: 0.8157303370786517  Log loss: 0.4288782571629447



In [7]:
# Pairwise Pearson correlation between the base models' predictions; each
# column of `second_level` is treated as one variable.
pd.DataFrame(np.corrcoef(second_level, rowvar=False))

Unnamed: 0,0,1,2,3
0,1.0,0.966815,0.808317,0.808803
1,0.966815,1.0,0.756243,0.756547
2,0.808317,0.756243,1.0,0.995999
3,0.808803,0.756547,0.995999,1.0
