In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
import xgboost as xgb
from collections import defaultdict

# Suppress every warning for the remainder of the session (no category or
# module filter is given, so library warnings are hidden as well).
warnings.filterwarnings('ignore')

# Shared seed constant. NOTE(review): the hyper-parameter table in a later
# cell hard-codes the same value 57 instead of referencing this name.
SEED = 57

In [2]:
# Load the fold-annotated training frame and the test frame, in that order,
# both indexed by PassengerId.
df_tv, df_tt = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (
        'data/train_preprocessed_fold.csv',
        'data/test_preprocessed.csv',
    )
)

In [3]:
# Count missing values per column of the test frame.
df_tt.isna().sum()

Age             0
FamilySize      0
LogFare         0
Pclass_three    0
Pclass_two      0
Sex_male        0
Cabin_B         0
Cabin_C         0
Cabin_D         0
Cabin_E         0
Cabin_F         0
Cabin_G         0
Cabin_T         0
Embarked_Q      0
Embarked_S      0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
Title_Others    0
dtype: int64

In [4]:
# Name of the label column in the training frame.
target_col = 'Survived'

# Feature columns fed to every base classifier, grouped by origin.
# The variable keeps its existing spelling because later cells refer to it.
traning_cols = [
    # numeric features
    'Age', 'FamilySize', 'LogFare',
    # passenger class / sex indicators
    'Pclass_three', 'Pclass_two', 'Sex_male',
    # cabin indicators
    'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T',
    # port of embarkation indicators
    'Embarked_Q', 'Embarked_S',
    # title indicators
    'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Others',
]

In [5]:
# Short key -> estimator class for every base learner. The keys double as the
# names of the meta-feature columns and as the keys of `classifier_params`.
classifiers = dict(
    lr=linear_model.LogisticRegression,
    svc=svm.SVC,
    lsvc=svm.LinearSVC,
    gnb=naive_bayes.GaussianNB,
    mnb=naive_bayes.MultinomialNB,
    knn=neighbors.KNeighborsClassifier,
    dt=tree.DecisionTreeClassifier,
    rf=ensemble.RandomForestClassifier,
    ada=ensemble.AdaBoostClassifier,
    gb=ensemble.GradientBoostingClassifier,
    sgd=linear_model.SGDClassifier,
    xgb=xgb.XGBClassifier,
)

# Per-classifier settings, keyed like `classifiers`. Each row below is
# (key, constructor keyword arguments, recorded score) and is expanded into
# {'params': ..., 'score': ...}. The metric behind 'score' is not shown here.
# The literal 57 used for random_state / seed equals the SEED constant.
classifier_params = {
    key: {'params': kwargs, 'score': score}
    for key, kwargs, score in (
        ('lr', {'C': 1.0, 'random_state': 57}, 0.829),
        ('svc', {'C': 1000.0, 'gamma': 0.001, 'random_state': 57}, 0.830),
        ('lsvc', {'C': 1.0, 'random_state': 57}, 0.827),
        ('gnb', {'var_smoothing': 1e-06}, 0.757),
        ('mnb', {'alpha': 0.0}, 0.781),
        ('knn', {'n_neighbors': 4}, 0.791),
        ('dt', {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 0.1, 'random_state': 57}, 0.775),
        ('rf', {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 0.1, 'n_estimators': 250, 'random_state': 57}, 0.787),
        ('ada', {'n_estimators': 100, 'random_state': 57}, 0.817),
        ('gb', {'min_samples_leaf': 0.1, 'n_estimators': 250, 'random_state': 57}, 0.836),
        ('sgd', {'alpha': 0.001, 'penalty': 'elasticnet', 'random_state': 57}, 0.824),
        ('xgb', {'eta': 0.6, 'gamma': 1.0, 'lambda': 10.0, 'seed': 57}, 0.839),
    )
}

In [6]:
def generate_meta_feature(classifier_name, classifier, params, fold):
    """Store one classifier's out-of-fold predictions for `fold` in `df_tv`.

    A new `classifier(**params)` is fitted on the rows of the module-level
    `df_tv` whose `kfold` differs from `fold`, and then predicts the rows
    whose `kfold` equals `fold`. The predictions are written into the
    `classifier_name` column of `df_tv` for those held-out rows only, so a
    row's meta-feature never comes from a model that saw that row's label.

    Parameters
    ----------
    classifier_name : str
        Column of `df_tv` that receives the predictions.
    classifier : callable
        Called as `classifier(**params)`; the result must provide
        `fit(X, y)` returning the fitted estimator, and `predict(X)`.
    params : dict
        Keyword arguments forwarded to `classifier`.
    fold : int
        Value of `df_tv.kfold` identifying the held-out rows.

    Returns
    -------
    None
        `df_tv` is modified in place.

    Raises
    ------
    ValueError
        If no row of `df_tv` belongs to `fold`.
    """
    df_tr = df_tv[df_tv.kfold != fold]
    df_vl = df_tv[df_tv.kfold == fold]

    if df_vl.empty:
        raise ValueError(f'fold {fold!r} has no rows in df_tv')

    xtr, ytr = df_tr[traning_cols], df_tr[target_col]
    # Predict on the held-out fold, not on the rows the model was fitted on:
    # in-sample predictions would leak the target into the meta-feature.
    xvl = df_vl[traning_cols]

    ypd = classifier(**params).fit(xtr, ytr).predict(xvl)

    # Only the held-out rows are touched; other folds fill in the rest.
    df_tv.loc[xvl.index, classifier_name] = ypd

In [7]:
# Add one meta-feature column per base classifier to the training frame.
for classifier_name in classifiers:
    classifier = classifiers[classifier_name]
    params = classifier_params[classifier_name]['params']
    # Create the column as all-None first; generate_meta_feature then writes
    # predictions into it on each of the five fold calls.
    df_tv[classifier_name] = None
    for fold in range(5):
        generate_meta_feature(classifier_name, classifier, params, fold)

In [8]:
# Preview the first rows of the training frame, which now carries one extra
# column per classifier key.
df_tv.head()

Unnamed: 0_level_0,Survived,Age,FamilySize,LogFare,Pclass_three,Pclass_two,Sex_male,Cabin_B,Cabin_C,Cabin_D,...,lsvc,gnb,mnb,knn,dt,rf,ada,gb,sgd,xgb
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.275,0.181818,0.338125,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.475,0.181818,0.685892,0.0,0.0,0.0,0.0,1.0,0.0,...,1,1,1,1,1,1,1,1,1,1
3,1,0.325,0.090909,0.350727,1.0,0.0,0.0,0.0,1.0,0.0,...,1,1,1,1,1,1,1,1,1,0
4,1,0.4375,0.181818,0.639463,0.0,0.0,0.0,0.0,1.0,0.0,...,1,1,1,1,1,1,1,1,1,1
5,0,0.4375,0.090909,0.352955,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Fit every base classifier on the complete training frame and store its
# predictions for the test frame in a column named after the classifier key.
xtr = df_tv[traning_cols]
ytr = df_tv[target_col]
for classifier_name in classifiers:
    classifier = classifiers[classifier_name]
    params = classifier_params[classifier_name]['params']
    fitted = classifier(**params).fit(xtr, ytr)
    # Select the feature columns explicitly: df_tt gains a new column on
    # every iteration, and those must not be fed to the next model.
    ypd = fitted.predict(df_tt[traning_cols])
    df_tt[classifier_name] = ypd

In [12]:
# Preview the first five rows of the test frame, including the prediction
# columns added per classifier key.
df_tt.head(5)

Unnamed: 0_level_0,Age,FamilySize,LogFare,Pclass_three,Pclass_two,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,lsvc,gnb,mnb,knn,dt,rf,ada,gb,sgd,xgb
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,0.453947,0.090909,0.348997,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
893,0.618421,0.181818,0.333195,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,1,1,0,0,1,1,0,1,0
894,0.815789,0.090909,0.379604,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
895,0.355263,0.090909,0.363449,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
896,0.289474,0.272727,0.414494,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,1,1,1,0,1,1,1,1,0


In [13]:
# Persist both augmented frames (training first, then test) as CSV files.
for frame, out_path in (
    (df_tv, 'data/train_preprocessed_meta.csv'),
    (df_tt, 'data/test_preprocessed_meta.csv'),
):
    frame.to_csv(out_path)