In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas
pandas.set_option('display.max_rows', 1000)
pandas.set_option('display.max_columns', 1000)

%reload_ext autoreload
%autoreload 2

In [2]:
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [3]:
import re
import numpy
import pandas
from sklearn.preprocessing import StandardScaler

In [4]:
def first_dataset():
    
    train = pandas.read_csv('train.csv')
    test = pandas.read_csv('test.csv')
    
    datasets = [train, test]
    
    def get_title(name):
        if re.search(' ([A-Za-z]+)\.', name):
            return re.search(' ([A-Za-z]+)\.', name).group(1)
        return ""

    
    for dataset in datasets:

        dataset['Cabin'] = dataset['Cabin'].apply(lambda x: 1 if type(x) == str else 0)
        
        dataset['Age'] = dataset['Age'].fillna(-1).astype(int)
        
        dataset['Fare'] = dataset['Fare'].fillna(-1).astype(int)

        dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
        
        dataset['Title'] = dataset['Name'].apply(get_title)
        dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
        title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
        dataset['Title'] = dataset['Title'].map(title_mapping)
        dataset['Title'] = dataset['Title'].fillna(-1)

        dataset['Embarked'] = dataset['Embarked'].fillna('S')
        dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
        dataset.drop(['PassengerId', 'Ticket', 'Name'], axis=1, inplace=True)
        
    X_train = train.drop(['Survived'], axis=1)
    y_train = train['Survived']
    X_test = test
    
    std = StandardScaler()
    std.fit(X_train)
    X_train = std.transform(X_train).astype(numpy.float32)
    X_test = std.transform(X_test).astype(numpy.float32)

    return {'X_train': X_train, 'X_test': X_test, 'y_train': y_train}
    
df = first_dataset()

In [5]:
import lightgbm as lgbm
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(df['X_train'], df['y_train'], test_size=0.2, random_state=0)

train_dataset = lgbm.Dataset(data=X_train, label=y_train, free_raw_data=False)
test_dataset = lgbm.Dataset(data=X_valid, label=y_valid, free_raw_data=False)
final_train_dataset = lgbm.Dataset(data=df['X_train'], label=df['y_train'], free_raw_data=False)

lgbm_params = {
    'boosting': 'dart', 
    'application': 'binary',
    'learning_rate': 0.05,
    'min_data_in_leaf': 20,
    'feature_fraction': 0.7,
    'num_leaves': 41,
    'metric': 'binary_logloss',
    'drop_rate': 0.15
}

evaluation_results = {}
clf = lgbm.train(train_set=train_dataset,
                 params=lgbm_params,
                 valid_sets=[train_dataset, test_dataset], 
                 valid_names=['Train', 'Test'],
                 evals_result=evaluation_results,
                 num_boost_round=500,
                 early_stopping_rounds=100,
                 verbose_eval=20
                )
                
clf_final = lgbm.train(train_set=final_train_dataset,
                      params=lgbm_params,
                      num_boost_round=500,
                      verbose_eval=0
                      )

y_pred = numpy.round(clf_final.predict(df['X_test'])).astype(int)

passengerId = pandas.read_csv('test.csv')['PassengerId']
dataframe = pandas.DataFrame({'PassengerId': passengerId, 'Survived': y_pred})

dataframe.to_csv('submission_single_lgbm_model.csv', index=False)



[20]	Train's binary_logloss: 0.464484	Test's binary_logloss: 0.487243
[40]	Train's binary_logloss: 0.408891	Test's binary_logloss: 0.446751
[60]	Train's binary_logloss: 0.393729	Test's binary_logloss: 0.442872
[80]	Train's binary_logloss: 0.378047	Test's binary_logloss: 0.436778
[100]	Train's binary_logloss: 0.37472	Test's binary_logloss: 0.436104
[120]	Train's binary_logloss: 0.365463	Test's binary_logloss: 0.432786
[140]	Train's binary_logloss: 0.35433	Test's binary_logloss: 0.424344
[160]	Train's binary_logloss: 0.35888	Test's binary_logloss: 0.426459
[180]	Train's binary_logloss: 0.337339	Test's binary_logloss: 0.416787
[200]	Train's binary_logloss: 0.324049	Test's binary_logloss: 0.413225
[220]	Train's binary_logloss: 0.317539	Test's binary_logloss: 0.415004
[240]	Train's binary_logloss: 0.308851	Test's binary_logloss: 0.415523
[260]	Train's binary_logloss: 0.302299	Test's binary_logloss: 0.416495
[280]	Train's binary_logloss: 0.2962	Test's binary_logloss: 0.41336
[300]	Train's bi

In [7]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

from rgf.sklearn import RGFClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [8]:
#with timer('LightGBM'):

In [9]:
ds = Dataset(preprocessor=first_dataset, use_cache=False)

In [10]:
et_params = {'n_estimators': 100, 'max_features': 0.5, 'max_depth': 18, 'min_samples_leaf': 4, 'n_jobs': -1}
rf_params = {'n_estimators': 125, 'max_features': 0.2, 'max_depth': 25, 'min_samples_leaf': 4, 'n_jobs': -1}
rgf_params = {'algorithm': 'RGF_Sib', 'loss': 'Log'}

In [12]:
from keras.layers import Dense
from keras.models import Sequential

def NuralNetClassifier(X_train, y_train, X_test, y_test=None):
    input_dim = X_train.shape[1]
    
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim, activation='relu'))
    model.add(Dense(6, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=30, batch_size=10, verbose=0)
    y_pred = numpy.ravel(model.predict(X_test))
    
    return y_pred

In [13]:
def LightGBMClassifier(X_train, y_train, X_test, y_test=None):
    lgbm_params = {
        'boosting': 'dart', 
        'application': 'binary',
        'learning_rate': 0.05,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.7,
        'num_leaves': 41,
        'metric': 'binary_logloss',
        'drop_rate': 0.15
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    train_dataset = lgbm.Dataset(data=X_train, label=y_train, free_raw_data=False)
    test_dataset = lgbm.Dataset(data=X_valid, label=y_valid, free_raw_data=False)
    
    final_train_dataset = lgbm.Dataset(data=X_train, label=y_train, free_raw_data=False)
    
    evaluation_results = {}
    
    clf = lgbm.train(train_set=train_dataset,
                     params=lgbm_params,
                     valid_sets=[train_dataset, test_dataset], 
                     valid_names=['Train', 'Test'],
                     evals_result=evaluation_results,
                     num_boost_round=500,
                     early_stopping_rounds=100,
                     verbose_eval=0
                    )
    
    clf_final = lgbm.train(train_set=final_train_dataset,
                          params=lgbm_params,
                          num_boost_round=500,
                          verbose_eval=0
                          )

    y_pred = clf_final.predict(X_test)

    
    return y_pred

In [14]:
pipeline = ModelsPipeline(
    Classifier(estimator=LightGBMClassifier, dataset=ds, use_cache=False),
    Classifier(estimator=NuralNetClassifier, dataset=ds, use_cache=False),
    Classifier(estimator=RGFClassifier, dataset=ds, use_cache=False, parameters=rgf_params),
    Classifier(estimator=ExtraTreesClassifier, dataset=ds, use_cache=False, parameters=et_params),
    Classifier(estimator=RandomForestClassifier, dataset=ds, use_cache=False, parameters=rf_params),
    Classifier(estimator=LogisticRegression, dataset=ds, use_cache=False),
    Classifier(estimator=KNeighborsClassifier, dataset=ds, use_cache=False)
)

In [15]:
with timer('heamy single stacking model'):
    stack_ds = pipeline.stack(k=10, seed=0, add_diff=False, full_test=True)
    stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, use_cache=False)
    y_pred = stacker.predict()



[heamy single stacking model] done in 119 s


In [16]:
from sklearn.metrics import log_loss

In [17]:
pipeline2 = ModelsPipeline(
    Classifier(estimator=LightGBMClassifier, dataset=stack_ds, use_cache=False),
    Classifier(estimator=NuralNetClassifier, dataset=stack_ds, use_cache=False)
)

In [18]:
with timer('heamy multiple stacking model'):
    weights = pipeline2.find_weights(log_loss)
    predictions = pipeline2.weight(weights).execute()



Best Score (log_loss): 0.3998561112420261
Best Weights: [0.50000622 0.49999378]




[heamy multiple stacking model] done in 20 s


# NN抜き

In [19]:
pipeline = ModelsPipeline(
    Classifier(estimator=LightGBMClassifier, dataset=ds, use_cache=False),
    Classifier(estimator=RGFClassifier, dataset=ds, use_cache=False, parameters=rgf_params),
    Classifier(estimator=ExtraTreesClassifier, dataset=ds, use_cache=False, parameters=et_params),
    Classifier(estimator=RandomForestClassifier, dataset=ds, use_cache=False, parameters=rf_params),
    Classifier(estimator=LogisticRegression, dataset=ds, use_cache=False),
    Classifier(estimator=KNeighborsClassifier, dataset=ds, use_cache=False)
)

In [20]:
pipeline2 = ModelsPipeline(
    Classifier(estimator=LightGBMClassifier, dataset=stack_ds, use_cache=False),
    Classifier(estimator=LogisticRegression, dataset=stack_ds, use_cache=False)
)

In [21]:
with timer('heamy single stacking model'):
    stack_ds = pipeline.stack(k=10, seed=0, add_diff=False, full_test=True)
    stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, use_cache=False)
    y_pred = stacker.predict()



[heamy single stacking model] done in 64 s


In [22]:
with timer('heamy multiple stacking model'):
    weights = pipeline2.find_weights(log_loss)
    predictions = pipeline2.weight(weights).execute()



Best Score (log_loss): 0.39725201890402023
Best Weights: [0.30616325 0.69383675]




[heamy multiple stacking model] done in 12 s


In [23]:
from sklearn.ensemble import StackingClassifier
from keras.wrappers.scikit_learn import KerasClassifier

In [24]:
lgbm_params = {
        'boosting': 'dart', 
        'application': 'binary',
        'learning_rate': 0.05,
        'min_data_in_leaf': 20,
        'feature_fraction': 0.7,
        'num_leaves': 41,
        'metric': 'binary_logloss',
        'drop_rate': 0.15
}
keras_params = {'epochs': 10, 'batch_size': 10}

In [25]:
def build_fn():
    clf = Sequential()
    clf.add(Dense(12, input_dim=9, activation='relu'))
    clf.add(Dense(6, activation='relu'))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return clf

In [26]:
estimators = [
    ('lgb', lgbm.LGBMClassifier(**lgbm_params)),
    #('nn', KerasClassifier(build_fn=build_fn, **keras_params)),
    ('rgf', RGFClassifier(**rgf_params)),
    ('et', ExtraTreesClassifier(**et_params)),
    ('rf', RandomForestClassifier(**rf_params)),
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier())
]

In [27]:
 with timer('sklean single stacking model'):
    clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    clf.fit(df['X_train'], df['y_train'])
    predictions = clf.predict(df['X_test'])

StackingClassifier(cv=None,
                   estimators=[('lgb',
                                LGBMClassifier(application='binary',
                                               boosting='dart',
                                               boosting_type='gbdt',
                                               class_weight=None,
                                               colsample_bytree=1.0,
                                               drop_rate=0.15,
                                               feature_fraction=0.7,
                                               importance_type='split',
                                               learning_rate=0.05, max_depth=-1,
                                               metric='binary_logloss',
                                               min_child_samples=20,
                                               min_child_weight=0.001,
                                               min_data_in_leaf=20,
                               

[sklean single stacking model] done in 30 s


In [28]:
 with timer('sklean multiple stacking model'):
    final_estimator = StackingClassifier(
        estimators= [
            ('lgb', lgbm.LGBMClassifier(**lgbm_params)),
            ('lr', LogisticRegression())
        ],
        final_estimator=LogisticRegression()
    )

    clf = StackingClassifier(
        estimators= [
            ('lgb', lgbm.LGBMClassifier(**lgbm_params)),
            #('nn', KerasClassifier(build_fn=build_fn, **keras_params)),
            ('rgf', RGFClassifier(**rgf_params)),
            ('et', ExtraTreesClassifier(**et_params)),
            ('rf', RandomForestClassifier(**rf_params)),
            ('lr', LogisticRegression()),
            ('knn', KNeighborsClassifier())
        ],
        final_estimator=final_estimator
    )

    clf.fit(df['X_train'], df['y_train'])
    predictions = clf.predict(df['X_test'])

StackingClassifier(cv=None,
                   estimators=[('lgb',
                                LGBMClassifier(application='binary',
                                               boosting='dart',
                                               boosting_type='gbdt',
                                               class_weight=None,
                                               colsample_bytree=1.0,
                                               drop_rate=0.15,
                                               feature_fraction=0.7,
                                               importance_type='split',
                                               learning_rate=0.05, max_depth=-1,
                                               metric='binary_logloss',
                                               min_child_samples=20,
                                               min_child_weight=0.001,
                                               min_data_in_leaf=20,
                               

[sklean multiple stacking model] done in 28 s
