# Применение ансамблей моделей 

http://xgboost.readthedocs.io/en/latest/  
http://xgboost.readthedocs.io/en/latest/model.html  
https://lightgbm.readthedocs.io/  
https://lightgbm.readthedocs.io/en/latest/    
https://tech.yandex.com/catboost/doc/dg/concepts/about-docpage/   
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gbm.html#defining-a-gbm-model  

In [1]:
import time
import re
from __future__ import print_function
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer, LabelBinarizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
import lightgbm as lgb

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format



In [2]:
df_train = pd.read_csv('train.csv')
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.46,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.86,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.07,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.13,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.07,,C


In [3]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [4]:
# move target to the right
survived = df_train['Survived']
df_train.drop(labels=['Survived'], axis=1, inplace=True)
df_train['Survived'] = survived
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [5]:
class LabelEncoderPipelineFriendly(LabelEncoder):
    
    def fit(self, X, y=None):
        super(LabelEncoderPipelineFriendly, self).fit(X)
        
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)
    

class FeaturesSum(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return np.sum(X, axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    

class AgeFeature(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X['Initial'] = 0
        for i in X:
            X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.') 
    
        X['Initial'].replace(
            ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
            ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],
            inplace=True
        )
        X.groupby('Initial')['Age'].mean() 

        X.loc[(X.Age.isnull()) & (X.Initial=='Mr'), 'Age'] = 33
        X.loc[(X.Age.isnull()) & (X.Initial=='Mrs'), 'Age'] = 36
        X.loc[(X.Age.isnull()) & (X.Initial=='Master'), 'Age'] = 5
        X.loc[(X.Age.isnull()) & (X.Initial=='Miss'), 'Age'] = 22
        X.loc[(X.Age.isnull()) & (X.Initial=='Other'), 'Age'] = 46
        return X['Age'].as_matrix().reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [6]:
def get_sex_col(df):
    return df[['Sex']]

def get_age_name_cols(df):
    return df[['Age', 'Name']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_sum_cols(df):
    return df[['Age', 'Fare']]

def get_num_cols(df):
    return df[['Fare', 'SibSp', 'Parch']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False),  OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False),  LabelEncoderPipelineFriendly()),
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_age_name_cols, validate=False),  AgeFeature()),
])

In [7]:
x_train = vec.fit_transform(df_train)
x_train.shape

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#

(891, 8)

In [8]:
x_train

array([[ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.        , 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.125     ,
         0.        , 38.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 26.        ],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.33333333, 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 26.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 32.        ]])

In [9]:
y_train = df_train['Survived']
y_train.shape

(891,)

In [10]:
lr = LogisticRegressionCV(cv=10)
lr.fit(x_train, y_train)
lr

LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [11]:
accuracy_score(y_train, lr.predict(x_train))

0.8035914702581369

# Применение модели

In [12]:
def apply_model(model, submission_name):
    x_test = vec.fit_transform(df_test) 
    print('shape of x_test is {}'.format(x_test.shape))
    y_test = model.predict(x_test)
    print('shape of y_test is {}'.format(y_test.shape))
    df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
    df_predicted.to_csv(submission_name + '.csv', sep=',', index=False)

In [13]:
apply_model(lr, 'linear_regression_cv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


shape of x_test is (418, 8)
shape of y_test is (418,)


# Обучение ансамблей

In [13]:
def randomized_cv(model, param_grid, x_train=x_train, y_train=y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    print(grid_search.best_score_)
    return grid_search.best_estimator_

# XGBoost

In [18]:
import xgboost as xgb

param_grid = {
    'max_depth': [2, 3, 4],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.025]
}
xgb = randomized_cv(xgb.XGBClassifier(), param_grid)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


model XGBClassifier best accuracy score is 0.82379349046
time for training is 124.997275114 seconds
0.8237934904601572


In [19]:
apply_model(xgb, 'xgb_cv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


shape of x_test is (418, 8)
shape of y_test is (418,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if diff:


Особенности XGBoost
* есть регуляризация
* распараллеливание
* возможность кастомизации
* обработка отсуствующих значений
* встроенная кросс-валидация
* возможность архивировать и восстанавливать модель

# LightGBM 

In [20]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05]
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


model LGBMClassifier best accuracy score is 0.833894500561
time for training is 105.239722967 seconds
0.8338945005611672


In [21]:
apply_model(model, 'lgb_cv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


shape of x_test is (418, 8)
shape of y_test is (418,)


  if diff:


Особенности

* использование гистограмм для всех признаков (уже тоже есть в xgboost)
* то же самое, но быстрее (см выше)

# H2O GBM 

In [19]:
import h2o
import numpy as np
import math
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init(nthreads=-1, strict_version_check=True)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_151"; OpenJDK Runtime Environment (IcedTea 2.6.11) (7u151-2.6.11-1~deb8u1); OpenJDK 64-Bit Server VM (build 24.151-b01, mixed mode)
  Starting server from /usr/local/lib/python2.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpwh3A5d
  JVM stdout: /tmp/tmpwh3A5d/h2o_stroykova_started_from_python.out
  JVM stderr: /tmp/tmpwh3A5d/h2o_stroykova_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster version:,3.16.0.2
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_stroykova_gl11fm
H2O cluster total nodes:,1
H2O cluster free memory:,3.490 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [52]:
data = np.c_[x_train, y_train]
data = pd.DataFrame(data, columns=['C{}'.format(idx + 1) for idx in range(data.shape[-1])])

train_df_h2o = h2o.H2OFrame(python_obj=data)
train_df_h2o['C9'] = train_df_h2o['C9'].asfactor()

train_df_h2o.show()

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8,C9
0,0,1,1,0.0141511,0.125,0.0,22,0
1,0,0,0,0.139136,0.125,0.0,38,1
0,0,1,0,0.0154686,0.0,0.0,26,1
1,0,0,0,0.103644,0.125,0.0,35,1
0,0,1,1,0.0157126,0.0,0.0,35,0
0,0,1,1,0.0165095,0.0,0.0,33,0
1,0,0,1,0.101229,0.0,0.0,54,0
0,0,1,1,0.0411357,0.375,0.166667,2,0
0,0,1,0,0.0217308,0.0,0.333333,27,1
0,1,0,0,0.0586943,0.125,0.0,14,1


In [64]:
x_test = vec.fit_transform(df_test)
data_test = pd.DataFrame(x_test, columns=['C{}'.format(idx + 1) for idx in range(x_test.shape[-1])])

test_df_h2o = h2o.H2OFrame(python_obj=data_test)
test_df_h2o.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8
0,0,1,1,0.0152816,0.0,0.0,34.5
0,0,1,0,0.0136631,0.125,0.0,47.0
0,1,0,1,0.0189087,0.0,0.0,62.0
0,0,1,1,0.0169081,0.0,0.0,27.0
0,0,1,0,0.0239836,0.125,0.111111,22.0
0,0,1,1,0.018006,0.0,0.0,14.0
0,0,1,0,0.0148912,0.0,0.0,30.0
0,1,0,1,0.0566042,0.125,0.111111,26.0
0,0,1,0,0.0141105,0.0,0.0,18.0
0,0,1,1,0.0471377,0.25,0.0,21.0


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
C1    891 non-null float64
C2    891 non-null float64
C3    891 non-null float64
C4    891 non-null float64
C5    891 non-null float64
C6    891 non-null float64
C7    891 non-null float64
C8    891 non-null float64
C9    891 non-null category
dtypes: category(1), float64(8)
memory usage: 56.7 KB


In [55]:
gbm = H2OGradientBoostingEstimator()
gbm.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)
print(gbm)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1513807303246_113


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0893923322822
RMSE: 0.298985505137
LogLoss: 0.304490577516
Mean Per-Class Error: 0.118301217525
AUC: 0.941251504596
Gini: 0.882503009193
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.479021024668: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,520.0,29.0,0.0528,(29.0/549.0)
1,63.0,279.0,0.1842,(63.0/342.0)
Total,583.0,308.0,0.1033,(92.0/891.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4790210,0.8584615,180.0
max f2,0.2425732,0.8619718,252.0
max f0point5,0.6237757,0.8961593,147.0
max accuracy,0.4790210,0.8967452,180.0
max precision,0.9855009,1.0,0.0
max recall,0.0835401,1.0,352.0
max specificity,0.9855009,1.0,0.0
max absolute_mcc,0.4790210,0.7801801,180.0
max min_per_class_accuracy,0.3418507,0.8684211,226.0


Gains/Lift Table: Avg response rate: 38.38 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101010,0.9733939,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0263158,160.5263158,160.5263158
,2,0.0258137,0.9703743,2.6052632,2.6052632,1.0,1.0,0.0409357,0.0672515,160.5263158,160.5263158
,3,0.0325477,0.9703438,2.6052632,2.6052632,1.0,1.0,0.0175439,0.0847953,160.5263158,160.5263158
,4,0.0426487,0.9694137,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1111111,160.5263158,160.5263158
,5,0.0505051,0.9681475,2.6052632,2.6052632,1.0,1.0,0.0204678,0.1315789,160.5263158,160.5263158
,6,0.1010101,0.9559318,2.6052632,2.6052632,1.0,1.0,0.1315789,0.2631579,160.5263158,160.5263158
,7,0.1503928,0.9279993,2.6052632,2.6052632,1.0,1.0,0.1286550,0.3918129,160.5263158,160.5263158
,8,0.2008979,0.8388662,2.5473684,2.5907086,0.9777778,0.9944134,0.1286550,0.5204678,154.7368421,159.0708615
,9,0.3007856,0.6096997,2.1954465,2.4594462,0.8426966,0.9440299,0.2192982,0.7397661,119.5446481,145.9446190



Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-12-21 01:35:31,0.001 sec,0.0,0.4863193,0.6659120,0.5,1.0,0.6161616
,2017-12-21 01:35:31,0.016 sec,1.0,0.4611267,0.6158597,0.9040680,2.6052632,0.1773288
,2017-12-21 01:35:31,0.022 sec,2.0,0.4407085,0.5774328,0.9021799,2.6052632,0.1739618
,2017-12-21 01:35:31,0.027 sec,3.0,0.4234627,0.5459271,0.9014556,2.6052632,0.1739618
,2017-12-21 01:35:31,0.034 sec,4.0,0.4079521,0.5179697,0.9038097,2.6052632,0.1694725
---,---,---,---,---,---,---,---,---
,2017-12-21 01:35:32,0.378 sec,46.0,0.3013569,0.3085071,0.9393182,2.6052632,0.1066218
,2017-12-21 01:35:32,0.385 sec,47.0,0.3006887,0.3074628,0.9398055,2.6052632,0.1077441
,2017-12-21 01:35:32,0.393 sec,48.0,0.2995177,0.3053450,0.9408041,2.6052632,0.1066218



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C4,330.3993835,1.0,0.4805600
C8,105.9970093,0.3208148,0.1541708
C5,91.3197327,0.2763920,0.1328229
C3,83.0104141,0.2512426,0.1207372
C6,46.7274857,0.1414273,0.0679643
C1,16.5053711,0.0499558,0.0240068
C2,8.2734900,0.0250409,0.0120336
C7,5.2970357,0.0160322,0.0077044





In [60]:
hyper_params = {'max_depth' : range(1,30,2)}

gbm_grid = H2OGradientBoostingEstimator(
        ntrees=10000,
        learn_rate=0.05,
        learn_rate_annealing = 0.99,
        sample_rate = 0.8,
        col_sample_rate = 0.8,
        seed = 1234,
        score_tree_interval = 10, 
        stopping_rounds = 5,
        stopping_metric = "misclassification",
        stopping_tolerance = 1e-4)

grid = H2OGridSearch(gbm_grid, hyper_params,
                         grid_id = 'depth_grid',
                         search_criteria = {'strategy': "RandomDiscrete"})


grid.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [61]:
print(grid)

     max_depth            model_ids              logloss
0           21  depth_grid_model_10  0.20246105669136633
1           25  depth_grid_model_12  0.20261911154719406
2           27  depth_grid_model_13  0.20262775998296126
3           29  depth_grid_model_14  0.20262775998296126
4           23  depth_grid_model_11  0.20273249664306414
5           19   depth_grid_model_9   0.2029692834528766
6           27  depth_grid_model_17  0.20299158583991428
7           29  depth_grid_model_27  0.20299158583991428
8           23  depth_grid_model_21   0.2039713947108158
9           25  depth_grid_model_28  0.20398177460482617
10          21  depth_grid_model_25  0.20431413668073337
11          19  depth_grid_model_18  0.20493492849948586
12          17   depth_grid_model_8  0.20495308387684755
13          17  depth_grid_model_24  0.20495308387684755
14          15   depth_grid_model_7   0.2069009862618864
15          13   depth_grid_model_6  0.21216094436338703
16          13  depth_grid_mode

In [62]:
best_model = h2o.get_model(grid.sorted_metric_table()['model_ids'][0])
best_model

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  depth_grid_model_10


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0559740905509
RMSE: 0.236588441288
LogLoss: 0.202461056691
Mean Per-Class Error: 0.0737225577605
AUC: 0.982093972028
Gini: 0.964187944056
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.400063705216: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,513.0,36.0,0.0656,(36.0/549.0)
1,28.0,314.0,0.0819,(28.0/342.0)
Total,541.0,350.0,0.0718,(64.0/891.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4000637,0.9075145,191.0
max f2,0.2541906,0.9396163,227.0
max f0point5,0.6050285,0.9313725,153.0
max accuracy,0.4855104,0.9304153,172.0
max precision,0.9930546,1.0,0.0
max recall,0.0694526,1.0,331.0
max specificity,0.9930546,1.0,0.0
max absolute_mcc,0.4855104,0.8522752,172.0
max min_per_class_accuracy,0.3561496,0.9234973,196.0


Gains/Lift Table: Avg response rate: 38.38 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101010,0.9901383,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0263158,160.5263158,160.5263158
,2,0.0202020,0.9884602,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0526316,160.5263158,160.5263158
,3,0.0303030,0.9873287,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0789474,160.5263158,160.5263158
,4,0.0404040,0.9860989,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1052632,160.5263158,160.5263158
,5,0.0505051,0.9842767,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1315789,160.5263158,160.5263158
,6,0.1010101,0.9739394,2.6052632,2.6052632,1.0,1.0,0.1315789,0.2631579,160.5263158,160.5263158
,7,0.1515152,0.9339860,2.6052632,2.6052632,1.0,1.0,0.1315789,0.3947368,160.5263158,160.5263158
,8,0.2008979,0.8779594,2.6052632,2.6052632,1.0,1.0,0.1286550,0.5233918,160.5263158,160.5263158
,9,0.3007856,0.6849581,2.4589001,2.5566575,0.9438202,0.9813433,0.2456140,0.7690058,145.8900059,155.6657502



Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-12-21 01:38:56,15.755 sec,0.0,0.4863193,0.6659120,0.5,1.0,0.6161616
,2017-12-21 01:38:56,15.787 sec,10.0,0.3926348,0.4903584,0.9330441,2.6052632,0.1257015
,2017-12-21 01:38:56,15.821 sec,20.0,0.3474203,0.4087974,0.9397815,2.6052632,0.1178451
,2017-12-21 01:38:56,15.858 sec,30.0,0.3229680,0.3619419,0.9462420,2.6052632,0.1156004
,2017-12-21 01:38:56,15.901 sec,40.0,0.3091684,0.3332743,0.9503590,2.6052632,0.1088664
---,---,---,---,---,---,---,---,---
,2017-12-21 01:38:58,17.887 sec,500.0,0.2367757,0.2027238,0.9820540,2.6052632,0.0718294
,2017-12-21 01:38:58,17.933 sec,510.0,0.2367268,0.2026543,0.9820594,2.6052632,0.0718294
,2017-12-21 01:38:59,17.978 sec,520.0,0.2366803,0.2025861,0.9820620,2.6052632,0.0718294



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C8,1075.0728760,1.0,0.3355807
C5,876.1934204,0.8150084,0.2735011
C4,751.6188354,0.6991329,0.2346155
C3,194.5909119,0.1810025,0.0607410
C6,143.7841797,0.1337437,0.0448818
C7,68.9221268,0.0641093,0.0215138
C1,53.2924423,0.0495710,0.0166351
C2,40.1443520,0.0373411,0.0125309




In [77]:
best_model.accuracy()

[[0.48551037759793186, 0.9304152637485971]]

In [65]:
preds = best_model.predict(test_df_h2o)
preds.head()

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,p0,p1
0,0.961502,0.0384981
0,0.842935,0.157065
0,0.832242,0.167758
1,0.486816,0.513184
1,0.588062,0.411938
0,0.918793,0.0812072
0,0.681731,0.318269
0,0.969699,0.0303009
1,0.319127,0.680873
0,0.920627,0.0793725




In [73]:
pred_df = preds.as_data_frame()

submit = pd.DataFrame()
submit['PassengerId'] = df_test['PassengerId']
submit['Survived'] = pred_df['predict']
submit.to_csv('h2o.csv', sep=',', index=False)

# CatBoost

https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/  
https://arxiv.org/pdf/1706.09516.pdf  


In [14]:
from catboost import CatBoostClassifier
param_grid = {
    'iterations': [2, 3, 4, 5],
    'depth': [2, 3, 4, 5],
    'learning_rate': [1, 0.1, 0.01, 0.001]
}
cbm = randomized_cv(CatBoostClassifier(), param_grid)

0:	learn: 0.6874452	total: 54.7ms	remaining: 164ms
1:	learn: 0.6811426	total: 63.9ms	remaining: 63.9ms
2:	learn: 0.6754437	total: 74.8ms	remaining: 24.9ms
3:	learn: 0.6684618	total: 85ms	remaining: 0us
0:	learn: 0.6893545	total: 8.79ms	remaining: 26.4ms
1:	learn: 0.6834289	total: 17.8ms	remaining: 17.8ms
2:	learn: 0.6765625	total: 27.3ms	remaining: 9.08ms
3:	learn: 0.6706602	total: 36.5ms	remaining: 0us
0:	learn: 0.6871043	total: 10.1ms	remaining: 30.2ms
1:	learn: 0.6810131	total: 25ms	remaining: 25ms
2:	learn: 0.6755886	total: 41.3ms	remaining: 13.8ms
3:	learn: 0.6698796	total: 52.4ms	remaining: 0us
0:	learn: 0.6859556	total: 9.78ms	remaining: 29.3ms
1:	learn: 0.6802094	total: 21.4ms	remaining: 21.4ms
2:	learn: 0.6748013	total: 30.2ms	remaining: 10.1ms
3:	learn: 0.6686276	total: 39.5ms	remaining: 0us
0:	learn: 0.6876425	total: 8.2ms	remaining: 24.6ms
1:	learn: 0.6820633	total: 18.1ms	remaining: 18.1ms
2:	learn: 0.6767718	total: 26.9ms	remaining: 8.97ms
3:	learn: 0.6714444	total: 36.1m

model CatBoostClassifier best accuracy score is 0.8114478114478114
time for training is 4.01835036277771 seconds
0.8114478114478114


Параметры модели

https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/

Настройка параметров

https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/

Особенности

* уменьшено (?) переобучение
* умеет обрабатывать категориальные признаки
* большое количество визуализаций
* работает лучше по бенчмаркам (но дольше)

# Подбор гиперпараметров. Общий подход.

* выбрать относительно высокий learning_rate (например, 0.05 - 0.2)
* определить необходимое количество деревьев для исключения проблема недообучения и переобучения - поставить побольше и выбрать такое, где ошибка на валидации начинает расти
* зафиксировать параметры из предыдущих пунктов и настроить параметры, связанные с деревьями.
* зафиксировать параметры деревьев и дополнительно настроить learning_rate и количество деревьев

Основные параметры, связанные с бустингом

* learning_rate
* n_estimators
* subsample
* loss

Основные параметры, связанные с деревьями

* max_depth
* max_features
* min_samples_split
* min_samples_leaf
* max_leaf_nodes
* ...

In [15]:
import os
data_path = "/media/d_2000/data/hcdr/"
train = os.path.join(data_path, "application_train.csv")
test = os.path.join(data_path, "application_test.csv")

In [16]:
df = pd.read_csv(train)

In [17]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
y = df['TARGET']

In [57]:
columns = list(df.columns.values)
columns.remove('SK_ID_CURR')
columns.remove('TARGET')

In [58]:
columns

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 

In [59]:
x = df[columns].values

In [60]:
y.unique()

array([1, 0])

In [50]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.02,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.08,0.26,0.14,0.02,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.63,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,reg oper account,block of flats,0.01,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.0,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.31,0.62,,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.05,0.0,0.01,0.09,0.05,0.99,0.8,0.05,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.0,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.01,reg oper account,block of flats,0.07,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.01,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.56,0.73,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.01,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.03,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.32,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
categorical = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

### CatBoost

In [79]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(eval_metric='AUC')
#clf.fit(x, y, cat_features=[0,1,2,3, 9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

In [80]:
dfcb = pd.DataFrame(df)


In [81]:
for c in categorical:
    dfcb[c] = dfcb[c].astype(str)

In [82]:
x = dfcb[columns].values

In [83]:
clf.fit(x, y, cat_features=[0,1,2,3, 9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

0:	learn: 0.7004769	total: 1.4s	remaining: 23m 16s
1:	learn: 0.7119360	total: 2.8s	remaining: 23m 17s
2:	learn: 0.7154135	total: 4.28s	remaining: 23m 41s
3:	learn: 0.7165532	total: 5.58s	remaining: 23m 8s
4:	learn: 0.7181569	total: 6.97s	remaining: 23m 6s
5:	learn: 0.7187070	total: 8.25s	remaining: 22m 46s
6:	learn: 0.7215420	total: 9.72s	remaining: 22m 58s
7:	learn: 0.7232822	total: 11s	remaining: 22m 48s
8:	learn: 0.7231418	total: 12.5s	remaining: 22m 53s
9:	learn: 0.7231141	total: 13.8s	remaining: 22m 44s
10:	learn: 0.7241536	total: 15.1s	remaining: 22m 37s
11:	learn: 0.7237101	total: 16.5s	remaining: 22m 40s
12:	learn: 0.7248131	total: 17.8s	remaining: 22m 34s
13:	learn: 0.7244707	total: 19.1s	remaining: 22m 25s
14:	learn: 0.7251753	total: 20.4s	remaining: 22m 22s
15:	learn: 0.7265652	total: 21.9s	remaining: 22m 24s
16:	learn: 0.7273139	total: 23.2s	remaining: 22m 21s
17:	learn: 0.7283074	total: 24.6s	remaining: 22m 20s
18:	learn: 0.7285984	total: 26s	remaining: 22m 20s
19:	learn: 

154:	learn: 0.7541896	total: 3m 35s	remaining: 19m 33s
155:	learn: 0.7542414	total: 3m 36s	remaining: 19m 31s
156:	learn: 0.7543048	total: 3m 37s	remaining: 19m 29s
157:	learn: 0.7544067	total: 3m 39s	remaining: 19m 28s
158:	learn: 0.7544965	total: 3m 40s	remaining: 19m 26s
159:	learn: 0.7545486	total: 3m 41s	remaining: 19m 24s
160:	learn: 0.7546036	total: 3m 43s	remaining: 19m 23s
161:	learn: 0.7547224	total: 3m 44s	remaining: 19m 22s
162:	learn: 0.7547570	total: 3m 46s	remaining: 19m 21s
163:	learn: 0.7548141	total: 3m 47s	remaining: 19m 19s
164:	learn: 0.7548907	total: 3m 48s	remaining: 19m 17s
165:	learn: 0.7549702	total: 3m 50s	remaining: 19m 16s
166:	learn: 0.7550347	total: 3m 51s	remaining: 19m 15s
167:	learn: 0.7550881	total: 3m 53s	remaining: 19m 14s
168:	learn: 0.7551070	total: 3m 54s	remaining: 19m 12s
169:	learn: 0.7551491	total: 3m 55s	remaining: 19m 11s
170:	learn: 0.7552303	total: 3m 57s	remaining: 19m 10s
171:	learn: 0.7552682	total: 3m 58s	remaining: 19m 8s
172:	learn:

305:	learn: 0.7614659	total: 7m 1s	remaining: 15m 56s
306:	learn: 0.7614887	total: 7m 3s	remaining: 15m 55s
307:	learn: 0.7615116	total: 7m 4s	remaining: 15m 54s
308:	learn: 0.7615481	total: 7m 6s	remaining: 15m 53s
309:	learn: 0.7615844	total: 7m 7s	remaining: 15m 51s
310:	learn: 0.7616280	total: 7m 8s	remaining: 15m 50s
311:	learn: 0.7616477	total: 7m 10s	remaining: 15m 48s
312:	learn: 0.7616811	total: 7m 11s	remaining: 15m 47s
313:	learn: 0.7617149	total: 7m 13s	remaining: 15m 46s
314:	learn: 0.7617355	total: 7m 14s	remaining: 15m 44s
315:	learn: 0.7617588	total: 7m 15s	remaining: 15m 43s
316:	learn: 0.7617877	total: 7m 17s	remaining: 15m 41s
317:	learn: 0.7618169	total: 7m 18s	remaining: 15m 40s
318:	learn: 0.7618712	total: 7m 19s	remaining: 15m 39s
319:	learn: 0.7618911	total: 7m 21s	remaining: 15m 37s
320:	learn: 0.7619133	total: 7m 22s	remaining: 15m 36s
321:	learn: 0.7619401	total: 7m 23s	remaining: 15m 34s
322:	learn: 0.7619533	total: 7m 25s	remaining: 15m 33s
323:	learn: 0.76

455:	learn: 0.7651000	total: 10m 38s	remaining: 12m 41s
456:	learn: 0.7651179	total: 10m 39s	remaining: 12m 40s
457:	learn: 0.7651449	total: 10m 41s	remaining: 12m 38s
458:	learn: 0.7651750	total: 10m 42s	remaining: 12m 37s
459:	learn: 0.7651926	total: 10m 44s	remaining: 12m 36s


KeyboardInterrupt: 