In [17]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [18]:
train=pd.read_excel('../SB_test/train.xlsx')
test=pd.read_excel('../SB_test/test.xlsx')

In [19]:
train['y']=train.y.apply(lambda x: 0 if x=='no' else 1)

In [20]:
train=train.replace('unknown', np.nan)

In [21]:
train=train.dropna()

In [22]:
del train['default']

In [23]:
train['housing']=train.housing.apply(lambda x: 0 if x=='no' else 1)

In [24]:
train['loan']=train.loan.apply(lambda x: 0 if x=='no' else 1)

In [25]:
train['contact']=train.contact.astype('category').cat.codes

In [26]:
train['month']=train.month.astype('category').cat.codes
train['day_of_week']=train.day_of_week.astype('category').cat.codes

In [27]:
X_train=pd.get_dummies(train)

In [28]:
y_train=X_train['y']

In [29]:
del X_train['y']

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [41]:
ones_ratio=y_train[y_train==1].shape[0]*1.0/y_train[y_train==0].shape[0]

param_grid = {
    # параметры ансамбля
    'n_estimators': [10, 30, 50, 100, 200, 400, 600, 1000],
    'learning_rate': [0.1],
    
    # параметры дерева
    'max_depth': [5],
    'min_child_weight': [2],
    'gamma': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'scale_pos_weight': [ones_ratio],
    
    # параметры регуляризации
    'reg_alpha': [0.0],
    'reg_lambda': [1.0]
}


clf = XGBClassifier()
gs = GridSearchCV(clf, param_grid, scoring='accuracy', cv=4)

gs.fit(X_train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (ACC): ', gs.best_score_)
print('Best params: ')
best_params

Best score (ACC):  0.894142259414
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 2,
 'missing': None,
 'n_estimators': 400,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [42]:
param_grid = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

clf = XGBClassifier(**best_params) # в качестве отправной точки возьмём модель с наилучшими параметрами предыдущего шага

gs = GridSearchCV(clf, param_grid, scoring='accuracy', cv=4)

gs.fit(X_train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (ACC): ', gs.best_score_)
print('Best params: ')
best_params

Best score (ACC):  0.897489539749
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 400,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [43]:
param_grid = {
    'gamma': [0.1*i for i in range(6)]
}

clf = XGBClassifier(**best_params)

gs = GridSearchCV(clf, param_grid, scoring='accuracy', cv=4)

gs.fit(X_train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (ACC): ', gs.best_score_)
print('Best params: ')
best_params

Best score (ACC):  0.897489539749
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 400,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [44]:
param_grid = {
    'subsample': [0.5 + 0.1*i for i in range(6)],
    'colsample_bytree': [0.5 + 0.1*i for i in range(6)]
}

clf = XGBClassifier(**best_params)

gs = GridSearchCV(clf, param_grid, scoring='accuracy', cv=4)

gs.fit(X_train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (ACC): ', gs.best_score_)
print('Best params: ')
best_params

Best score (ACC):  0.897489539749
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 400,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0.0,
 'reg_lambda': 1.0,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [45]:
param_grid = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}

clf = XGBClassifier(**best_params)

gs = GridSearchCV(clf, param_grid, scoring='accuracy', cv=4)

gs.fit(X_train, y_train)
best_params = gs.best_estimator_.get_params()
print('Best score (ACC): ', gs.best_score_)
print('Best params: ')
best_params

Best score (ACC):  0.897489539749
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 400,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 1e-05,
 'reg_lambda': 1,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [46]:
clf = XGBClassifier(**best_params)
best_n_estimators = clf.get_params()['n_estimators'] # возьмём наилучшие значения n_estimators с предыдущего шага
best_learning_rate = best_params['learning_rate'] # аналогичная запись
invariant_composition = best_n_estimators * best_learning_rate
n_estimators_range = [10, 30, 100, 200, 400, 600, 800, 1000]

best_score = gs.best_score_ # возьмём наилучшее качество с предыдущего шага

for n_estimators in n_estimators_range:
    learning_rate = invariant_composition / n_estimators
    clf.set_params(n_estimators=n_estimators, learning_rate=learning_rate)
    aucs = []
    for train_idx, test_idx in cv.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]
        clf.fit(X_train_fold, y_train_fold)
        preds = clf.predict(X_test_fold)
        auc = accuracy_score(y_test_fold, preds)
        aucs.append(auc)
    auc = np.mean(aucs)
    if auc > best_score:
        best_n_estimators = n_estimators
        best_learning_rate = learning_rate
        best_score = auc
        
best_params['n_estimators'] = best_n_estimators
best_params['learning_rate'] = best_learning_rate

print('Best score (AUC): ', best_score)
print('Best params: ')
best_params

Best score (AUC):  0.897907178031
Best params: 


{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 800,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 1e-05,
 'reg_lambda': 1,
 'scale_pos_weight': 0.13918017159199236,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [47]:
for feature_name, feature_importance in zip(X_train.columns, clf.feature_importances_):
    print('Feature: "%s"\tFeature importance: %.4f' % (feature_name, feature_importance))

Feature: "age"	Feature importance: 0.1342
Feature: "housing"	Feature importance: 0.0205
Feature: "loan"	Feature importance: 0.0073
Feature: "contact"	Feature importance: 0.0101
Feature: "month"	Feature importance: 0.0496
Feature: "day_of_week"	Feature importance: 0.0621
Feature: "duration"	Feature importance: 0.2283
Feature: "campaign"	Feature importance: 0.0401
Feature: "pdays"	Feature importance: 0.0144
Feature: "previous"	Feature importance: 0.0106
Feature: "emp.var.rate"	Feature importance: 0.0237
Feature: "cons.price.idx"	Feature importance: 0.0589
Feature: "cons.conf.idx"	Feature importance: 0.0619
Feature: "euribor3m"	Feature importance: 0.1368
Feature: "nr.employed"	Feature importance: 0.0228
Feature: "job_admin."	Feature importance: 0.0139
Feature: "job_blue-collar"	Feature importance: 0.0080
Feature: "job_entrepreneur"	Feature importance: 0.0000
Feature: "job_housemaid"	Feature importance: 0.0000
Feature: "job_management"	Feature importance: 0.0004
Feature: "job_retired"	Feat