**Reference**

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

https://www.kaggle.com/hiro5299834/tps-apr-2021-voting-pseudo-labeling

https://www.kaggle.com/alexryzhkov/n3-tps-april-21-lightautoml-starter

In [None]:
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder

import numpy as np
import xgboost as xgb
from xgboost import plot_importance
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default size (width, height) for every matplotlib figure created below.
rcParams['figure.figsize'] = 12, 4

# Column names used throughout the notebook: the label to predict and the
# row identifier that must never be used as a feature.
target = 'Survived'
IDcol = 'PassengerId'

In [None]:
# Load the training CSV and preview its first rows.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
train.head()

In [None]:
# Load the test CSV and preview its first rows.
test = pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv")
test.head()

In [None]:
def label_encoder(c):
    """Encode the values of ``c`` as integer labels.

    A fresh ``LabelEncoder`` is fitted on ``c`` on every call, so the codes are
    only meaningful within the column that was passed in.
    """
    return LabelEncoder().fit_transform(c)

def feature_engineering(df):
    """Turn a raw passenger frame into the numeric feature frame used for modelling.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      * Age: missing values are replaced by the mean Age of the row's Pclass.
      * Embarked: missing values become 'X'.
      * Fare: missing values are replaced by the median Fare of the row's
        Pclass; anything still missing gets the overall mean Fare.
      * Cabin: reduced to the first character of ``str(value)``, lower-cased.
        A missing cabin is rendered as 'nan' and therefore becomes 'n'.
      * Ticket: the first whitespace-separated token when the ticket has more
        than one token, otherwise 'X'. The result is label-encoded into
        ``Ticket_lb``.
      * Pclass, Sex, Cabin, Embarked: passed through ``pd.get_dummies`` and
        joined back with an ``_oh`` suffix.
      * Name: split on ','. The number of rows sharing the part before the
        comma is stored in ``Counter_FirstName`` and the number sharing the
        part after it in ``Counter_Surname``.
      * The raw Name, Ticket, Pclass, Sex, Cabin and Embarked columns and the
        temporary name parts are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Age, Pclass, Embarked, Fare, Cabin, Ticket, Name, Sex and
        PassengerId columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns; every other input column
        (e.g. PassengerId, Survived, SibSp, Parch) is carried over unchanged.

    Raises
    ------
    IndexError
        If a Name value contains no ','.
    """
    # Work on a copy. The assignments below would otherwise write into the
    # caller's frame, and the Ticket rule is not idempotent: applied twice it
    # maps every ticket to 'X', so re-running the calling cell silently
    # produced different features.
    data = df.copy()

    # Age: fill with the mean age of the passenger's class.
    age_map = data[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    data['Age'] = data['Age'].fillna(data['Pclass'].map(age_map['Age']))

    data['Embarked'] = data['Embarked'].fillna('X')

    # Fare: fill with the median fare of the passenger's class.
    fare_map = data[['Fare', 'Pclass']].dropna().groupby('Pclass').median().to_dict()
    data['Fare'] = data['Fare'].fillna(data['Pclass'].map(fare_map['Fare']))

    # Cabin: keep the deck letter only. NaN is stringified to 'nan' -> 'n'.
    data['Cabin'] = data['Cabin'].map(lambda x: str(x)[0].strip().lower())

    # Ticket: keep the prefix of multi-token tickets; single-token tickets
    # (including NaN, stringified to 'nan') become 'X'.
    data['Ticket'] = data['Ticket'].map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X'
    )

    # Fare: fallback for rows the per-class median could not fill.
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    label_cols = ['Ticket']
    label_encoded_df = data[label_cols].apply(label_encoder)
    data = data.join(label_encoded_df.add_suffix("_lb"))

    # NOTE(review): with default arguments get_dummies only expands
    # object/string/category columns, so a numeric Pclass presumably passes
    # through as a single 'Pclass_oh' column rather than being one-hot
    # encoded - confirm this is intended.
    onehot_features = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    cat_oh = pd.get_dummies(data[onehot_features])
    data = data.join(cat_oh.add_suffix("_oh"))

    # NOTE(review): 'FirstName' holds the text before the comma and 'Surname'
    # the text after it; if names are written "Surname, Given" the labels are
    # swapped. The column names are kept so the output schema is unchanged.
    data['FirstName'] = data['Name'].map(lambda x: str(x).split(',')[0])
    data['Surname'] = data['Name'].map(lambda x: str(x).split(',')[1])

    # Count encoding: how many rows share the same name part.
    count_encode_features = ["FirstName", "Surname"]
    for col in count_encode_features:
        data['Counter_' + col] = data[col].map(data.groupby(col)['PassengerId'].count().to_dict())

    # Drop the raw columns that have been replaced by encoded versions.
    return data.drop(columns=count_encode_features + ["Name"] + label_cols + onehot_features)

In [None]:
final_train_data = feature_engineering(train)
final_test_data = feature_engineering(test)

# The two frames are encoded independently, so a category present in only one
# of them yields a dummy column the other lacks. Add such columns filled with
# 0, except the label, which must never be added to the test frame.
# Iterate in sorted order: plain set iteration over strings can change between
# interpreter runs, which would change the order of the appended columns and
# with it any column-order-dependent results downstream.
for col in sorted(set(final_train_data.columns).symmetric_difference(final_test_data.columns)):
    if col not in final_train_data:
        final_train_data[col] = 0
    elif col != 'Survived':
        final_test_data[col] = 0

In [None]:
# Preview the engineered training features.
final_train_data.head()

In [None]:
# Use every column except the label and the row identifier as a predictor.
predictors = [x for x in final_train_data.columns if x not in [target, IDcol]]

# Oversample with SMOTE (sampling_strategy=1.0) and replace train_x / train_y
# with the resampled data used by the tuning cells below.
# NOTE(review): the resampling is applied to the whole training frame before
# any cross-validation split. The CV folds built later from train_x / train_y
# (xgb.cv inside modelfit, GridSearchCV) can therefore contain synthetic rows
# derived from rows that sit in other folds, so those CV scores are likely
# optimistic. kfold_cv further down resamples inside each fold instead.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42, sampling_strategy=1.0)
train_x = final_train_data[predictors]
train_y = final_train_data[target]
train_x, train_y = sm.fit_resample(train_x, train_y)

In [None]:
def modelfit(alg, X, Y,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on (X, Y) and print a report of its fit on that same data.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with the estimator's
    own booster parameters and AUC as the metric; the number of rows in the
    returned CV history then replaces ``n_estimators`` on ``alg``.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to tune and fit; it is modified in place.
    X, Y : objects exposing ``.values``
        Features and labels.
    useTrainCV : bool
        Whether to pick ``n_estimators`` with ``xgb.cv`` before fitting.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    The printed AUC and classification report are computed on the data the
    model was fitted on. A feature-importance plot is drawn at the end.
    """
    if useTrainCV:
        cv_history = xgb.cv(
            alg.get_xgb_params(),
            xgb.DMatrix(X.values, label=Y.values),
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        # One row per boosting round kept by the CV run.
        alg.set_params(n_estimators=cv_history.shape[0])

    # Fit on the data that was passed in.
    alg.fit(X, Y, eval_metric='auc')

    # Score the same data the model was fitted on.
    predicted_labels = alg.predict(X)
    positive_proba = alg.predict_proba(X)[:, 1]

    print("\nModel Report")
    print("AUC Score (Train): %f" % metrics.roc_auc_score(Y, positive_proba))
    print(metrics.classification_report(Y.values, predicted_labels))

    plot_importance(alg)

In [None]:
# Baseline model: starting hyper-parameters before any grid search.
xgb1_params = dict(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_params)

# Choose n_estimators by CV, fit, and print the training report.
modelfit(xgb1, train_x, train_y)

In [None]:
# Step 1: coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': list(range(7, 12, 2)),
    'min_child_weight': list(range(1, 6, 2)),
}
gsearch1_estimator = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
gsearch1 = GridSearchCV(
    estimator=gsearch1_estimator,
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

# The fit is left commented out; the output recorded from an earlier run follows.
# gsearch1.fit(train_x,train_y)
# gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

```
# Output
({'mean_fit_time': array([272.11131663, 260.83970675, 255.30913534, 383.37809596,
         357.81840076, 351.80142932, 504.15682721, 465.49360337,
         375.60504947]),
  'std_fit_time': array([  1.67645035,   3.85691721,   4.24622363,   2.84621593,
           7.08090777,   6.36768253,   7.58772393,   6.50357653,
         104.76211844]),
  'mean_score_time': array([0.33333292, 0.31011777, 0.32633858, 0.44162273, 0.42505212,
         0.4127224 , 0.56222205, 0.53651485, 0.35870881]),
  'std_score_time': array([0.01749353, 0.01518372, 0.02228676, 0.02101094, 0.03439906,
         0.03042925, 0.02745548, 0.04289937, 0.15277742]),
  'param_max_depth': masked_array(data=[7, 7, 7, 9, 9, 9, 11, 11, 11],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[1, 3, 5, 1, 3, 5, 1, 3, 5],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'max_depth': 7, 'min_child_weight': 1},
   {'max_depth': 7, 'min_child_weight': 3},
   {'max_depth': 7, 'min_child_weight': 5},
   {'max_depth': 9, 'min_child_weight': 1},
   {'max_depth': 9, 'min_child_weight': 3},
   {'max_depth': 9, 'min_child_weight': 5},
   {'max_depth': 11, 'min_child_weight': 1},
   {'max_depth': 11, 'min_child_weight': 3},
   {'max_depth': 11, 'min_child_weight': 5}],
  'split0_test_score': array([0.84867964, 0.848416  , 0.84871055, 0.84510666, 0.84536172,
         0.84576957, 0.84040074, 0.84145019, 0.84226035]),
  'split1_test_score': array([0.85657295, 0.85642851, 0.85667076, 0.85234729, 0.85223843,
         0.85300753, 0.84902852, 0.84786708, 0.84964006]),
  'split2_test_score': array([0.8487202 , 0.84836042, 0.84830478, 0.84538275, 0.84568105,
         0.84593315, 0.84224065, 0.8418523 , 0.84326009]),
  'split3_test_score': array([0.88349159, 0.88343288, 0.88293238, 0.88192533, 0.88170851,
         0.88165236, 0.87945831, 0.87988275, 0.87916737]),
  'split4_test_score': array([0.95772282, 0.9572616 , 0.95553898, 0.95985901, 0.95927461,
         0.95671505, 0.96216332, 0.96187734, 0.9586757 ]),
  'mean_test_score': array([0.87903744, 0.87877988, 0.87843149, 0.87692421, 0.87685286,
         0.87661553, 0.87465831, 0.87458593, 0.87460071]),
  'std_test_score': array([0.04136814, 0.04129873, 0.04057168, 0.0436221 , 0.04332705,
         0.04216995, 0.04595985, 0.04589382, 0.04413757]),
  'rank_test_score': array([1, 2, 3, 4, 5, 6, 7, 9, 8], dtype=int32)},
 {'max_depth': 7, 'min_child_weight': 1},
 0.8790374408410356)
```

In [None]:
# Step 2: finer grid around the best depth / child weight from step 1.
param_test2 = {
    'max_depth': [6, 7, 8],
    'min_child_weight': [0, 1, 2],
}
gsearch2_estimator = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
gsearch2 = GridSearchCV(
    estimator=gsearch2_estimator,
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

# The fit is left commented out; the output recorded from an earlier run follows.
# gsearch2.fit(train_x,train_y)
# gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

```
# Output
({'mean_fit_time': array([227.09031415, 220.99125409, 215.32283955, 285.30846672,
         269.19676309, 261.3322155 , 349.79917583, 324.81722174,
         262.03395381]),
  'std_fit_time': array([ 2.69721289,  4.69311073,  2.06245283,  3.38722253,  4.66644057,
          2.80817646,  5.50213699,  3.29204117, 82.70124183]),
  'mean_score_time': array([0.29869275, 0.28568945, 0.28238034, 0.34796915, 0.34466705,
         0.33007789, 0.38365641, 0.37955399, 0.25225396]),
  'std_score_time': array([0.0094437 , 0.04343774, 0.04030966, 0.01947428, 0.01948299,
         0.0313189 , 0.02556979, 0.0241019 , 0.11835266]),
  'param_max_depth': masked_array(data=[6, 6, 6, 7, 7, 7, 8, 8, 8],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[0, 1, 2, 0, 1, 2, 0, 1, 2],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'max_depth': 6, 'min_child_weight': 0},
   {'max_depth': 6, 'min_child_weight': 1},
   {'max_depth': 6, 'min_child_weight': 2},
   {'max_depth': 7, 'min_child_weight': 0},
   {'max_depth': 7, 'min_child_weight': 1},
   {'max_depth': 7, 'min_child_weight': 2},
   {'max_depth': 8, 'min_child_weight': 0},
   {'max_depth': 8, 'min_child_weight': 1},
   {'max_depth': 8, 'min_child_weight': 2}],
  'split0_test_score': array([0.84904146, 0.84927021, 0.84904328, 0.84811467, 0.84867964,
         0.84829193, 0.84648884, 0.84660514, 0.84743688]),
  'split1_test_score': array([0.85794388, 0.85740102, 0.857815  , 0.85660939, 0.85657295,
         0.85673058, 0.85468494, 0.85482016, 0.85429351]),
  'split2_test_score': array([0.84948941, 0.84950027, 0.84965347, 0.84857447, 0.8487202 ,
         0.84854944, 0.84643161, 0.84677379, 0.84733819]),
  'split3_test_score': array([0.88332711, 0.88303194, 0.88321182, 0.88311222, 0.88349159,
         0.88291708, 0.88255384, 0.88267127, 0.88265084]),
  'split4_test_score': array([0.95515361, 0.95474342, 0.95493367, 0.9576826 , 0.95772282,
         0.95719749, 0.95898348, 0.95833289, 0.95923679]),
  'mean_test_score': array([0.87899109, 0.87878937, 0.87893145, 0.87881867, 0.87903744,
         0.8787373 , 0.87782854, 0.87784065, 0.87819124]),
  'std_test_score': array([0.04007454, 0.03993423, 0.03997766, 0.04144552, 0.04136814,
         0.04122134, 0.04268963, 0.04236292, 0.04256081]),
  'rank_test_score': array([2, 5, 3, 4, 1, 6, 9, 8, 7], dtype=int32)},
 {'max_depth': 7, 'min_child_weight': 1},
 0.8790374408410356)
```

In [None]:
# Step 3: tune gamma with max_depth=7 and min_child_weight=1 fixed.
param_test3 = {
    'gamma': [step / 10.0 for step in range(5)],
}
gsearch3_estimator = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=7,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
gsearch3 = GridSearchCV(
    estimator=gsearch3_estimator,
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# The fit is left commented out; the output recorded from an earlier run follows.
# gsearch3.fit(train_x,train_y)
# gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

```
# Output
({'mean_fit_time': array([269.09461918, 269.72219725, 270.3569695 , 270.25370035,
         223.87595577]),
  'std_fit_time': array([ 2.24053036,  3.67366006,  2.23530612,  1.95032911, 97.76524852]),
  'mean_score_time': array([0.33340855, 0.37081723, 0.32860589, 0.32713428, 0.25316916]),
  'std_score_time': array([0.03999725, 0.04447045, 0.02231321, 0.02351676, 0.11302583]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4}],
  'split0_test_score': array([0.84867964, 0.84828056, 0.84803939, 0.84876785, 0.84823407]),
  'split1_test_score': array([0.85657295, 0.85668415, 0.85690292, 0.85684749, 0.85700608]),
  'split2_test_score': array([0.8487202 , 0.84868282, 0.84862791, 0.84821527, 0.8483266 ]),
  'split3_test_score': array([0.88349159, 0.88380635, 0.88333698, 0.88342794, 0.88289486]),
  'split4_test_score': array([0.95772282, 0.95780495, 0.95750645, 0.95747198, 0.95880465]),
  'mean_test_score': array([0.87903744, 0.87905177, 0.87888273, 0.87894611, 0.87905325]),
  'std_test_score': array([0.04136814, 0.0414587 , 0.04135531, 0.04130341, 0.04184512]),
  'rank_test_score': array([3, 2, 5, 4, 1], dtype=int32)},
 {'gamma': 0.4},
 0.8790532516640599)
```

In [None]:
# Step 4: coarse grid over row and column subsampling with gamma=0.4 fixed.
param_test4 = {
    'colsample_bytree': [tenths / 10.0 for tenths in (7, 8)],
    'subsample': [tenths / 10.0 for tenths in (7, 8)],
}
gsearch4_estimator = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=7,
    min_child_weight=1,
    gamma=0.4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
gsearch4 = GridSearchCV(
    estimator=gsearch4_estimator,
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# The fit is left commented out; the output recorded from an earlier run follows.
# gsearch4.fit(train_x,train_y)
# gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

```
# Output
({'mean_fit_time': array([264.73518562, 258.43494062, 273.45296659, 268.5024518 ]),
  'std_fit_time': array([5.60000943, 5.07955074, 4.58229505, 5.13243651]),
  'mean_score_time': array([0.32740898, 0.33808775, 0.33355627, 0.23618708]),
  'std_score_time': array([0.02487408, 0.02665582, 0.01449516, 0.10075778]),
  'param_colsample_bytree': masked_array(data=[0.7, 0.7, 0.8, 0.8],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'param_subsample': masked_array(data=[0.7, 0.8, 0.7, 0.8],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'colsample_bytree': 0.7, 'subsample': 0.7},
   {'colsample_bytree': 0.7, 'subsample': 0.8},
   {'colsample_bytree': 0.8, 'subsample': 0.7},
   {'colsample_bytree': 0.8, 'subsample': 0.8}],
  'split0_test_score': array([0.84840981, 0.84907788, 0.84871567, 0.84823407]),
  'split1_test_score': array([0.85602778, 0.85674356, 0.85591018, 0.85700608]),
  'split2_test_score': array([0.84843902, 0.84850159, 0.84790089, 0.8483266 ]),
  'split3_test_score': array([0.88294695, 0.88357348, 0.88313872, 0.88289486]),
  'split4_test_score': array([0.9566038 , 0.95690679, 0.95625421, 0.95880465]),
  'mean_test_score': array([0.87848547, 0.87896066, 0.87838393, 0.87905325]),
  'std_test_score': array([0.04107082, 0.04101515, 0.04098995, 0.04184512]),
  'rank_test_score': array([3, 2, 4, 1], dtype=int32)},
 {'colsample_bytree': 0.8, 'subsample': 0.8},
 0.8790532516640599)
```

In [None]:
# Step 5: finer grid (steps of 0.05) over row and column subsampling.
param_test5 = {
    'colsample_bytree': [pct / 100.0 for pct in (75, 80, 85)],
    'subsample': [pct / 100.0 for pct in (75, 80, 85)],
}
gsearch5_estimator = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=200,
    max_depth=7,
    min_child_weight=1,
    gamma=0.4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    use_label_encoder=False,
    nthread=4,
    seed=27,
)
gsearch5 = GridSearchCV(
    estimator=gsearch5_estimator,
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
# The fit is left commented out; the output recorded from an earlier run follows.
# gsearch5.fit(train_x,train_y)
# gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

```
# Output
({'mean_fit_time': array([270.72046261, 266.66264801, 264.71653447, 273.68775826,
         270.43560157, 269.8571672 , 275.6143332 , 275.91246095,
         219.58569078]),
  'std_fit_time': array([ 4.58014369,  2.719242  ,  3.78624711,  2.84064368,  4.42293908,
          3.81819455,  5.21964478,  2.58611285, 91.04814611]),
  'mean_score_time': array([0.34822335, 0.35122457, 0.39520702, 0.33346424, 0.32467709,
         0.32508726, 0.30490289, 0.32646446, 0.22578835]),
  'std_score_time': array([0.02182779, 0.01044297, 0.04105881, 0.01235056, 0.03607853,
         0.01665893, 0.04933165, 0.01546562, 0.09601044]),
  'param_colsample_bytree': masked_array(data=[0.75, 0.75, 0.75, 0.8, 0.8, 0.8, 0.85, 0.85, 0.85],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_subsample': masked_array(data=[0.75, 0.8, 0.85, 0.75, 0.8, 0.85, 0.75, 0.8, 0.85],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'colsample_bytree': 0.75, 'subsample': 0.75},
   {'colsample_bytree': 0.75, 'subsample': 0.8},
   {'colsample_bytree': 0.75, 'subsample': 0.85},
   {'colsample_bytree': 0.8, 'subsample': 0.75},
   {'colsample_bytree': 0.8, 'subsample': 0.8},
   {'colsample_bytree': 0.8, 'subsample': 0.85},
   {'colsample_bytree': 0.85, 'subsample': 0.75},
   {'colsample_bytree': 0.85, 'subsample': 0.8},
   {'colsample_bytree': 0.85, 'subsample': 0.85}],
  'split0_test_score': array([0.84906952, 0.84858363, 0.84901598, 0.84784835, 0.84823407,
         0.84892459, 0.84841866, 0.84765768, 0.84861848]),
  'split1_test_score': array([0.85600722, 0.85670832, 0.8566435 , 0.85567189, 0.85700608,
         0.85622788, 0.85555861, 0.85637197, 0.85614243]),
  'split2_test_score': array([0.84909758, 0.84905184, 0.84908512, 0.8486399 , 0.8483266 ,
         0.84817649, 0.84904721, 0.84863791, 0.84851062]),
  'split3_test_score': array([0.88300251, 0.88316477, 0.88381258, 0.88269211, 0.88289486,
         0.88342418, 0.88358259, 0.88357333, 0.88322687]),
  'split4_test_score': array([0.95711203, 0.957082  , 0.95796146, 0.95791735, 0.95880465,
         0.95836932, 0.95652797, 0.95694413, 0.95764367]),
  'mean_test_score': array([0.87885777, 0.87891811, 0.87930373, 0.87855392, 0.87905325,
         0.87902449, 0.87862701, 0.878637  , 0.87882841]),
  'std_test_score': array([0.0410757 , 0.04106843, 0.04135545, 0.041658  , 0.04184512,
         0.04169472, 0.04101897, 0.04125959, 0.04141897]),
  'rank_test_score': array([5, 4, 1, 9, 2, 3, 8, 7, 6], dtype=int32)},
 {'colsample_bytree': 0.75, 'subsample': 0.85},
 0.8793037298289728)
```

In [None]:
# Step 6: tune L1 regularisation (reg_alpha) on a log-spaced grid, with the
# tree and sampling parameters fixed at the values chosen by earlier searches.
#
# subsample / colsample_bytree: the best setting recorded for gsearch5 is
# {'colsample_bytree': 0.75, 'subsample': 0.85}. This cell used to pass the two
# values transposed (subsample=0.75, colsample_bytree=0.85). The output
# recorded below this cell was produced with that transposed setting (its
# score for reg_alpha=1e-05, 0.87862701, equals gsearch5's score for
# colsample_bytree=0.85 / subsample=0.75), so re-run the search before relying
# on those numbers.
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.1,
        n_estimators=200,
        max_depth=7,
        min_child_weight=1,
        gamma=0.4,
        subsample=0.85,
        colsample_bytree=0.75,
        use_label_encoder=False,
        objective= 'binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27
    ),
    param_grid = param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5
)
# gsearch6.fit(train_x,train_y)
# gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_

```
# Output
({'mean_fit_time': array([274.27435656, 272.03669429, 275.16715016, 278.57269487,
         166.68242221]),
  'std_fit_time': array([ 4.90697192,  2.53208601,  2.60412937,  2.64154721, 71.90902853]),
  'mean_score_time': array([0.3345221 , 0.33835368, 0.353792  , 0.32983398, 0.23151541]),
  'std_score_time': array([0.00992549, 0.01864636, 0.02415058, 0.03667023, 0.10200377]),
  'param_reg_alpha': masked_array(data=[1e-05, 0.01, 0.1, 1, 100],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_alpha': 1e-05},
   {'reg_alpha': 0.01},
   {'reg_alpha': 0.1},
   {'reg_alpha': 1},
   {'reg_alpha': 100}],
  'split0_test_score': array([0.84841864, 0.84798174, 0.84858742, 0.84870609, 0.84853058]),
  'split1_test_score': array([0.85555861, 0.85564686, 0.85542842, 0.85584093, 0.85666766]),
  'split2_test_score': array([0.84904722, 0.84848004, 0.84865261, 0.84885707, 0.85002616]),
  'split3_test_score': array([0.88358259, 0.88322205, 0.88347661, 0.88366919, 0.8802367 ]),
  'split4_test_score': array([0.95652799, 0.95659384, 0.95742614, 0.9568714 , 0.93509948]),
  'mean_test_score': array([0.87862701, 0.87838491, 0.87871424, 0.87878893, 0.87411211]),
  'std_test_score': array([0.04101897, 0.0411721 , 0.04140483, 0.04110521, 0.03254482]),
  'rank_test_score': array([3, 4, 2, 1, 5], dtype=int32)},
 {'reg_alpha': 1},
 0.8787889342699449)
```

In [None]:
# Step 6b: tune reg_alpha on a linear grid between 0 and 20, with the tree and
# sampling parameters fixed at the values chosen by earlier searches.
#
# subsample / colsample_bytree: the best setting recorded for gsearch5 is
# {'colsample_bytree': 0.75, 'subsample': 0.85}. This cell used to pass the two
# values transposed (subsample=0.75, colsample_bytree=0.85). The output
# recorded below this cell was produced with that transposed setting (its
# score for reg_alpha=0, 0.87862701, equals gsearch5's score for
# colsample_bytree=0.85 / subsample=0.75), so re-run the search before relying
# on those numbers.
param_test6b = {
    'reg_alpha':[0, 1, 5, 10, 20]
}
gsearch6b = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.1,
        n_estimators=200,
        max_depth=7,
        min_child_weight=1,
        gamma=0.4,
        subsample=0.85,
        colsample_bytree=0.75,
        use_label_encoder=False,
        objective= 'binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27
    ),
    param_grid = param_test6b,
    scoring='roc_auc',
    n_jobs=4,
    cv=5
)
# gsearch6b.fit(train_x,train_y)
# gsearch6b.cv_results_, gsearch6b.best_params_, gsearch6b.best_score_

```
# Output
({'mean_fit_time': array([275.51043258, 276.02718916, 284.80924888, 278.52208066,
         223.55131669]),
  'std_fit_time': array([ 3.60772008,  4.03648455,  2.52794731,  3.42733534, 93.9355048 ]),
  'mean_score_time': array([0.34229422, 0.35052695, 0.35500917, 0.3653419 , 0.23838854]),
  'std_score_time': array([0.016226  , 0.0136301 , 0.0188017 , 0.02159019, 0.10474294]),
  'param_reg_alpha': masked_array(data=[0, 1, 5, 10, 20],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_alpha': 0},
   {'reg_alpha': 1},
   {'reg_alpha': 5},
   {'reg_alpha': 10},
   {'reg_alpha': 20}],
  'split0_test_score': array([0.84841866, 0.84870609, 0.84906648, 0.84951319, 0.8500191 ]),
  'split1_test_score': array([0.85555861, 0.85584093, 0.8565433 , 0.85687631, 0.85723906]),
  'split2_test_score': array([0.84904721, 0.84885707, 0.84965501, 0.84988758, 0.85036728]),
  'split3_test_score': array([0.88358259, 0.88366919, 0.88294375, 0.88301965, 0.88267709]),
  'split4_test_score': array([0.95652797, 0.9568714 , 0.95613531, 0.95509408, 0.95242409]),
  'mean_test_score': array([0.87862701, 0.87878893, 0.87886877, 0.87887816, 0.87854532]),
  'std_test_score': array([0.04101897, 0.04110521, 0.04056296, 0.04003213, 0.03882486]),
  'rank_test_score': array([4, 3, 2, 1, 5], dtype=int32)},
 {'reg_alpha': 10},
 0.8788781621408062)

```

In [None]:
# Step 7: tune the learning rate (0.01 .. 0.09) with reg_alpha=10 and the tree
# and sampling parameters fixed at the values chosen by earlier searches.
#
# subsample / colsample_bytree: the best setting recorded for gsearch5 is
# {'colsample_bytree': 0.75, 'subsample': 0.85}. This cell used to pass the two
# values transposed (subsample=0.75, colsample_bytree=0.85), and the output
# recorded below this cell was produced with that transposed setting, so
# re-run the search before relying on those numbers.
param_test7 = {
    'learning_rate':[i/100.0 for i in range(1,10,1)]
}
gsearch7 = GridSearchCV(
    estimator = XGBClassifier(
        learning_rate =0.1,
        n_estimators=200,
        max_depth=7,
        min_child_weight=1,
        gamma=0.4,
        subsample=0.85,
        colsample_bytree=0.75,
        use_label_encoder=False,
        objective= 'binary:logistic',
        nthread=4,
        reg_alpha=10,
        scale_pos_weight=1,
        seed=27
    ),
    param_grid = param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5
)
# gsearch7.fit(train_x,train_y)
# gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_

```
# Output
({'mean_fit_time': array([310.02117634, 302.40386853, 296.49384093, 291.0676362 ,
         290.1681962 , 285.81479111, 283.90757165, 283.79644413,
         225.64013681]),
  'std_fit_time': array([  2.70587367,   4.02507028,   2.64717498,   3.67896398,
           2.23926685,   4.08376128,   3.60766283,   2.31671398,
         102.87196643]),
  'mean_score_time': array([0.3807838 , 0.37723298, 0.32812343, 0.34239044, 0.32590175,
         0.34391332, 0.34846554, 0.34919987, 0.27772937]),
  'std_score_time': array([0.03526834, 0.0165861 , 0.02764899, 0.03012603, 0.05415552,
         0.02491196, 0.03945512, 0.01410936, 0.12914528]),
  'param_learning_rate': masked_array(data=[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'learning_rate': 0.01},
   {'learning_rate': 0.02},
   {'learning_rate': 0.03},
   {'learning_rate': 0.04},
   {'learning_rate': 0.05},
   {'learning_rate': 0.06},
   {'learning_rate': 0.07},
   {'learning_rate': 0.08},
   {'learning_rate': 0.09}],
  'split0_test_score': array([0.84799126, 0.84990173, 0.85065057, 0.85069628, 0.85067647,
         0.85034422, 0.85045547, 0.85023372, 0.85007281]),
  'split1_test_score': array([0.85561522, 0.85746034, 0.85808133, 0.85816636, 0.85818866,
         0.85781694, 0.85776686, 0.85745461, 0.85737084]),
  'split2_test_score': array([0.84947623, 0.85095675, 0.85151307, 0.8514018 , 0.85126894,
         0.85106383, 0.85101527, 0.85040059, 0.85048767]),
  'split3_test_score': array([0.87876346, 0.88080307, 0.88205728, 0.88287393, 0.88300376,
         0.88318071, 0.8830507 , 0.88312628, 0.88278265]),
  'split4_test_score': array([0.9365298 , 0.94615062, 0.94973019, 0.95032134, 0.95192121,
         0.95356768, 0.95450251, 0.95427359, 0.9544222 ]),
  'mean_test_score': array([0.8736752 , 0.8770545 , 0.87840649, 0.87869194, 0.87901181,
         0.87919468, 0.87935816, 0.87909776, 0.87902724]),
  'std_test_score': array([0.03331069, 0.03630598, 0.0374357 , 0.03767776, 0.03830895,
         0.03905876, 0.03940857, 0.0394784 , 0.03954837]),
  'rank_test_score': array([9, 8, 7, 6, 5, 2, 1, 3, 4], dtype=int32)},
 {'learning_rate': 0.07},
 0.8793581617792399)
```

In [None]:
import gc
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc, f1_score, recall_score
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Bar-plot the most important features and save the figure.

    ``feature_importance_df_`` needs ``feature`` and ``importance`` columns,
    typically one row per (feature, fold). Features are ranked by their mean
    importance, the top 40 are kept, and every row belonging to them is
    plotted. The figure is written to ``features_importances.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    is_top = feature_importance_df_.feature.isin(top_features)
    plot_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('features_importances.png')
    
def display_roc_curve(y_, oof_preds_,sub_preds_,folds_idx_):
    """Plot one ROC curve per validation fold plus the overall out-of-fold curve.

    Parameters
    ----------
    y_ : pandas.Series
        True labels, indexed positionally via ``.iloc``.
    oof_preds_ : numpy.ndarray
        Out-of-fold scores, aligned positionally with ``y_``.
    sub_preds_ : any
        Not used; kept so existing callers keep working.
    folds_idx_ : iterable of (train_idx, val_idx)
        Positional index pairs; only the validation part is used.

    The legend shows each fold's AUC and the overall AUC with the standard
    deviation of the fold AUCs. The figure is saved to ``roc_curve.png``.
    """
    plt.figure(figsize=(6,6))
    scores = []
    for n_fold, (_, val_idx) in enumerate(folds_idx_):
        fold_true = y_.iloc[val_idx]
        fold_score = oof_preds_[val_idx]
        fpr, tpr, _ = roc_curve(fold_true, fold_score)
        score = roc_auc_score(fold_true, fold_score)
        scores.append(score)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.4f)' % (n_fold + 1, score))

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

    # Overall curve across all out-of-fold predictions.
    fpr, tpr, _ = roc_curve(y_, oof_preds_)
    score = roc_auc_score(y_, oof_preds_)
    # Raw string: '\p' is not a valid escape sequence in a normal string
    # literal and triggers a warning on current Python versions.
    plt.plot(fpr, tpr, color='b',
             label=r'Avg ROC (AUC = %0.4f $\pm$ %0.4f)' % (score, np.std(scores)),
             lw=2, alpha=.8)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.tight_layout()

    plt.savefig('roc_curve.png')


def kfold_cv(train_df, test_df, num_folds, stratified = False, debug= False):
    """Cross-validate the tuned XGBoost model and write a submission file.

    For every fold the training part is oversampled with SMOTE (the validation
    part is left untouched), an ``XGBClassifier`` is fitted, and the
    positive-class probabilities for the validation part and for the whole
    test frame are recorded. Test probabilities are averaged over folds.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features plus the 'Survived' label (and 'PassengerId').
    test_df : pandas.DataFrame
        Test features; must contain every feature column of ``train_df`` and
        'PassengerId'.
    num_folds : int
        Number of CV folds.
    stratified : bool
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool
        When true, no submission is written and ``test_df`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per (feature, fold) with columns feature, importance, fold.

    Side effects
    ------------
    Prints per-fold F1 / recall / AUC and the overall out-of-fold AUC, saves
    the ROC and feature-importance figures, and, unless ``debug``, stores the
    fold-averaged score in ``test_df['Survived']`` and writes 'submission.csv'
    with the score thresholded at 0.5.
    """
    print("Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    gc.collect()

    # Cross validation splitter
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=25000)

    # Containers for the results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []
    folds_idx = []
    feats = [f for f in train_df.columns if f not in ['Survived','PassengerId']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['Survived'])):
        # Remember the split so the ROC plot uses exactly these folds.
        folds_idx.append((train_idx, valid_idx))

        train_x, train_y = train_df[feats].iloc[train_idx], train_df['Survived'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['Survived'].iloc[valid_idx]

        # Oversample only the training part so the validation rows stay real.
        sm = SMOTE(random_state=42, sampling_strategy=1.0)
        train_x, train_y = sm.fit_resample(train_x, train_y)

        # subsample / colsample_bytree follow the best setting recorded for
        # gsearch5 ({'colsample_bytree': 0.75, 'subsample': 0.85}); the two
        # values were previously transposed here.
        clf = XGBClassifier(
            learning_rate=0.07,
            n_estimators=200,
            max_depth=7,
            min_child_weight=1,
            gamma=0.4,
            subsample=0.85,
            colsample_bytree=0.75,
            use_label_encoder=False,
            objective= 'binary:logistic',
            nthread=4,
            reg_alpha=10,
            scale_pos_weight=1,
            seed=27
        )

        # np.ravel accepts a Series or an array; Series.ravel is deprecated
        # in recent pandas.
        clf.fit(train_x, np.ravel(train_y), eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc', verbose= 1000, early_stopping_rounds= 200)

        # Hard labels for the threshold metrics ...
        pred = clf.predict(valid_x)
        print('F1 Score: ' + str( f1_score(valid_y, pred) ))
        print('Recall Score: ' + str( recall_score(valid_y, pred) ))

        # ... and probabilities for AUC / ROC. Scoring AUC on hard 0/1 labels
        # reduces the ROC curve to a single point and understates the model's
        # ranking quality.
        oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Build the importance table once instead of growing it inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    print('Full AUC score %.6f' % roc_auc_score(train_df['Survived'], oof_preds))

    display_roc_curve(y_=train_df['Survived'],oof_preds_=oof_preds,sub_preds_ = sub_preds, folds_idx_=folds_idx)

    # Write submission file and plot feature importance
    if not debug:
        # Keep the averaged score on the caller's frame, as before.
        test_df['Survived'] = sub_preds
        # Threshold at 0.5. Casting the average straight to int truncated it,
        # so a passenger was labelled 1 only when every fold agreed.
        submission = pd.DataFrame({
            'PassengerId': test_df['PassengerId'],
            'Survived': (sub_preds >= 0.5).astype('int32'),
        })
        submission.to_csv('submission.csv', index= False)

    display_importances(feature_importance_df)
    return feature_importance_df

In [None]:
# Run 5-fold stratified cross-validation on the engineered frames; this also
# writes the submission file and the diagnostic plots.
feature_importance = kfold_cv(
    final_train_data,
    final_test_data,
    num_folds=5,
    stratified=True,
)