In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

%matplotlib inline

from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold

from rfpimp import *

from sklearn.metrics import roc_auc_score

In [2]:
from sortedcontainers import SortedList
import copy
import collections
import numpy as np
from itertools import product,chain
import pandas
from sklearn.model_selection import KFold
import catboost as cb

In [35]:
def create_submission(pred, path, fname):
    submission = pd.DataFrame({'id': test.id, 'stroke': pred})
    submission.to_csv(path + fname, index=False)

In [3]:
path = 'data/mckinsey/'

train = pd.read_csv(path + 'train.csv')

test = pd.read_csv(path + 'test.csv')

sample = pd.read_csv(path + 'sample_submission_1.csv')

In [4]:
train.head(1)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0


In [5]:
test.head(1)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,36306,Male,80.0,0,0,Yes,Private,Urban,83.84,21.1,formerly smoked


In [6]:
train.shape

(43400, 12)

In [7]:
train.isnull().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [8]:
train.smoking_status.unique()

array([nan, 'never smoked', 'formerly smoked', 'smokes'], dtype=object)

In [9]:
train.work_type.unique()

array(['children', 'Private', 'Never_worked', 'Self-employed', 'Govt_job'],
      dtype=object)

In [10]:
train.Residence_type.unique()

array(['Rural', 'Urban'], dtype=object)

In [11]:
train.groupby('stroke').size()

stroke
0    42617
1      783
dtype: int64

In [12]:
def train_cats(df):
    """Change any columns of strings in a panda's dataframe to a column of
    catagorical values. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.

    Examples:
    ---------

    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category
    """
    for n, c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [13]:
train_cats(train)

In [14]:
def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values. The category codes are determined by trn.

    trn: A pandas dataframe. When creating a category for df, it looks up the
        what the category's code were in trn and makes those the category codes
        for df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)

           col1 col2
        0     1    b
        1     2    a
        2     3    a

    now the type of col is category {a : 1, b : 2}
    """
    for n, c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name == 'category'):
            df[n] = pd.Categorical(
                c, categories=trn[n].cat.categories, ordered=True)

In [15]:
apply_cats(test, train)

In [16]:
train['bmi_is_na'] = 0

train.loc[train.bmi.isna(),'bmi_is_na'] = 1

test['bmi_is_na'] = 0

test.loc[test.bmi.isna(),'bmi_is_na'] = 1

train.bmi.fillna(train.bmi[train.bmi.notnull()].median(), inplace=True)

test.bmi.fillna(train.bmi[train.bmi.notnull()].median(), inplace=True)

In [17]:
train.dtypes

id                      int64
gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
bmi_is_na               int64
dtype: object

In [18]:
test.dtypes

id                      int64
gender               category
age                   float64
hypertension            int64
heart_disease           int64
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
bmi_is_na               int64
dtype: object

In [19]:
cat_cols = [
    'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'
]

for c in cat_cols:
    train[c] = train[c].cat.codes
    test[c] = test[c].cat.codes

#### Catboost

In [20]:
train.dtypes

id                     int64
gender                  int8
age                  float64
hypertension           int64
heart_disease          int64
ever_married            int8
work_type               int8
Residence_type          int8
avg_glucose_level    float64
bmi                  float64
smoking_status          int8
stroke                 int64
bmi_is_na              int64
dtype: object

In [21]:
x_train,y_train = train.drop('stroke',axis=1), train.stroke

In [22]:
cat_cols_index = np.where(x_train.dtypes != "float")[0]

In [23]:
cat_cols_index

array([ 0,  1,  3,  4,  5,  6,  7, 10, 11])

In [27]:
from sortedcontainers import SortedList
import copy
import collections
import numpy as np
from itertools import product, chain
import pandas
from sklearn.model_selection import KFold
import catboost as cb
''' a class for doing grid search on a set of parameters provided in a dict. 'pdict' should be a dictionary like the following:
pdict = {'depth':[1,2], 'iterations':[250,100,500], 'thread_count':4}

when grid_search is called it will return an iterator that provides samples from the dictionary e.g.
{'depth':1, 'iterations':250, 'thread_count':4}
{'depth':2, 'iterations':250, 'thread_count':4}
{'depth':1, 'iterations':100, 'thread_count':4}
etc.
after calling an iteration of grid_search, you need to test the classifier and run 'register_result'
This will update the internal list of results, so that the next call to grid_search will use the best
parameters for all the parameters not currently being updated.

grid_search can be provided a list e.g. grid_search(['depth']) this will use the current best parameters for all
the other arguments and only search over 'depth'. You can then call e.g. grid_search(['iterations']) and it will use
the best depth found previously and cycle through all the 'iterations'. Searching incrementally can be much faster
than doing a full grid search, but may miss the global optimum. '''


class paramsearch:
    def __init__(self, pdict):
        self.pdict = {}
        # if something is not passed in as a sequence, make it a sequence with 1 element
        #   don't treat strings as sequences
        for a, b in pdict.items():
            if isinstance(b, collections.Sequence) and not isinstance(b, str):
                self.pdict[a] = b
            else:
                self.pdict[a] = [b]
        # our results are a sorted list, so the best score is always the final element
        self.results = SortedList()

    def grid_search(self, keys=None):
        # do grid search on only the keys listed. If none provided, do all
        if keys == None: keylist = self.pdict.keys()
        else: keylist = keys

        listoflists = []  # this will be list of lists of key,value pairs
        for key in keylist:
            listoflists.append([(key, i) for i in self.pdict[key]])
        for p in product(*listoflists):
            # do any changes to the current best parameter set
            if len(self.results) > 0: template = self.results[-1][1]
            else: template = {a: b[0] for a, b in self.pdict.items()}
            # if our updates are the same as current best, don't bother
            if self.equaldict(dict(p), template): continue
            # take the current best and update just the ones to change
            yield self.overwritedict(dict(p), template)

    def equaldict(self, a, b):
        for key in a.keys():
            if a[key] != b[key]: return False
        return True

    def overwritedict(self, new, old):
        old = copy.deepcopy(old)
        for key in new.keys():
            old[key] = new[key]
        return old

    # save a (score,params) pair to results. Since 'results' is a sorted list,
    #   the best score is always the final element. A small amount of noise is added
    #   because sorted lists don't like it when two scores are exactly the same
    def register_result(self, result, params):
        self.results.add((result + np.random.randn() * 1e-10, params))

    def bestscore(self):
        return self.results[-1][0]

    def bestparam(self):
        return self.results[-1][1]


params = {
    'depth': [1, 3, 5],
    'l2_leaf_reg': [1, 4, 9],
    'iterations': [100],
    'learning_rate': [0.05, 0.08, 0.09, 0.1],
    "one_hot_max_size": [10]
}


# this function does 3-fold crossvalidation with catboostclassifier
def crossvaltest(params, train_set, train_label, cat_dims, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True)
    res = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index, :]
        test = train_set.iloc[test_index, :]

        labels = train_label.ix[train_index]
        test_labels = train_label.ix[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)

        res.append(np.mean(clf.predict(test) == np.ravel(test_labels)))
    return np.mean(res)


def catboost_param_tune(params,
                        train_set,
                        train_label,
                        cat_dims=None,
                        n_splits=5):
    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually
    #   but 'iterations','learning_rate' together
    for prms in chain(
            ps.grid_search(['iterations', 'learning_rate']),
            ps.grid_search(['one_hot_max_size'])):
        res = crossvaltest(prms, train_set, train_label, cat_dims, n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res, prms)
        print(res, prms, 'best:', ps.bestscore(), ps.bestparam())
    return ps.bestparam()


bestparams = catboost_param_tune(
    params, x_train, y_train, cat_dims=cat_cols_index)

0:	learn: 0.5475390	total: 30.1ms	remaining: 2.98s
1:	learn: 0.4390703	total: 56.8ms	remaining: 2.79s
2:	learn: 0.3530381	total: 82.4ms	remaining: 2.67s
3:	learn: 0.2910756	total: 106ms	remaining: 2.54s
4:	learn: 0.2439057	total: 138ms	remaining: 2.63s
5:	learn: 0.2082121	total: 167ms	remaining: 2.61s
6:	learn: 0.1824070	total: 191ms	remaining: 2.54s
7:	learn: 0.1618399	total: 215ms	remaining: 2.47s
8:	learn: 0.1466314	total: 248ms	remaining: 2.51s
9:	learn: 0.1345470	total: 304ms	remaining: 2.73s
10:	learn: 0.1234364	total: 344ms	remaining: 2.78s
11:	learn: 0.1134510	total: 367ms	remaining: 2.69s
12:	learn: 0.1056377	total: 395ms	remaining: 2.64s
13:	learn: 0.1011709	total: 418ms	remaining: 2.57s
14:	learn: 0.0976685	total: 441ms	remaining: 2.5s
15:	learn: 0.0944278	total: 492ms	remaining: 2.58s
16:	learn: 0.0920702	total: 558ms	remaining: 2.73s
17:	learn: 0.0896167	total: 580ms	remaining: 2.64s
18:	learn: 0.0873214	total: 601ms	remaining: 2.56s
19:	learn: 0.0861261	total: 622ms	remai

62:	learn: 0.0735634	total: 2.52s	remaining: 1.48s
63:	learn: 0.0735432	total: 2.58s	remaining: 1.45s
64:	learn: 0.0734041	total: 2.63s	remaining: 1.41s
65:	learn: 0.0733776	total: 2.67s	remaining: 1.38s
66:	learn: 0.0733770	total: 2.72s	remaining: 1.34s
67:	learn: 0.0733610	total: 2.81s	remaining: 1.32s
68:	learn: 0.0733535	total: 2.87s	remaining: 1.29s
69:	learn: 0.0732393	total: 2.9s	remaining: 1.24s
70:	learn: 0.0732379	total: 2.94s	remaining: 1.2s
71:	learn: 0.0732169	total: 2.97s	remaining: 1.16s
72:	learn: 0.0732043	total: 2.99s	remaining: 1.11s
73:	learn: 0.0731891	total: 3.02s	remaining: 1.06s
74:	learn: 0.0731787	total: 3.08s	remaining: 1.02s
75:	learn: 0.0731698	total: 3.1s	remaining: 979ms
76:	learn: 0.0730421	total: 3.12s	remaining: 932ms
77:	learn: 0.0730308	total: 3.14s	remaining: 886ms
78:	learn: 0.0730232	total: 3.16s	remaining: 841ms
79:	learn: 0.0730204	total: 3.19s	remaining: 797ms
80:	learn: 0.0730135	total: 3.21s	remaining: 752ms
81:	learn: 0.0730020	total: 3.23s	

25:	learn: 0.0790872	total: 1.06s	remaining: 3.02s
26:	learn: 0.0788524	total: 1.13s	remaining: 3.05s
27:	learn: 0.0785332	total: 1.16s	remaining: 2.97s
28:	learn: 0.0780015	total: 1.2s	remaining: 2.93s
29:	learn: 0.0777439	total: 1.22s	remaining: 2.84s
30:	learn: 0.0775311	total: 1.24s	remaining: 2.76s
31:	learn: 0.0773276	total: 1.26s	remaining: 2.68s
32:	learn: 0.0771552	total: 1.33s	remaining: 2.7s
33:	learn: 0.0770058	total: 1.37s	remaining: 2.66s
34:	learn: 0.0764206	total: 1.43s	remaining: 2.65s
35:	learn: 0.0761207	total: 1.48s	remaining: 2.63s
36:	learn: 0.0760511	total: 1.52s	remaining: 2.59s
37:	learn: 0.0758597	total: 1.6s	remaining: 2.62s
38:	learn: 0.0757615	total: 1.65s	remaining: 2.58s
39:	learn: 0.0756778	total: 1.7s	remaining: 2.54s
40:	learn: 0.0753432	total: 1.75s	remaining: 2.51s
41:	learn: 0.0752852	total: 1.87s	remaining: 2.58s
42:	learn: 0.0751933	total: 1.91s	remaining: 2.53s
43:	learn: 0.0747598	total: 1.96s	remaining: 2.49s
44:	learn: 0.0746864	total: 2.02s	r

87:	learn: 0.0712764	total: 3.79s	remaining: 517ms
88:	learn: 0.0712341	total: 3.86s	remaining: 477ms
89:	learn: 0.0711968	total: 3.93s	remaining: 437ms
90:	learn: 0.0711902	total: 4s	remaining: 396ms
91:	learn: 0.0711842	total: 4.05s	remaining: 352ms
92:	learn: 0.0711811	total: 4.08s	remaining: 308ms
93:	learn: 0.0711759	total: 4.11s	remaining: 262ms
94:	learn: 0.0711746	total: 4.13s	remaining: 217ms
95:	learn: 0.0711704	total: 4.15s	remaining: 173ms
96:	learn: 0.0711668	total: 4.17s	remaining: 129ms
97:	learn: 0.0711569	total: 4.2s	remaining: 85.7ms
98:	learn: 0.0710644	total: 4.26s	remaining: 43ms
99:	learn: 0.0710590	total: 4.3s	remaining: 0us
0.9819585253456221 {'depth': 1, 'l2_leaf_reg': 1, 'iterations': 100, 'learning_rate': 0.08, 'one_hot_max_size': 10} best: 0.9819585252838774 {'depth': 1, 'l2_leaf_reg': 1, 'iterations': 100, 'learning_rate': 0.08, 'one_hot_max_size': 10}
0:	learn: 0.5359478	total: 74.6ms	remaining: 7.38s
1:	learn: 0.4200534	total: 186ms	remaining: 9.11s
2:	le

47:	learn: 0.0737838	total: 2.41s	remaining: 2.61s
48:	learn: 0.0737401	total: 2.53s	remaining: 2.63s
49:	learn: 0.0736896	total: 2.62s	remaining: 2.62s
50:	learn: 0.0735383	total: 2.73s	remaining: 2.62s
51:	learn: 0.0735040	total: 2.81s	remaining: 2.6s
52:	learn: 0.0734998	total: 2.95s	remaining: 2.62s
53:	learn: 0.0734823	total: 3s	remaining: 2.56s
54:	learn: 0.0734541	total: 3.02s	remaining: 2.48s
55:	learn: 0.0734169	total: 3.05s	remaining: 2.4s
56:	learn: 0.0734035	total: 3.07s	remaining: 2.32s
57:	learn: 0.0733751	total: 3.09s	remaining: 2.24s
58:	learn: 0.0730952	total: 3.12s	remaining: 2.17s
59:	learn: 0.0730690	total: 3.14s	remaining: 2.09s
60:	learn: 0.0728676	total: 3.18s	remaining: 2.03s
61:	learn: 0.0728394	total: 3.2s	remaining: 1.96s
62:	learn: 0.0728215	total: 3.32s	remaining: 1.95s
63:	learn: 0.0727232	total: 3.39s	remaining: 1.91s
64:	learn: 0.0727046	total: 3.43s	remaining: 1.85s
65:	learn: 0.0727035	total: 3.46s	remaining: 1.78s
66:	learn: 0.0726883	total: 3.56s	rem

14:	learn: 0.0919086	total: 462ms	remaining: 2.62s
15:	learn: 0.0884870	total: 524ms	remaining: 2.75s
16:	learn: 0.0868747	total: 604ms	remaining: 2.95s
17:	learn: 0.0855906	total: 651ms	remaining: 2.97s
18:	learn: 0.0845641	total: 710ms	remaining: 3.02s
19:	learn: 0.0833187	total: 768ms	remaining: 3.07s
20:	learn: 0.0826479	total: 836ms	remaining: 3.15s
21:	learn: 0.0814867	total: 887ms	remaining: 3.15s
22:	learn: 0.0799130	total: 935ms	remaining: 3.13s
23:	learn: 0.0794503	total: 983ms	remaining: 3.11s
24:	learn: 0.0784671	total: 1.04s	remaining: 3.13s
25:	learn: 0.0782387	total: 1.09s	remaining: 3.11s
26:	learn: 0.0779981	total: 1.14s	remaining: 3.07s
27:	learn: 0.0777213	total: 1.18s	remaining: 3.04s
28:	learn: 0.0773424	total: 1.22s	remaining: 2.98s
29:	learn: 0.0771413	total: 1.24s	remaining: 2.89s
30:	learn: 0.0767872	total: 1.26s	remaining: 2.81s
31:	learn: 0.0766387	total: 1.28s	remaining: 2.73s
32:	learn: 0.0765341	total: 1.31s	remaining: 2.65s
33:	learn: 0.0763224	total: 1.3

76:	learn: 0.0735399	total: 2.94s	remaining: 879ms
77:	learn: 0.0735398	total: 2.98s	remaining: 842ms
78:	learn: 0.0735387	total: 3.01s	remaining: 799ms
79:	learn: 0.0735323	total: 3.03s	remaining: 757ms
80:	learn: 0.0735321	total: 3.05s	remaining: 716ms
81:	learn: 0.0735297	total: 3.08s	remaining: 675ms
82:	learn: 0.0735296	total: 3.1s	remaining: 635ms
83:	learn: 0.0735168	total: 3.14s	remaining: 599ms
84:	learn: 0.0735166	total: 3.22s	remaining: 568ms
85:	learn: 0.0735079	total: 3.32s	remaining: 541ms
86:	learn: 0.0735079	total: 3.44s	remaining: 514ms
87:	learn: 0.0735070	total: 3.48s	remaining: 475ms
88:	learn: 0.0734946	total: 3.5s	remaining: 433ms
89:	learn: 0.0734168	total: 3.53s	remaining: 392ms
90:	learn: 0.0733196	total: 3.55s	remaining: 351ms
91:	learn: 0.0733143	total: 3.57s	remaining: 310ms
92:	learn: 0.0733136	total: 3.6s	remaining: 271ms
93:	learn: 0.0733092	total: 3.62s	remaining: 231ms
94:	learn: 0.0733085	total: 3.64s	remaining: 192ms
95:	learn: 0.0733079	total: 3.67s	

37:	learn: 0.0734889	total: 1.38s	remaining: 2.25s
38:	learn: 0.0734481	total: 1.4s	remaining: 2.19s
39:	learn: 0.0730250	total: 1.43s	remaining: 2.14s
40:	learn: 0.0729899	total: 1.45s	remaining: 2.08s
41:	learn: 0.0729541	total: 1.47s	remaining: 2.03s
42:	learn: 0.0729321	total: 1.49s	remaining: 1.98s
43:	learn: 0.0729018	total: 1.52s	remaining: 1.94s
44:	learn: 0.0725603	total: 1.56s	remaining: 1.91s
45:	learn: 0.0725369	total: 1.59s	remaining: 1.87s
46:	learn: 0.0724872	total: 1.61s	remaining: 1.82s
47:	learn: 0.0724688	total: 1.63s	remaining: 1.76s
48:	learn: 0.0724536	total: 1.65s	remaining: 1.72s
49:	learn: 0.0724409	total: 1.67s	remaining: 1.67s
50:	learn: 0.0724286	total: 1.69s	remaining: 1.63s
51:	learn: 0.0724186	total: 1.72s	remaining: 1.59s
52:	learn: 0.0723785	total: 1.74s	remaining: 1.54s
53:	learn: 0.0723655	total: 1.77s	remaining: 1.51s
54:	learn: 0.0723494	total: 1.79s	remaining: 1.46s
55:	learn: 0.0723418	total: 1.81s	remaining: 1.42s
56:	learn: 0.0721945	total: 1.83

0:	learn: 0.5209068	total: 95.8ms	remaining: 9.48s
1:	learn: 0.3954940	total: 179ms	remaining: 8.75s
2:	learn: 0.3098065	total: 262ms	remaining: 8.47s
3:	learn: 0.2481294	total: 355ms	remaining: 8.53s
4:	learn: 0.1976735	total: 480ms	remaining: 9.13s
5:	learn: 0.1678089	total: 573ms	remaining: 8.98s
6:	learn: 0.1476975	total: 616ms	remaining: 8.18s
7:	learn: 0.1319949	total: 643ms	remaining: 7.4s
8:	learn: 0.1159699	total: 667ms	remaining: 6.74s
9:	learn: 0.1080473	total: 699ms	remaining: 6.29s
10:	learn: 0.1028371	total: 721ms	remaining: 5.84s
11:	learn: 0.0971566	total: 745ms	remaining: 5.46s
12:	learn: 0.0937583	total: 766ms	remaining: 5.12s
13:	learn: 0.0902856	total: 799ms	remaining: 4.91s
14:	learn: 0.0878716	total: 823ms	remaining: 4.67s
15:	learn: 0.0855460	total: 873ms	remaining: 4.58s
16:	learn: 0.0843186	total: 966ms	remaining: 4.72s
17:	learn: 0.0833568	total: 1.11s	remaining: 5.04s
18:	learn: 0.0824863	total: 1.17s	remaining: 4.97s
19:	learn: 0.0809885	total: 1.23s	remaini

65:	learn: 0.0737169	total: 2.27s	remaining: 1.17s
66:	learn: 0.0737102	total: 2.31s	remaining: 1.14s
67:	learn: 0.0737100	total: 2.36s	remaining: 1.11s
68:	learn: 0.0737003	total: 2.38s	remaining: 1.07s
69:	learn: 0.0736982	total: 2.4s	remaining: 1.03s
70:	learn: 0.0736745	total: 2.43s	remaining: 992ms
71:	learn: 0.0734960	total: 2.45s	remaining: 954ms
72:	learn: 0.0734871	total: 2.48s	remaining: 917ms
73:	learn: 0.0734586	total: 2.51s	remaining: 883ms
74:	learn: 0.0733562	total: 2.55s	remaining: 850ms
75:	learn: 0.0733390	total: 2.6s	remaining: 822ms
76:	learn: 0.0733305	total: 2.64s	remaining: 790ms
77:	learn: 0.0733291	total: 2.68s	remaining: 755ms
78:	learn: 0.0733154	total: 2.73s	remaining: 725ms
79:	learn: 0.0733113	total: 2.76s	remaining: 690ms
80:	learn: 0.0732333	total: 2.79s	remaining: 654ms
81:	learn: 0.0732313	total: 2.82s	remaining: 619ms
82:	learn: 0.0732263	total: 2.85s	remaining: 583ms
83:	learn: 0.0732217	total: 2.91s	remaining: 554ms
84:	learn: 0.0732179	total: 2.98s

In [28]:
bestparams

{'depth': 1,
 'iterations': 100,
 'l2_leaf_reg': 1,
 'learning_rate': 0.09,
 'one_hot_max_size': 10}

In [30]:
clf = cb.CatBoostClassifier(eval_metric="AUC",one_hot_max_size=10, \
                        depth=1, iterations= 100, l2_leaf_reg= 1, learning_rate= 0.09)
clf.fit(x_train,y_train, cat_features=cat_cols_index)

0:	learn: 0.5792770	total: 51.1ms	remaining: 5.06s
1:	learn: 0.6517149	total: 95.8ms	remaining: 4.69s
2:	learn: 0.6881295	total: 169ms	remaining: 5.48s
3:	learn: 0.6881784	total: 202ms	remaining: 4.86s
4:	learn: 0.6881644	total: 234ms	remaining: 4.45s
5:	learn: 0.6892600	total: 274ms	remaining: 4.29s
6:	learn: 0.8133020	total: 349ms	remaining: 4.64s
7:	learn: 0.8121318	total: 384ms	remaining: 4.41s
8:	learn: 0.8257970	total: 418ms	remaining: 4.23s
9:	learn: 0.8290905	total: 452ms	remaining: 4.07s
10:	learn: 0.8295709	total: 496ms	remaining: 4.01s
11:	learn: 0.8279684	total: 568ms	remaining: 4.16s
12:	learn: 0.8310190	total: 605ms	remaining: 4.05s
13:	learn: 0.8320950	total: 637ms	remaining: 3.91s
14:	learn: 0.8314954	total: 677ms	remaining: 3.84s
15:	learn: 0.8326696	total: 713ms	remaining: 3.74s
16:	learn: 0.8332056	total: 779ms	remaining: 3.8s
17:	learn: 0.8327726	total: 820ms	remaining: 3.73s
18:	learn: 0.8331464	total: 854ms	remaining: 3.64s
19:	learn: 0.8332622	total: 887ms	remain

<catboost.core.CatBoostClassifier at 0x1a19ef3f60>

In [32]:
probs = clf.predict_proba(test)[:,1]

In [33]:
np.mean(probs)

0.017718593615353988

In [34]:
np.mean(probs>0.5)

0.0

In [36]:
create_submission(probs,path,'cb2.csv')

In [66]:
test.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,bmi_is_na
0,36306,1,80.0,0,0,1,2,1,83.84,21.1,0,0
1,61829,0,74.0,0,1,1,3,0,179.5,26.0,0,0
2,14152,0,14.0,0,0,0,4,0,95.16,21.2,-1,0
3,12997,1,28.0,0,0,0,2,1,94.76,23.4,-1,0
4,40801,0,63.0,0,0,1,0,0,83.57,27.6,1,0


In [67]:
x_train, y_train = train.drop('stroke', axis=1).reset_index(drop= True), train.stroke

x_test = test

clf = cb.CatBoostClassifier(eval_metric="AUC",one_hot_max_size=10, \
                        depth=1, iterations= 100, l2_leaf_reg= 1, learning_rate= 0.09)

trn = x_train.copy()

tst = x_test.copy()

x_train['cat_soft'] = 0

kf = KFold(n_splits=5, shuffle=True, random_state=1)

for train_index, val_index in StratifiedKFold(n_splits=3, shuffle=True, random_state=1).split(
        x_train, y_train):
    x_trn, y_trn = trn.loc[train_index,:], y_train[train_index]
    x_val, y_val = trn.loc[val_index,:], y_train[val_index]

    clf.fit(x_trn, y_trn,cat_features=cat_cols_index)
    
    probs = clf.predict_proba(x_val)[:, 1]
    
    
    x_train.loc[val_index,'cat_soft'] = probs

    
clf.fit(trn,y_train)
probs = clf.predict_proba(tst)[:, 1]

x_test.loc[:,'cat_soft'] = probs

x_train.to_feather('data/mckinsey/stack_trn_cat')

x_test.to_feather('data/mckinsey/stack_tst_cat')

0:	learn: 0.5468231	total: 47.3ms	remaining: 4.68s
1:	learn: 0.6343131	total: 77.2ms	remaining: 3.78s
2:	learn: 0.6438685	total: 136ms	remaining: 4.4s
3:	learn: 0.7002673	total: 172ms	remaining: 4.12s
4:	learn: 0.7302613	total: 200ms	remaining: 3.79s
5:	learn: 0.7319239	total: 226ms	remaining: 3.54s
6:	learn: 0.7351160	total: 254ms	remaining: 3.37s
7:	learn: 0.7354626	total: 314ms	remaining: 3.61s
8:	learn: 0.7933293	total: 342ms	remaining: 3.46s
9:	learn: 0.8269671	total: 370ms	remaining: 3.33s
10:	learn: 0.8400282	total: 400ms	remaining: 3.24s
11:	learn: 0.8417638	total: 432ms	remaining: 3.16s
12:	learn: 0.8413259	total: 464ms	remaining: 3.11s
13:	learn: 0.8470876	total: 519ms	remaining: 3.19s
14:	learn: 0.8503843	total: 546ms	remaining: 3.1s
15:	learn: 0.8515231	total: 575ms	remaining: 3.02s
16:	learn: 0.8505330	total: 605ms	remaining: 2.95s
17:	learn: 0.8489679	total: 637ms	remaining: 2.9s
18:	learn: 0.8485857	total: 664ms	remaining: 2.83s
19:	learn: 0.8488925	total: 724ms	remainin

66:	learn: 0.8649811	total: 3.09s	remaining: 1.52s
67:	learn: 0.8650048	total: 3.13s	remaining: 1.47s
68:	learn: 0.8650384	total: 3.16s	remaining: 1.42s
69:	learn: 0.8650037	total: 3.19s	remaining: 1.37s
70:	learn: 0.8649819	total: 3.23s	remaining: 1.32s
71:	learn: 0.8650164	total: 3.27s	remaining: 1.27s
72:	learn: 0.8650114	total: 3.33s	remaining: 1.23s
73:	learn: 0.8650238	total: 3.37s	remaining: 1.18s
74:	learn: 0.8650539	total: 3.4s	remaining: 1.13s
75:	learn: 0.8650667	total: 3.43s	remaining: 1.08s
76:	learn: 0.8650758	total: 3.46s	remaining: 1.03s
77:	learn: 0.8650357	total: 3.49s	remaining: 985ms
78:	learn: 0.8650185	total: 3.53s	remaining: 937ms
79:	learn: 0.8650177	total: 3.62s	remaining: 906ms
80:	learn: 0.8650328	total: 3.71s	remaining: 870ms
81:	learn: 0.8649908	total: 3.74s	remaining: 822ms
82:	learn: 0.8649954	total: 3.78s	remaining: 774ms
83:	learn: 0.8649990	total: 3.82s	remaining: 728ms
84:	learn: 0.8650736	total: 3.89s	remaining: 687ms
85:	learn: 0.8651116	total: 3.93

29:	learn: 0.8564381	total: 2.13s	remaining: 4.96s
30:	learn: 0.8564041	total: 2.17s	remaining: 4.83s
31:	learn: 0.8559232	total: 2.21s	remaining: 4.69s
32:	learn: 0.8563556	total: 2.25s	remaining: 4.58s
33:	learn: 0.8565378	total: 2.33s	remaining: 4.53s
34:	learn: 0.8567735	total: 2.4s	remaining: 4.46s
35:	learn: 0.8573302	total: 2.46s	remaining: 4.37s
36:	learn: 0.8576140	total: 2.5s	remaining: 4.26s
37:	learn: 0.8582696	total: 2.55s	remaining: 4.16s
38:	learn: 0.8579590	total: 2.58s	remaining: 4.04s
39:	learn: 0.8578770	total: 2.63s	remaining: 3.94s
40:	learn: 0.8574190	total: 2.67s	remaining: 3.85s
41:	learn: 0.8577182	total: 2.71s	remaining: 3.74s
42:	learn: 0.8595574	total: 2.74s	remaining: 3.64s
43:	learn: 0.8598100	total: 2.81s	remaining: 3.58s
44:	learn: 0.8620314	total: 2.9s	remaining: 3.54s
45:	learn: 0.8622183	total: 2.98s	remaining: 3.5s
46:	learn: 0.8624140	total: 3.07s	remaining: 3.46s
47:	learn: 0.8625805	total: 3.15s	remaining: 3.42s
48:	learn: 0.8624567	total: 3.24s	r