In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

%matplotlib inline

from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold

from rfpimp import *

from sklearn.metrics import roc_auc_score

In [2]:
from sortedcontainers import SortedList
import copy
import collections
import numpy as np
from itertools import product,chain
import pandas
from sklearn.model_selection import KFold
import catboost as cb

In [3]:
import xgboost as xgb

In [4]:
path = 'data/mckinsey/'

# Read the three competition CSV files that live under `path`.
train, test, sample = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission_1.csv')
)

# First row of each frame, to eyeball the available columns.
train.head(1)

test.head(1)

# Size of the training frame and its per-column missing-value counts.
train.shape

train.isnull().sum()

# Distinct values of three of the columns that are later integer-encoded.
train.smoking_status.unique()

train.work_type.unique()

train.Residence_type.unique()

# Number of rows per value of the `stroke` target.
train.groupby('stroke').size()

def train_cats(df):
    """Change any columns of strings in a pandas dataframe to columns of
    ordered categorical values. This applies the changes in place.

    Parameters:
    -----------
    df: A pandas dataframe. Any string (or object-dtype) column is replaced
        by an ordered categorical built from its own values. Missing values
        stay missing (category code -1).

    Returns:
    --------
    None. `df` is modified in place.

    Examples:
    ---------

    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df.col2.dtype.name
    'category'
    >>> list(df.col2.cat.codes)
    [0, 1, 0]
    """
    for n, c in df.items():
        # Since pandas 2.0, is_string_dtype() looks at the *values* of an
        # object column and returns False as soon as one element is not a
        # str -- including NaN. Text columns with missing values would then
        # be skipped silently, so object dtype is accepted explicitly.
        if is_string_dtype(c) or c.dtype == object:
            df[n] = c.astype('category').cat.as_ordered()

# Convert the string columns of `train` to ordered categoricals, in place.
train_cats(train)

def apply_cats(df, trn):
    """Re-encode columns of `df` as ordered categoricals that share the
    category sets of the matching categorical columns in `trn`. The change
    is applied in place.

    Parameters:
    -----------
    df: A pandas dataframe to modify. Only columns that also exist in `trn`
        and are categorical there are touched.

    trn: A pandas dataframe used as the template. The categories (and hence
        the integer codes) of each of its categorical columns are reused for
        the column of the same name in `df`.

    Examples:
    ---------
    >>> trn = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> trn['col2'] = trn['col2'].astype('category').cat.as_ordered()
    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, trn)
    >>> list(df2.col2.cat.codes)
    [1, 0, 0]
    """
    template_columns = trn.columns
    for name, column in df.items():
        if name not in template_columns:
            continue
        template = trn[name]
        if template.dtype.name != 'category':
            continue
        df[name] = pd.Categorical(
            column, categories=template.cat.categories, ordered=True)

# Give the test frame the same category sets as `train`, so that the integer
# codes produced below mean the same thing in both frames.
apply_cats(test, train)

# Indicator columns: 1 where `bmi` is missing (recorded before imputation), else 0.
train['bmi_is_na'] = 0

train.loc[train.bmi.isna(), 'bmi_is_na'] = 1

test['bmi_is_na'] = 0

test.loc[test.bmi.isna(), 'bmi_is_na'] = 1

# Impute missing bmi in both frames with the median of the training values
# (Series.median skips NaN by default). The result is assigned back instead of
# calling `train.bmi.fillna(..., inplace=True)`: that form is an in-place
# operation on a column pulled out of the frame, which warns on pandas 2.x and
# no longer updates the frame once Copy-on-Write is enabled.
bmi_median = train.bmi.median()

train['bmi'] = train.bmi.fillna(bmi_median)

test['bmi'] = test.bmi.fillna(bmi_median)

train.dtypes

test.dtypes

cat_cols = [
    'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'
]

# Replace each categorical column by its integer category codes.
# pandas uses -1 for values that are missing or not among the categories.
for c in cat_cols:
    train[c] = train[c].cat.codes
    test[c] = test[c].cat.codes

train.head()

test.head()

def create_submission(pred, path, fname):
    """Write predictions to the CSV file `path + fname`.

    The file has two columns: `id`, taken from the notebook-level `test`
    frame, and `stroke`, holding `pred`. The index is not written.
    """
    columns = {'id': test.id, 'stroke': pred}
    destination = path + fname
    pd.DataFrame(columns).to_csv(destination, index=False)

# Dimensions of the training frame after the preprocessing above.
train.shape

# Dimensions of the test frame after the preprocessing above.
test.shape

def modelfit(alg,
             labels,
             predictors,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    """Optionally pick the number of boosting rounds by cross-validation,
    then fit `alg` on the full data.

    Parameters:
    -----------
    alg: An estimator exposing get_xgb_params(), get_params(), set_params()
        and fit(); the notebook passes an xgb.XGBClassifier. It is modified
        in place.

    labels: Target values, handed to xgb.DMatrix and to alg.fit.

    predictors: Feature matrix, handed to xgb.DMatrix and to alg.fit.

    useTrainCV: When True, run a stratified xgb.cv (metric 'auc', seed 4)
        for up to alg's current n_estimators rounds and set n_estimators to
        the number of rows in the returned CV history.

    cv_folds: Number of folds for xgb.cv.

    early_stopping_rounds: Early-stopping patience passed to xgb.cv.

    Returns:
    --------
    The same `alg` object, fitted on (predictors, labels).
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(predictors, labels)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
            show_stdv=True,
            verbose_eval=True,
            seed=4,
            stratified=True)
        # One row per boosting round kept by xgb.cv.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. The training-set predictions that used
    # to be computed here were never read, so they are no longer produced.
    alg.fit(predictors, labels)

    return alg

# Starting configuration. n_estimators=1000 is the number of boosting rounds
# offered to the cross-validation inside modelfit, which then resets it.
xgb1 = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=27,
    silent=True)

# Labels stay a Series; features are every other column as a plain array.
y_train = train.stroke
x_train = train.drop(columns='stroke').values

m = modelfit(xgb1, y_train, x_train, cv_folds=3)

m

def run_grid_search(param_grid, X, y, **estimator_overrides):
    """Fit a 3-fold stratified, ROC-AUC-scored GridSearchCV over an XGBClassifier.

    Parameters:
    -----------
    param_grid: Grid of XGBClassifier parameters to search.

    X, y: Training features and labels passed to GridSearchCV.fit.

    estimator_overrides: Keyword arguments replacing entries of the base
        estimator configuration defined below (e.g. max_depth=3).

    Returns:
    --------
    The fitted GridSearchCV object.
    """
    estimator_params = dict(
        learning_rate=0.1,
        # NOTE(review): presumably the round count selected by the earlier
        # modelfit run -- confirm before reusing on other data.
        n_estimators=37,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    estimator_params.update(estimator_overrides)

    # `iid=False` is no longer passed: the parameter was removed in
    # scikit-learn 0.24, and the plain mean across folds it requested has
    # been the only behaviour since 0.22.
    # The splitter object is handed over directly; GridSearchCV calls
    # split(X, y) itself, and stratified folds depend only on y.
    search = GridSearchCV(
        estimator=xgb.XGBClassifier(**estimator_params),
        param_grid=param_grid,
        scoring='roc_auc',
        n_jobs=4,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1),
        verbose=3)
    search.fit(X, y)
    return search


def grid_search_report(search):
    """Return (per-candidate score table, best params, best score).

    The table is built from `cv_results_`, the replacement for the
    `grid_scores_` attribute that was removed in scikit-learn 0.20.
    """
    scores = pd.DataFrame(search.cv_results_)[
        ['params', 'mean_test_score', 'std_test_score']]
    return scores, search.best_params_, search.best_score_


# Step 1: coarse search over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
gsearch1 = run_grid_search(param_test1, x_train, y_train)
grid_search_report(gsearch1)

# Step 1b: a finer grid over the same two parameters.
param_test1 = {
    'max_depth': [2,3,4],
    'min_child_weight': [4,5,6]
}
gsearch1 = run_grid_search(param_test1, x_train, y_train)
grid_search_report(gsearch1)

# Step 1c: depth fixed at 3, larger child weights.
param_test1 = {
    'max_depth': [3],
    'min_child_weight': [22,24,26]
}
gsearch1 = run_grid_search(param_test1, x_train, y_train)
grid_search_report(gsearch1)

# Step 2: gamma, with max_depth=3 and min_child_weight=22 fixed.
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = run_grid_search(
    param_test3, x_train, y_train, max_depth=3, min_child_weight=22)
grid_search_report(gsearch3)

# Re-run the round-count cross-validation with the parameters chosen so far.
xgb1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=22,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    seed=27,
    silent=True)

x_train, y_train = train.drop('stroke', axis=1).values, train.stroke

m = modelfit(xgb1,y_train,x_train,cv_folds=3)

m

# Step 3: row and column subsampling, in steps of 0.1.
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = run_grid_search(
    param_test4, x_train, y_train, max_depth=3, min_child_weight=22)
grid_search_report(gsearch4)

# Step 3b: the same two parameters in steps of 0.05.
param_test5 = {
 'subsample':[i/100.0 for i in [85,90,95]],
 'colsample_bytree':[i/100.0 for i in [75,80,85]]
}
gsearch5 = run_grid_search(
    param_test5, x_train, y_train, max_depth=3, min_child_weight=22)
grid_search_report(gsearch5)

# Final configuration: the earlier settings with subsample=0.9 and
# colsample_bytree=0.75.
xgb1 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=22,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.75,
    objective='binary:logistic',
    seed=27,
    silent=True)

x_train, y_train = train.drop('stroke', axis=1).values, train.stroke

m = modelfit(xgb1,y_train,x_train,cv_folds=3)

m

# modelfit returns `m` already fitted on (x_train, y_train), so the extra
# `m.fit(x_train, y_train)` that used to follow only repeated that fit with
# the same data and seed; it has been dropped.

# Probability of the positive class for every test row.
probs = m.predict_proba(test.values)[:,1]

create_submission(probs,path,'xgb8.csv')

m

Build out-of-fold XGBoost predictions to use as a feature for stacking.

# Feature frame and labels. The feature frame gets a fresh 0..n-1 index so
# that the positional fold indices below are also valid .loc labels on it.
x_train, y_train = train.drop('stroke', axis=1).reset_index(drop= True), train.stroke

# Work on a copy: adding the stacked column to `test` itself would change
# what `test.values` returns and break a re-run of the earlier prediction cell.
x_test = test.copy()

clf = m

# Model inputs, captured before the 'xgb_soft' column is added.
trn = x_train.copy()

tst = x_test.copy()

# Float placeholder for the out-of-fold probabilities. Starting from an
# integer 0 forces an int -> float upcast on assignment, which recent pandas
# versions warn about or reject.
x_train['xgb_soft'] = 0.0

# NOTE(review): `kf` is not used by the loop below, which builds its own
# 3-fold StratifiedKFold. Kept in case a later cell refers to it.
kf = KFold(n_splits=5, shuffle=True, random_state=1)

for train_index, val_index in StratifiedKFold(n_splits=3, shuffle=True, random_state=1).split(
        x_train, y_train):
    # split() yields positions, so select by position; this stays correct
    # even if y_train does not carry a default integer index.
    x_trn, y_trn = trn.iloc[train_index], y_train.iloc[train_index]
    x_val, y_val = trn.iloc[val_index], y_train.iloc[val_index]

    clf.fit(x_trn, y_trn)

    probs = clf.predict_proba(x_val)[:, 1]

    # Each row is predicted by a model that was not trained on it.
    x_train.loc[val_index, 'xgb_soft'] = probs

# The test-set feature comes from a model refitted on all training rows.
clf.fit(trn, y_train)
probs = clf.predict_proba(tst)[:, 1]

x_test['xgb_soft'] = probs

x_train.to_feather('data/mckinsey/stack_trn_xgb')

x_test.to_feather('data/mckinsey/stack_tst_xgb')

