In [1]:
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

%matplotlib inline

from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV, train_test_split

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold

from rfpimp import *

from sklearn.metrics import roc_auc_score

In [2]:
from sortedcontainers import SortedList
import copy
import collections
import numpy as np
from itertools import product,chain
import pandas
from sklearn.model_selection import KFold
import catboost as cb

In [3]:
# Directory prefix for every data file; it is concatenated with file names
# as-is, so it must keep its trailing slash.
path = 'data/mckinsey/'

# Training frame; it carries the `stroke` column used as the target below.
train = pd.read_csv(path + 'train.csv')

# Frame to be scored; its `id` column is what the submission files are keyed on.
test = pd.read_csv(path + 'test.csv')

# Presumably the submission template shipped with the data -- it is not used
# anywhere else in the visible notebook.
sample = pd.read_csv(path + 'sample_submission_1.csv')

# First row of each frame, to eyeball the columns.
train.head(1)

test.head(1)

# (rows, columns) of the training frame.
train.shape

# Missing-value count per training column.
train.isnull().sum()

# Distinct levels of three of the string-valued columns.
train.smoking_status.unique()

train.work_type.unique()

train.Residence_type.unique()

# Row count per value of the target column.
train.groupby('stroke').size()

def train_cats(df):
    """Turn every string column of a DataFrame into an ordered categorical.

    The conversion happens in place: the columns of `df` are overwritten and
    nothing is returned. Columns that are not of string dtype are left alone.

    Parameters:
    -----------
    df: A pandas DataFrame whose string columns should become categoricals.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df['col2'].dtype.name
    'category'
    >>> list(df['col2'].cat.categories)
    ['a', 'b']

    The printed frame looks the same before and after; only the dtype of
    `col2` changes.
    """
    # Decide which columns to convert first, then rewrite them one by one.
    string_columns = [
        name for name, column in df.items() if is_string_dtype(column)
    ]
    for name in string_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.as_ordered()

# Convert the string columns of `train` to ordered categoricals, in place.
train_cats(train)

def apply_cats(df, trn):
    """Re-encode columns of `df` with the category levels already used in `trn`.

    Every column of `df` that also exists in `trn` and is categorical there is
    replaced, in place, by an ordered categorical built from `trn`'s category
    list. The same value therefore maps to the same integer code in both
    frames. Values of `df` that are not among `trn`'s categories become
    missing (code -1). Nothing is returned.

    Parameters:
    -----------
    df: A pandas DataFrame to convert.

    trn: A pandas DataFrame that has already been through `train_cats`; it
        supplies the categories (and hence the codes) for each column.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df2 = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['b', 'a', 'a']})
    >>> apply_cats(df2, df)
    >>> list(df2['col2'].cat.codes)
    [1, 0, 0]

    Both frames now share the categories ['a', 'b'] for `col2`.
    """
    for name in list(df.columns):
        if name not in trn.columns:
            continue
        template = trn[name]
        if template.dtype.name != 'category':
            continue
        df[name] = pd.Categorical(
            df[name], categories=template.cat.categories, ordered=True)

# Give the string columns of `test` the same category levels (and therefore
# the same integer codes) as the corresponding columns of `train`.
apply_cats(test, train)

# Flag the rows whose BMI is missing *before* imputing, so the information
# survives as a 0/1 feature.
train['bmi_is_na'] = train['bmi'].isna().astype('int64')

test['bmi_is_na'] = test['bmi'].isna().astype('int64')

# Impute both frames with the training median (Series.median skips NaN).
# The column is assigned back explicitly: `frame.col.fillna(..., inplace=True)`
# works on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
bmi_median = train['bmi'].median()

train['bmi'] = train['bmi'].fillna(bmi_median)

test['bmi'] = test['bmi'].fillna(bmi_median)

train.dtypes

test.dtypes

# Columns that were converted to categoricals above.
cat_cols = [
    'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'
]

# Swap each categorical for its integer codes; missing values get code -1.
# NOTE: this cell is not re-runnable, because after the first run the columns
# are plain integers and no longer have a `.cat` accessor.
for c in cat_cols:
    train[c] = train[c].cat.codes
    test[c] = test[c].cat.codes

train.head()

test.head()

def create_submission(pred, path, fname):
    """Write predictions to a submission CSV with columns `id` and `stroke`.

    Parameters:
    -----------
    pred: Predicted values, one per row of the notebook-level `test` frame;
        stored in the `stroke` column.

    path: Directory prefix. It is concatenated with `fname` as-is, so it must
        already end with a path separator.

    fname: Name of the CSV file to create.

    The `id` column is read from the global `test` frame; the row index is
    not written to the file.
    """
    target_file = path + fname
    columns = {'id': test.id, 'stroke': pred}
    pd.DataFrame(columns).to_csv(target_file, index=False)

# (rows, columns) of both frames after preprocessing.
train.shape

test.shape

### Without balanced class weights

All features considered.

# Base estimator for the search. `min_samples_leaf` here is only a
# placeholder; the grid below overrides it.
model = RandomForestClassifier(
    random_state=1,
    n_jobs=-1,
    n_estimators=50,
    oob_score=False,
    min_samples_leaf=3)

# 'sqrt' is what the old 'auto' alias meant for classifiers; 'auto' itself
# was removed from scikit-learn in 1.3.
grid = {
    'min_samples_leaf': [200,230,250],
    'max_features': ['sqrt']
}

# Pass the splitter itself rather than the generator from `.split(...)`:
# the folds are identical (fixed seed), and the search object stays reusable.
clf = GridSearchCV(
    model,
    grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1),
    scoring='roc_auc',
    verbose=5)

clf.fit(train.drop('stroke', axis=1), train.stroke)

clf.best_score_

clf.best_params_

# Final forest for the submission, using a leaf size from the grid above.
# Seeded so that the submission file can be reproduced.
rf = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=200,
    min_samples_leaf=200,
    max_features='sqrt',
    oob_score=True,
    random_state=1)

rf.fit(train.drop('stroke', axis=1), train.stroke)

# Probability of the positive class (second column of predict_proba).
probs = rf.predict_proba(test)[:,1]

create_submission(probs,path,'rfv5.csv')

Feature selection

# Reduced feature set for this experiment.
cols_to_keep = ['heart_disease', 'id', 'bmi', 'avg_glucose_level', 'age','hypertension']

x_train,y_train = train.loc[:,cols_to_keep], train.stroke

# Base estimator; `min_samples_leaf` is overridden by the grid.
model = RandomForestClassifier(
    random_state=1,
    n_jobs=-1,
    n_estimators=50,
    oob_score=False,
    min_samples_leaf=3)

# 'sqrt' replaces the removed 'auto' alias (same meaning for classifiers).
# With only six features, 'sqrt' and 'log2' both resolve to 2 features per
# split, so those two candidates are expected to score the same.
grid = {
    'min_samples_leaf': [200,230,250],
    'max_features': ['sqrt','log2',None,0.5]
}

# The splitter object yields the same seeded folds as the generator from
# `.split(x_train, y_train)` and keeps the search object reusable.
clf = GridSearchCV(
    model,
    grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1),
    scoring='roc_auc',
    verbose=5)

clf.fit(x_train, y_train)

clf.best_score_

clf.best_params_

The CV score is not improving.

Without stratification

# Base estimator; `min_samples_leaf` is overridden by the grid.
model = RandomForestClassifier(
    random_state=1,
    n_jobs=-1,
    n_estimators=50,
    oob_score=False,
    min_samples_leaf=3)

# 'sqrt' replaces the removed 'auto' alias (same meaning for classifiers).
grid = {
    'min_samples_leaf': [100,150,200,230,250],
    'max_features': ['sqrt']
}

# An integer `cv` on a classifier still makes GridSearchCV use
# StratifiedKFold, so `cv=3` never disabled stratification. A plain KFold
# does; it is shuffled with the same seed as the stratified runs, so the
# folds differ only in being unstratified.
clf = GridSearchCV(
    model,
    grid,
    cv=KFold(n_splits=3, shuffle=True, random_state=1),
    scoring='roc_auc',
    verbose=5)

clf.fit(train.drop('stroke', axis=1), train.stroke)

clf.best_score_

clf.best_params_

### Ensembling

# Load the per-model submission files. The frames get a `_sub` suffix so they
# do not overwrite the fitted forest `rf` or shadow the `catboost as cb`
# module imported at the top of the notebook.
xgb_sub = pd.read_csv('data/mckinsey/xgb_ens.csv')

rf_sub = pd.read_csv('data/mckinsey/rf_ens.csv')

lgb_sub = pd.read_csv('data/mckinsey/lgb_ens.csv')

cb_sub = pd.read_csv('data/mckinsey/cat_ens.csv')

# Distribution of each model's predicted `stroke` values.
plt.hist(xgb_sub.stroke)

plt.hist(rf_sub.stroke)

plt.hist(lgb_sub.stroke)

plt.hist(cb_sub.stroke)

# Equal-weight average of the four models. The Series are added by index
# label, so this assumes all four files list their rows in the same order
# as `test` -- TODO confirm.
pred = (rf_sub.stroke+xgb_sub.stroke+cb_sub.stroke+lgb_sub.stroke)/4

create_submission(pred,path,'ensemble_4.csv')

Random forest predictions for stacking

# Features and target for the stacking stage. Both get a fresh 0..n-1 index
# so that they stay aligned with each other.
x_train = train.drop('stroke', axis=1).reset_index(drop=True)
y_train = train.stroke.reset_index(drop=True)

# Work on a copy: the original `x_test = test` was an alias, so adding the
# `rf_soft` column silently modified the global `test` frame as well.
x_test = test.copy()

# Seeded so the stacked feature is reproducible; 'sqrt' replaces the removed
# 'auto' alias (same meaning for classifiers).
clf = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=200,
    min_samples_leaf=200,
    max_features='sqrt',
    oob_score=True,
    random_state=1)

# Snapshots of the model inputs, taken before the `rf_soft` column is added,
# so the forest never sees its own predictions as a feature.
trn = x_train.copy()

tst = x_test.copy()

# Out-of-fold probabilities are collected in a float array. The original
# created an integer column and wrote floats into it, which newer pandas
# versions warn about or reject.
oof_probs = np.zeros(len(trn), dtype=float)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

for train_index, val_index in skf.split(trn, y_train):
    # split() yields positional indices, hence .iloc rather than .loc / [].
    x_trn, y_trn = trn.iloc[train_index], y_train.iloc[train_index]
    x_val = trn.iloc[val_index]

    clf.fit(x_trn, y_trn)

    # Each training row is predicted by a model that did not train on it.
    oof_probs[val_index] = clf.predict_proba(x_val)[:, 1]

x_train['rf_soft'] = oof_probs

# Refit on the full training set to produce the test-set feature.
clf.fit(trn, y_train)
probs = clf.predict_proba(tst)[:, 1]

x_test['rf_soft'] = probs

x_train.to_feather('data/mckinsey/stack_trn_rf')

x_test.to_feather('data/mckinsey/stack_tst_rf')

