In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame
# and preview its first rows.
train_data = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
train_data.head()

In [None]:
# summary statistics (count, mean, std, quartiles) for train_data
train_data.describe()

In [None]:
# count the missing (NaN) values in each column
train_data.isna().sum()

In [None]:
# Impute missing bmi values with the mean of the observed bmi values.
bmi_mean = train_data['bmi'].mean()
train_data['bmi'] = train_data['bmi'].fillna(bmi_mean)

In [None]:
# re-count missing values per column after the bmi imputation
train_data.isna().sum()

In [None]:
# column dtypes and non-null counts
train_data.info()

In [None]:
# Summarise every categorical (object-dtype) column: its name, its distinct
# values, and the frequency of each value.
obj_col = train_data.select_dtypes(include='object')
col_dict = list(obj_col.columns)
unique_dict = [obj_col[name].unique() for name in col_dict]
count_dict = [obj_col[name].value_counts() for name in col_dict]

# One row per categorical column.
new_train = pd.DataFrame()
new_train['col_dict'] = col_dict
new_train['unique_dict'] = unique_dict
new_train['count_dict'] = count_dict

In [None]:
# show the per-column summary of the categorical features
new_train.head()

In [None]:
# class distribution of the target column
train_data.stroke.value_counts()

In [None]:
# Histogram of the target column on an explicitly created 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x=train_data['stroke'], legend=True, ax=ax)

In [None]:
# clearly this is an imbalanced dataset

In [None]:
# Correlation heatmap of the numeric columns only. train_data still contains
# object (string) columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict to numeric dtypes first.
sns.heatmap(train_data.select_dtypes(include='number').corr())

In [None]:
# from the heatmap above, id is not correlated with anything, so it can be dropped
# age appears to be somewhat correlated with stroke

In [None]:
# Drop the identifier column. errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because 'id' is already gone.
train_data = train_data.drop(columns='id', errors='ignore')

In [None]:
# Label-encode every categorical (object) column: convert each to the pandas
# 'category' dtype and store its integer codes in a new '<column>_cat' column.
obj_col = train_data.select_dtypes(include='object').astype('category')
for name in obj_col.columns:
    encoded_name = name + '_cat'
    train_data[encoded_name] = obj_col[name].cat.codes

In [None]:
# preview the categorical columns after conversion to the 'category' dtype
obj_col.head()

In [None]:
# preview train_data, now including the added '*_cat' code columns
train_data.head()

In [None]:
# work on a copy so train_data itself is left untouched
new_train_data = train_data.copy()

In [None]:
# Keep only the non-object columns, i.e. discard the original string columns.
new_train_data = new_train_data.select_dtypes(exclude='object')

In [None]:
# column dtypes and non-null counts after removing the object columns
new_train_data.info()

In [None]:
# preview the model-ready frame
new_train_data.head()

In [None]:
# NOTE(review): same call as the earlier new_train_data.info() cell — likely redundant
new_train_data.info()

In [None]:
# correlation heatmap over all columns of new_train_data
sns.heatmap(new_train_data.corr())

In [None]:
# let's balance the data, as it is highly imbalanced
# first, let's try cross-validation
# use oversampling (with SMOTE); otherwise the number of samples would be too small
# then try BalancedBaggingClassifier with RandomForest as the base estimator


In [None]:
# Separate the target (stroke) from the feature matrix (every other column).
y = new_train_data['stroke']
X = new_train_data.drop(columns=['stroke'])

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, f1_score, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

In [None]:
def score_model(model, params=None, scorer=None, features=None, target=None):
    """Cross-validate an estimator class with SMOTE applied to the training folds only.

    The data is split with a shuffled 5-fold StratifiedKFold. For every fold the
    training part is oversampled with SMOTE, a fresh ``model(**params)`` is fitted
    on the oversampled data, and its predictions on the untouched test part are
    scored.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance); it is called as ``model(**params)``.
    params : dict, optional
        Keyword arguments for ``model``. When omitted, ``n_estimators=100``,
        ``max_depth=5`` and ``random_state=13`` are used, so the default only
        suits estimators that accept those three arguments.
    scorer : callable, optional
        Metric called as ``scorer(y_true, y_pred)``. Defaults to ``f1_score``.
    features, target : optional
        Feature frame and label series, indexed positionally with ``.iloc``.
        When both are omitted the notebook-level ``X`` and ``y`` are used.

    Returns
    -------
    numpy.ndarray
        One score per fold, in fold order.

    Raises
    ------
    ValueError
        If only one of ``features`` / ``target`` is supplied.
    """
    if params is None:
        params = {
            'n_estimators': 100,
            'max_depth': 5,
            'random_state': 13
        }
    if scorer is None:
        scorer = f1_score
    if (features is None) != (target is None):
        raise ValueError('features and target must be given together')
    if features is None:
        # Fall back to the notebook-level data, as the original implementation did.
        features, target = X, y

    smoter = SMOTE(random_state=42)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    scores = []
    for train_idx, test_idx in kfold.split(features, target):
        X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
        y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

        # Oversample the training fold only; the test fold keeps its original
        # class balance so the score is measured on unmodified data.
        X_train_upsample, y_train_upsample = smoter.fit_resample(X_train, y_train)
        model_obj = model(**params).fit(X_train_upsample, y_train_upsample)
        scores.append(scorer(y_test, model_obj.predict(X_test)))
    return np.array(scores)

In [None]:
# Grid-search RandomForest hyper-parameters with score_model.
# NOTE: score_model scores each fold with f1_score, so the value stored under
# the 'recall' key is really the mean F1 across folds. The key name is kept
# because the ranking cell that follows sorts on it.
params = {'n_estimators': [50, 100, 200],
          'max_depth': [4, 6, 10, 12],
          'random_state': [13]}

score_tracker = []
for n_est in params['n_estimators']:
    for max_dep in params['max_depth']:
        for seed in params['random_state']:
            example_params = {'n_estimators': n_est,
                              'max_depth': max_dep,
                              'random_state': seed}
            # Score first, then attach the result, so the metric never leaks
            # into the keyword arguments passed to the estimator.
            mean_score = score_model(RandomForestClassifier, example_params).mean()
            example_params['recall'] = mean_score
            score_tracker.append(example_params)
    

In [None]:
# rank the parameter sets from highest to lowest stored score
# NOTE: the 'recall' key holds the mean of what score_model returns, which is F1
sorted(score_tracker, key = lambda x: x['recall'], reverse=True)

In [None]:
# Evaluate one DecisionTreeClassifier configuration with the same
# cross-validation routine; the cell displays the per-fold scores.
params_dtc = dict(criterion='entropy',
                  splitter='random',
                  max_depth=5,
                  random_state=13)
score_model(DecisionTreeClassifier, params_dtc)



In [None]:
# the best RandomForest setting from the grid above is n_estimators=100, max_depth=6

In [None]:
# Compare two BalancedBaggingClassifier ensembles on a single hold-out split.
# Both models are reported on the same two metrics so they can be compared.

# stratify=y keeps the rare positive class at the same share in the train and
# test parts, which matters because the target is heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33, stratify=y)

# RandomForest as the base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer imbalanced-learn releases; it is the
# first positional parameter under either name.
rfc = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
bbc = BalancedBaggingClassifier(rfc, sampling_strategy='auto',
                                replacement=False, random_state=3)
bbc.fit(X_train, y_train)
preds = bbc.predict(X_test)
print('recall score with BalancedBaggingClassifier_rfc: ', recall_score(y_test, preds))
print('f1 score with BalancedBaggingClassifier_rfc: ', f1_score(y_test, preds))

# DecisionTree as the base learner.
dtc = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)
bbc_dtc = BalancedBaggingClassifier(dtc, sampling_strategy='auto',
                                    replacement=False, random_state=3)
bbc_dtc.fit(X_train, y_train)
preds_dtc = bbc_dtc.predict(X_test)
print('recall score with BalancedBaggingClassifier_dtc: ', recall_score(y_test, preds_dtc))
print('f1 score with BalancedBaggingClassifier_dtc: ', f1_score(y_test, preds_dtc))
