In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns


In [None]:
# Load the stroke dataset from the Kaggle input directory.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

In [None]:
from pandas_profiling import ProfileReport

# Build an exploratory profiling report of the loaded frame; leaving `profile`
# as the last expression renders the report in the cell output.
profile = ProfileReport(
    df,
    explorative=True,
    title='Pandas Profiling Report',
)
profile

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Percentage of missing values in each column: null count / row count * 100.
df.isna().sum().div(len(df)).mul(100)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how many rows fall into each value of the `stroke` label.
sns.countplot(data=df, x='stroke')

#### The data is severely imbalanced. A resampling technique such as SMOTE needs to be applied to the training data.

In [None]:
# Show the rows whose bmi value is missing.
df.loc[df['bmi'].isna()]

In [None]:
# Impute the missing bmi values (201 rows per the original note) with the
# column mean.
# Only `bmi` is targeted: calling DataFrame.mean() on the whole frame raises a
# TypeError on pandas >= 2.0 because the frame still holds string columns
# (gender, work_type, ...), and a frame-wide replace would also touch any
# other column that happened to contain NaN.
# NOTE(review): the mean is computed before the train/test split, so test rows
# contribute to the imputed value — confirm this is acceptable.
df = df.fillna({'bmi': df['bmi'].mean()})

In [None]:
# Summary statistics of the frame after the bmi imputation.
df.describe()

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# The `id` column is only a row identifier and carries no predictive value,
# so remove it.
df = df.drop(columns='id')

In [None]:
# Count of missing values per column, to confirm the imputation left none.
df.isna().sum()

In [None]:
# Number of rows per value of the `stroke` label.
stroke_counts = df['stroke'].value_counts()
print(stroke_counts)

In [None]:
# Print the dtype of every column to see which ones still need encoding.
print(df.dtypes)

In [None]:
# Frequency of each gender category.
df['gender'].value_counts()

#### Drop 'Other' since it has just one entry.

In [None]:
# Keep every row whose gender is anything other than 'Other'.
df = df.loc[df['gender'].ne('Other')]

In [None]:
# Re-check the gender frequencies after filtering out 'Other'.
df['gender'].value_counts()

In [None]:
# One-hot encode the categorical columns. `drop_first=True` drops the first
# level of each column, and the original categorical columns are replaced by
# their dummy columns, which are appended after the remaining columns.
categorical_cols = ['gender', 'ever_married', 'Residence_type', 'smoking_status', 'work_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
# Missing-value count per column after encoding.
df.isna().sum()

In [None]:
# Class counts of the label after the filtering and encoding steps.
print(df['stroke'].value_counts())

In [None]:
# Annotated heatmap of the pairwise correlations between all columns,
# drawn on an explicit 18x18 inch figure.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(df.corr(), annot=True, cmap=plt.cm.Reds, ax=ax)

In [None]:
# Bar chart of every column's correlation with the target, sorted ascending.
stroke_corr = df.corr()['stroke']
stroke_corr.sort_values().plot(kind='bar')

#### Choose features whose absolute correlation with `stroke` is greater than 0.01

In [None]:
# Absolute correlation of every column with the target; keep the columns
# whose value exceeds the threshold.
corr_threshold = 0.01
cor_target = df.corr()['stroke'].abs()
set_features = cor_target.loc[cor_target > corr_threshold]
print(set_features)

In [None]:

# Restrict the frame to the selected columns. `stroke` is among them because
# its correlation with itself is 1.
df = df.loc[:, set_features.index]
df.head()

In [None]:
# Split the frame into the feature matrix and the label vector.
# The features are cast to float explicitly: on pandas >= 2.0 get_dummies
# produces bool columns, and converting a frame that mixes bool and numeric
# columns without a dtype yields an object array instead of a numeric matrix.
X = df.drop(columns='stroke').to_numpy(dtype=float)
y = df['stroke'].to_numpy()

#### Using an XGBoost classifier with SMOTEENN (SMOTE oversampling combined with Edited Nearest Neighbours cleaning) to resample the training dataset.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from collections import Counter


# Split the dataset 80:20. Stratifying on the label keeps the proportion of
# the rare positive class the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101, stratify=y
)

# Resample only the training partition with SMOTEENN; the test partition
# keeps its original class distribution.
from imblearn.combine import SMOTEENN

# A fixed random_state makes the synthetic samples, and therefore every
# downstream metric, reproducible across kernel restarts.
smn = SMOTEENN(random_state=101)

print("Counts before SMOTE")
print(Counter(y_train))

X_res,y_res = smn.fit_resample(X_train,y_train)
print("Counts after SMOTE")
print(Counter(y_res))

In [None]:
# Hyper-parameters for the gradient-boosted classifier, collected in one place.
# NOTE(review): learning_rate * n_estimators is only about 0.18 in total —
# confirm that such a small step size is intentional.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='error',
    max_depth=4,
    max_delta_step=1,
    learning_rate=0.00003,
    n_estimators=6000,
    subsample=0.5,
    use_label_encoder=False,
)
classifier = XGBClassifier(**xgb_params)

# Train on the resampled training data; the fitted estimator is the cell output.
classifier.fit(X_res, y_res)



In [None]:
# evaluate predictions
from sklearn.metrics import confusion_matrix, average_precision_score, roc_auc_score, roc_curve, classification_report, precision_recall_curve, f1_score
    
# Score the held-out split with both class probabilities and hard labels.
y_prob = classifier.predict_proba(X_test)
y_pred = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# The second probability column is the score used for ROC-AUC.
roc_xgb = roc_auc_score(y_test, y_prob[:,1])
print('ROC-AUC', roc_xgb)
print('='*20)
print('Confusion Matrix')
cm_xgb = confusion_matrix(y_test, y_pred)
print(cm_xgb)
# fmt='d' prints the integer counts exactly; the default annotation format
# ('.2g') abbreviates counts of three or more digits to scientific notation.
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='viridis')
cl_xgb = classification_report(y_test,y_pred )
print(cl_xgb)


#### Let's do a quick check with Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline trained on the same resampled training data;
# the fitted estimator is the cell output.
logmodel = LogisticRegression(max_iter=1000, C=2.0)
logmodel.fit(X_res, y_res)

In [None]:
# Score the logistic-regression model on the held-out split.
predictions = logmodel.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
y_prob = logmodel.predict_proba(X_test)
# The second probability column is the score used for ROC-AUC.
roc_lin = roc_auc_score(y_test, y_prob[:,1])
print('ROC-AUC:', roc_lin)
print('='*20)
print(' Confusion Matrix')
cm_lin = confusion_matrix(y_test, predictions)
print(cm_lin)
# fmt='d' prints the integer counts exactly; the default annotation format
# ('.2g') abbreviates counts of three or more digits to scientific notation.
sns.heatmap(cm_lin, annot=True, fmt='d', cmap='viridis')
cl_lin = classification_report(y_test,predictions)
print(cl_lin)


In [None]:
# Print the two classification reports one after the other for comparison.
print("XGBoost")
print("*" * 10)
print(cl_xgb)
print("*" * 20)
# The second model is a LogisticRegression, so label it accordingly.
print("Logistic Regression")
print("*" * 20)

print(cl_lin)

### Observations: Using the same dataset for training, the XGBoost and Logistic Regression models gave comparable results. Tuning the models may improve the performance.