In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing required libraries
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score,confusion_matrix,roc_curve

# step 1 : Reading and understanding the data

In [None]:
# Load the stroke dataset from the Kaggle input directory and preview the first rows.
STROKE_CSV = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(STROKE_CSV)
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# per-column dtype and non-null count summary
df.info()

In [None]:
# summary statistics; include='all' also summarises the non-numeric columns
df.describe(include = 'all')


# step 2 - checking for duplicates and missing values 

__Here we check the `id` column for duplicates: we sum the boolean duplicate flags and compare the total to zero, so the result is True when no id is duplicated.__

In [None]:
# True when no id occurs more than once (the duplicate flags sum to zero)
df['id'].duplicated().sum() == 0

In [None]:
# percentage of missing values per column, rounded to two decimals
(df.isnull().sum() * 100 / len(df)).round(2)

__As we can see, BMI is the only variable containing null values. Let's look at this column in more detail.__

In [None]:
# summary statistics of the bmi column before the null treatment
df['bmi'].describe()

In [None]:
# Distribution plot for BMI with the mean and median marked.
# sns.distplot is deprecated; histplot with kde=True and stat='density'
# draws the equivalent histogram plus density curve.
plt.figure(figsize=[12,8])
sns.histplot(df['bmi'], kde=True, stat='density')
plt.axvline(df['bmi'].mean(), label='mean', color='r')
plt.axvline(df['bmi'].median(), label='median', color='g')
plt.legend()
plt.show()

__We can see from the above plot that the mean and median are close to each other, so we can replace the null values with the median, because the median is less affected by outliers than the mean.__

In [None]:
# Replace the missing BMI values with the column median.
# The median is computed from the data instead of being hard-coded, and the
# result is assigned back to the column: calling fillna(..., inplace=True) on
# an attribute-accessed column can end up modifying a temporary object.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)

In [None]:
# null count per column after the bmi null treatment
df.isnull().sum()

# step 3 : Exploratory Data Analysis

__A) univariate analysis__

In [None]:
# dtype of every column
df.dtypes

In [None]:
# Recode the 0/1 flags of hypertension and heart_disease as 'No'/'Yes'.
# Series.replace leaves values that have no entry in the mapping untouched, so
# re-running this cell is harmless; Series.map would turn the already recoded
# 'Yes'/'No' labels into NaN on a second run.
flag_labels = {1: 'Yes', 0: 'No'}
cols = ['hypertension', 'heart_disease']
for col in cols:
    df[col] = df[col].replace(flag_labels)

In [None]:
# preview the frame after recoding the flag columns
df.head()

In [None]:
# Collect the column names by dtype: numeric columns and object (string) columns.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
print(num_cols)
cat_cols = df.select_dtypes(include='object').columns.tolist()
print(cat_cols)

In [None]:
# Distribution of the numerical columns on a 2x2 grid.
# num_cols[0] is skipped (presumably the id column - confirm against num_cols).
# sns.distplot is deprecated, so histplot with a KDE overlay is used instead;
# tight_layout only needs to run once, after all panels are drawn.
plt.figure(figsize=[15,10])
for position, col in enumerate(num_cols[1:], start=1):
    plt.subplot(2,2,position)
    sns.histplot(df[col], kde=True, stat='density')
plt.tight_layout()
plt.show()

# countplots for categorical columns

In [None]:
# Count plot for every categorical column on a 4x2 grid.
# The column is passed as x= explicitly: recent seaborn releases treat the
# first positional argument as `data`, not as the x variable.
plt.figure(figsize=[20,15])
for position, col in enumerate(cat_cols, start=1):
    plt.subplot(4,2,position)
    sns.countplot(x=df[col])
plt.tight_layout()
plt.show()

## b) Bivariate Analysis

In [None]:
# scatter plot for age vs average_glucose_level
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so scatterplot(series, series) raises a TypeError there.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='age', y='avg_glucose_level', color='cyan')
plt.show()

In [None]:
# scatter plot for age vs bmi
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so scatterplot(series, series) raises a TypeError there.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='age', y='bmi', color='orange')
plt.show()

In [None]:
# scatter plot for avg_glucose_level vs bmi
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so scatterplot(series, series) raises a TypeError there.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='avg_glucose_level', y='bmi', color='g')
plt.show()

In [None]:
# Count plot for every categorical column, split by the stroke label.
# x and hue are passed by keyword (with data=df) because recent seaborn
# releases treat the first positional argument as `data`.
plt.figure(figsize=[20,18])
for position, col in enumerate(cat_cols, start=1):
    plt.subplot(4,2,position)
    sns.countplot(data=df, x=col, hue='stroke')
plt.tight_layout()
plt.show()


In [None]:
# Pair plot of the numeric columns, coloured by the stroke label.
# pairplot builds its own figure, so a preceding plt.figure() call would only
# leave an empty extra figure behind. The id column is dropped because it is
# an identifier (it is also excluded from the model features later on).
sns.pairplot(data=df.drop(columns='id'), hue='stroke')
plt.show()

In [None]:
# age vs avg_glucose_level, coloured by the stroke label.
# x, y and hue are passed by keyword (positional x/y raises a TypeError on
# recent seaborn). No fixed color is given because hue decides the colours.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='age', y='avg_glucose_level', hue='stroke')
plt.show()

In [None]:
# age vs bmi, coloured by the stroke label.
# x, y and hue are passed by keyword: positional x/y raises a TypeError on
# recent seaborn releases.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='age', y='bmi', hue='stroke')
plt.show()


In [None]:
# avg_glucose_level vs bmi, coloured by the stroke label.
# x, y and hue are passed by keyword: positional x/y raises a TypeError on
# recent seaborn releases.
plt.figure(figsize=[15,10])
sns.scatterplot(data=df, x='avg_glucose_level', y='bmi', hue='stroke')
plt.show()

## c)Multivariate analysis

In [None]:
# Correlation matrix of the numeric columns only.
# The frame also holds string columns at this point, and DataFrame.corr no
# longer drops them silently on pandas 2.0+ (it raises instead).
df.select_dtypes(include=np.number).corr()

In [None]:
# Heatmap of the correlations between the numeric columns.
# String columns are excluded first because DataFrame.corr raises on them
# on pandas 2.0+.
numeric_corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=[10,6])
sns.heatmap(numeric_corr, cmap='RdYlGn', annot=True)
plt.show()

# Step 4 : Outlier Treatment

In [None]:
# Box plots to look for outliers in the numerical columns.
# The first and last entries of num_cols are skipped (presumably id and the
# stroke target - confirm against num_cols). The column is passed as x=
# because recent seaborn releases treat the first positional argument as `data`.
plt.figure(figsize=[15,10])
for position, col in enumerate(num_cols[1:-1], start=1):
    plt.subplot(2,2,position)
    sns.boxplot(x=df[col])
plt.tight_layout()
plt.show()

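__We can see that avg_glucose_level and bmi have many outliers. These can be treated with preprocessing techniques such as the IQR capping method; note that the capping code in the next cell is commented out, so no outlier treatment is actually applied in this notebook.__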

In [None]:
# IQR capping method
# x = df.describe()
# for i in num_cols[2:-1]:
#     q1=x.loc['25%',i]
#     q3=x.loc['75%',i]
#     iqr=q3-q1
#     uppl=q3+(1.5*iqr)
#     lowl=q1-(1.5*iqr)
#     df[i]=df[i].apply(lambda x:uppl if x>uppl else x )
#     df[i]=df[i].apply(lambda x: lowl if x<lowl else x)

In [None]:

# plt.figure(figsize=[15,10])
# for col in enumerate(num_cols[1:-1]):
#     plt.subplot(2,2,col[0]+1)
#     sns.boxplot(df[col[1]])
#     plt.tight_layout()
# plt.show()

In [None]:
# (rows, columns) before building the model inputs
df.shape

## step 5: Dummies Creation

In [None]:
# Target is the stroke column; features are every other column except id.
y = df['stroke']
x = df.drop(columns=['stroke', 'id'])

In [None]:
# One-hot encode the categorical features, dropping the first level of each
# so that every category set is represented by k-1 indicator columns.
xd = pd.get_dummies(data=x, drop_first=True)
xd.head()

In [None]:
# Annotated heatmap of the correlations between the encoded features.
dummy_corr = xd.corr()
plt.figure(figsize=[20,10])
sns.heatmap(dummy_corr, cmap='RdYlGn', annot=True)
plt.show()

## Step 6 : Train Test Split 

In [None]:
# 70/30 train/test split with a fixed seed.
# stratify=y keeps the stroke / no-stroke proportion the same in both parts,
# which matters when one class is rare (NOTE(review): the stroke class is
# presumably the rare one here - confirm with y.value_counts()).
x_train,x_test,y_train,y_test=train_test_split(xd,y,test_size=0.3,random_state=100,stratify=y)

In [None]:
# shapes of x_train, x_test, y_train and y_test, in that order
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

# scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# continuous feature columns that are passed to the scaler below
cols_to_scale=['age','avg_glucose_level','bmi']

In [None]:
# Min-max scale the continuous columns. The scaler learns its ranges from the
# training part only; the same fitted ranges are then applied to both parts.
scaler = MinMaxScaler()
scaler.fit(x_train[cols_to_scale])

x_train[cols_to_scale] = scaler.transform(x_train[cols_to_scale])
x_test[cols_to_scale] = scaler.transform(x_test[cols_to_scale])

In [None]:
# preview the training features after scaling
x_train.head()

# Step 7 : Model building

## Base model

In [None]:
# baseline classifier: logistic regression using the liblinear solver
logreg=LogisticRegression(solver='liblinear')

In [None]:
# fit the baseline model on the training part
logreg.fit(x_train,y_train)

In [None]:
# predicted class labels for the training part
y_train_pred=logreg.predict(x_train)
y_train_pred

In [None]:
# training accuracy of the baseline model
accuracy_score(y_train,y_train_pred)

In [None]:
# second column of predict_proba for the training part (the scores used for ROC/AUC below)
y_train_prob=logreg.predict_proba(x_train)[:,1]
y_train_prob

In [None]:
# training ROC AUC computed from the predicted scores
roc_auc_score(y_train,y_train_prob)

In [None]:
# training confusion matrix (true labels first, predicted labels second)
print(confusion_matrix(y_train,y_train_pred))

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for a set of true labels and predicted scores.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores for the positive class, aligned with ``actual``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # roc_curve and roc_auc_score are imported in the notebook's import cell,
    # so the function does not rely on the later `from sklearn import metrics`
    # cell having been executed first.
    fpr, tpr, _ = roc_curve(actual, probs, drop_intermediate=False)
    auc_score = roc_auc_score(actual, probs)

    plt.figure(figsize=(12, 8))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    # Diagonal reference line: the ROC of a classifier with no skill.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
from sklearn import metrics

In [None]:
# ROC points for the training part. The first argument must be the true
# labels (y_train); passing the predicted labels there would score the model
# against its own predictions.
fpr,tpr,thresholds=metrics.roc_curve(y_train,y_train_prob,drop_intermediate=False)

In [None]:
# ROC curve of the baseline model on the training part
draw_roc(y_train,y_train_prob)

In [None]:
# predicted labels and accuracy of the baseline model on the test part
y_test_pred=logreg.predict(x_test)
accuracy_score(y_test,y_test_pred)

In [None]:
# test confusion matrix (true labels first, predicted labels second)
print(confusion_matrix(y_test,y_test_pred))

# Final model : gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV,StratifiedKFold
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


In [None]:
# Randomized hyper-parameter search for the gradient boosting classifier.
# The estimator is seeded so the search is reproducible.
gbc=GradientBoostingClassifier(random_state=4)

# Each hyper-parameter appears exactly once. A second 'learning_rate' key in
# the dict literal silently replaces the first one, and range(0,2) would then
# make the search sample learning rates of 0 and 1 only.
# sp_uniform(loc, scale) samples from [loc, loc + scale] = [0.01, 0.5], which
# keeps the learning rate strictly positive.
params={'n_estimators':sp_randint(50,250),
        'max_depth':sp_randint(1,15),
        'learning_rate':sp_uniform(0.01,0.49)}

r_search=RandomizedSearchCV(estimator=gbc,param_distributions=params,cv=3,n_iter=10,scoring='roc_auc',
                           random_state=4,n_jobs=-1)

# Search on the training part only: fitting on the full data would let the
# held-out test rows influence the chosen hyper-parameters, and the final
# model below is trained on the scaled x_train anyway.
print(r_search.fit(x_train,y_train))
print(r_search.best_params_)

In [None]:
# Refit a gradient boosting model with the best parameters found by the search,
# then report confusion matrix, accuracy and AUC on the train and test parts.
best_params = r_search.best_params_
gbc = GradientBoostingClassifier(random_state=4, **best_params)
gbc.fit(x_train, y_train)

# Hard labels and second-column probabilities for both parts.
y_train_pred = gbc.predict(x_train)
y_train_prob = gbc.predict_proba(x_train)[:, 1]
y_pred = gbc.predict(x_test)
y_prob = gbc.predict_proba(x_test)[:, 1]

train_cm = confusion_matrix(y_train, y_train_pred)
train_acc = accuracy_score(y_train, y_train_pred)
train_auc = roc_auc_score(y_train, y_train_prob)
print('train - confusion matrix : ', '\n', train_cm)
print('train - accuracy score : ', '\n', train_acc)
print('train - AUC : ', train_auc)

test_cm = confusion_matrix(y_test, y_pred)
test_acc = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)
print('test - confusion matrix : ', '\n', test_cm)
print('test - accuracy score : ', '\n', test_acc)
print('test - AUC : ', test_auc)

In [None]:
# ROC curve of the gradient boosting model on the training part
draw_roc(y_train,y_train_prob)

In [None]:
# ROC curve of the gradient boosting model on the test part
draw_roc(y_test,y_prob)