In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing required libraries
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported below.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, roc_auc_score, roc_curve
# Apply seaborn's 'darkgrid' theme to all figures drawn afterwards
sns.set_style('darkgrid')

# Step 1 : Reading and understanding data

In [None]:
# Load the stroke-prediction CSV from the Kaggle input mount into a DataFrame
stroke = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
stroke.shape

In [None]:
# Preview the first five rows to see the columns and some sample values
stroke.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the frame
stroke.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns
stroke.describe(include='all')

# Step 2: Checking for duplicates and Missing value treatment

**Here we check the `id` column for duplicates. `duplicated` flags every repeated id with `True`, so if the sum of these boolean flags equals zero, the data contains no duplicate records.**

In [None]:
# True when no id value occurs more than once (the sum of duplicate flags is zero)
stroke.duplicated(subset='id').sum() == 0

In [None]:
# Percentage of missing values per column, rounded to two decimals
stroke.isnull().sum().mul(100).div(len(stroke)).round(2)

__As we can see, the bmi column has null values. Let's inspect this column in more detail.__

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the bmi column
stroke['bmi'].describe()

In [None]:
# Distribution of bmi with vertical markers for its mean (red) and median (blue)
from scipy.stats import norm  # not used in this cell; kept in case later cells rely on it
plt.figure(figsize = [12,8])
sns.distplot(stroke.bmi, color='green')
plt.axvline(stroke['bmi'].mean(),label='mean',color='red')
# The blue line is the median; it was previously mislabelled 'mean' in the legend
plt.axvline(stroke['bmi'].median(),label='median',color='blue')
plt.legend()
plt.show()

__Since the mean and median are close, we can replace the null values with the median, which is not affected by outliers.__

In [None]:
# Replace missing bmi values with the column median.
# The median is computed from the data instead of the previously hard-coded
# 28.1, and the result is assigned back explicitly so the fill does not rely
# on in-place mutation through attribute access.
stroke['bmi'] = stroke['bmi'].fillna(stroke['bmi'].median())

In [None]:
# Re-count the nulls per column to confirm the imputation left none behind
stroke.isnull().sum()

__There are no null values in the dataset__

# Step 3: Exploratory Data Analysis

## a) Univariate analysis

In [None]:
# Data type of each column; used to tell numerical from categorical features
stroke.dtypes

In [None]:
# Recode the 0/1 flags of hypertension and heart_disease as 'No'/'Yes' so the
# columns are treated as categorical.
# `replace` leaves values it does not recognise untouched, so re-running this
# cell keeps the 'Yes'/'No' labels; `map` would turn them into NaN.
cols = ['hypertension','heart_disease']
for col in cols:
    stroke[col] = stroke[col].replace({1: 'Yes', 0: 'No'})

In [None]:
# Preview the first rows to confirm hypertension and heart_disease now hold 'Yes'/'No'
stroke.head()

In [None]:
# Names of all columns whose dtype is not object, i.e. the numerical ones
numerical = stroke.select_dtypes(exclude='object').columns.tolist()
numerical

In [None]:
# Names of all object-typed columns, i.e. the categorical ones
categorical = stroke.select_dtypes(include='object').columns.tolist()
categorical

In [None]:
# Distribution plot for each numerical column except the first one in the
# list, laid out on a 2x2 grid
plt.figure(figsize=[20,15])
for position, column in enumerate(numerical[1:], start=1):
    plt.subplot(2, 2, position)
    sns.distplot(stroke[column])
    plt.tight_layout()
plt.show()

In [None]:
# Count plot for each categorical column on a 4x2 grid.
# The series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, not as the x variable.
plt.figure(figsize=[20,15])
for position, column in enumerate(categorical, start=1):
    plt.subplot(4, 2, position)
    sns.countplot(x=stroke[column])
    plt.tight_layout()
plt.show()

## b) Bivariate analysis

In [None]:
# Scatter plot for age vs avg_glucose_level
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.

plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='age', y='avg_glucose_level', color='b')
plt.show()

In [None]:
# Scatter plot for age vs bmi
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.
plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='age', y='bmi', color='r')
plt.show()

In [None]:
# Scatter plot for avg_glucose_level vs bmi
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.
plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='avg_glucose_level', y='bmi', color='g')
plt.show()

In [None]:
# Quick look at the first rows of the current frame
stroke.head()


In [None]:
# Count plot for each categorical column, split by the stroke target, on a 4x2 grid.
# The series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, not as the x variable.
plt.figure(figsize=[20,15])
for position, column in enumerate(categorical, start=1):
    plt.subplot(4, 2, position)
    sns.countplot(x=stroke[column], hue=stroke['stroke'])
    plt.tight_layout()
plt.show()


In [None]:
# Pair plot for the data, coloured by the stroke target.
# pairplot builds its own figure, so no plt.figure() call precedes it; a
# preceding call would only emit an extra empty figure.
sns.pairplot(data = stroke, hue = 'stroke')
plt.show()

In [None]:
# Scatter plot for age vs avg_glucose_level, coloured by stroke
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.

plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='age', y='avg_glucose_level', hue='stroke')
plt.show()

In [None]:
# Scatter plot for age vs bmi, coloured by stroke
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.

plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='age', y='bmi', hue='stroke')
plt.show()

In [None]:
# Scatter plot for bmi vs avg_glucose_level, coloured by stroke
# x/y are passed by keyword: positional x/y is rejected by newer seaborn releases.

plt.figure(figsize=[12,8])
sns.scatterplot(data=stroke, x='bmi', y='avg_glucose_level', hue='stroke')
plt.show()

## c) Multivariate analysis

In [None]:
# Correlation matrix of the numeric columns.
# The frame also holds string columns, which DataFrame.corr() no longer drops
# silently in recent pandas, so the numeric columns are selected explicitly.
stroke.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlations between the numeric columns.
# Numeric columns are selected explicitly because the frame also holds string
# columns, which DataFrame.corr() no longer drops silently in recent pandas.
plt.figure(figsize=[12,8])
sns.heatmap(stroke.select_dtypes(include='number').corr(), cmap='RdYlGn', annot=True)
plt.show()

# Step 4: Outlier treatment

In [None]:
# Box plot for each numerical column except the first and last entries of the
# list, to inspect outliers.
# The series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, not as the x variable.
plt.figure(figsize=[20,15])
for position, column in enumerate(numerical[1:-1], start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x=stroke[column])
    plt.tight_layout()
plt.show()


In [None]:
# # Capping outliers 
# x = stroke.describe()
# for i in numerical[2:-1]:
#     q1=x.loc['25%',i]
#     q3=x.loc['75%',i]
#     iqr=q3-q1
#     uppl=q3+(1.5*iqr)
#     lowl=q1-(1.5*iqr)
#     stroke[i]=stroke[i].apply(lambda x:uppl if x>uppl else x )
#     stroke[i]=stroke[i].apply(lambda x: lowl if x<lowl else x)

In [None]:
# # Checking outliers in numerical columns

# plt.figure(figsize=[20,15])
# for col in enumerate(numerical[1:-1]):
#     plt.subplot(2,2,col[0]+1)
#     sns.boxplot(stroke[col[1]])
#     plt.tight_layout()
# plt.show()


In [None]:
# Row and column count of the frame at this point
stroke.shape

In [None]:
# Column dtypes before the dummy variables are created
stroke.dtypes

# Step 5: Creating dummies

In [None]:
# Dropping id column from the data frame.
# `columns=` replaces the positional axis argument, which recent pandas no
# longer accepts; errors='ignore' keeps the cell safe to re-run once id is gone.
stroke = stroke.drop(columns='id', errors='ignore')

In [None]:
# Preview the frame after dropping the id column
stroke.head()

In [None]:
# Features: every column except the target. `columns=` replaces the positional
# axis argument, which recent pandas no longer accepts.
X = stroke.drop(columns='stroke')
# Target kept as a single-column DataFrame
y = stroke[['stroke']]

Creating dummy variables for the categorical columns in X

In [None]:
# One-hot encode the categorical columns of X; drop_first=True drops the first
# level of each so the remaining dummies are not perfectly collinear
X = pd.get_dummies(X, drop_first= True)

In [None]:
# Correlation heatmap of the feature matrix after dummy encoding
plt.figure(figsize=[15,8])
sns.heatmap(data=X.corr(), annot=True, cmap='RdYlGn')
plt.show()

# Step 6: Train Test Split and Scaling

In [None]:
# 70/30 train/test split with a fixed seed.
# stratify=y keeps the share of stroke cases the same in both parts; the
# target is heavily imbalanced, so an unstratified split can skew the ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.3, stratify=y
)

In [None]:
# Shapes of the train/test feature matrices and targets
X_train.shape, X_test.shape,  y_train.shape,y_test.shape

In [None]:
# Column dtypes of the training features after encoding
X_train.dtypes

## Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Continuous features that get rescaled by the scaler below
cols_to_scale = ['age', 'avg_glucose_level', 'bmi']

In [None]:
# Min-max scale the continuous columns.
# The scaler learns its ranges from the training data only and then applies
# the same transformation to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train[cols_to_scale])

X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])


In [None]:
# Preview the training features after scaling
X_train.head()

In [None]:
# Preview the test features after scaling
X_test.head()

# Step 7: Model Building

## Base model

In [None]:
# Logistic regression with the liblinear solver; every other hyperparameter
# is left at its scikit-learn default
logreg = LogisticRegression(solver='liblinear')

In [None]:
# Fitting the base model on the training data.
# y_train is a single-column DataFrame; it is flattened to 1-D because
# scikit-learn classifiers expect a 1-D target.
logreg.fit(X_train, np.ravel(y_train))

In [None]:
# Class predictions of the base model on the training set
y_train_pred = logreg.predict(X_train)
y_train_pred

In [None]:
# Training-set accuracy of the base model
accuracy_score(y_train,y_train_pred)

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
# for every training row
Probabilities = logreg.predict_proba(X_train)[:,1]
Probabilities

In [None]:
# Training-set ROC AUC computed from the positive-class probabilities
roc_auc_score(y_train, Probabilities)

In [None]:
# Confusion matrix on the training set (rows: actual class, columns: predicted class)
confusion_matrix(y_train,y_train_pred)

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for a set of true labels and predicted scores.

    Uses `roc_curve` and `roc_auc_score` as imported with the other sklearn
    metrics at the top of the notebook, so the function does not depend on a
    `metrics` module import made in a later cell.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores or probabilities of the positive class.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # drop_intermediate=False keeps every threshold, giving the full curve
    fpr, tpr, _ = roc_curve(actual, probs, drop_intermediate=False)
    auc_score = roc_auc_score(actual, probs)

    plt.figure(figsize=(12, 8))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score)
    # Diagonal reference line of a random classifier
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
from sklearn import metrics

In [None]:
# ROC points of the base model on the training set.
# The first argument must be the true labels (y_train); passing the predicted
# labels would score the model against its own output.
fpr, tpr, thresholds = metrics.roc_curve( y_train, Probabilities, drop_intermediate = False )

In [None]:
# Plot the training-set ROC curve of the base model
draw_roc(y_train,Probabilities)

In [None]:
# Share of each target class, to see how imbalanced the data is
stroke.stroke.value_counts(normalize=True)

In [None]:
from imblearn.over_sampling  import SMOTE
# Oversample only the minority class. random_state fixes the synthetic samples
# so the resampled data and all downstream metrics are reproducible; the seed
# matches the one used for the train/test split.
smote = SMOTE(sampling_strategy='minority', random_state=100)

In [None]:
# Apply SMOTE to the training data only; the test set is not resampled here
X_train_sm, y_train_sm = smote.fit_resample(X_train,y_train)

In [None]:
# Refit the same estimator on the SMOTE-resampled training data; this replaces
# the coefficients of the base model fitted earlier.
# The target is flattened to 1-D because scikit-learn classifiers expect a
# 1-D target.
logreg.fit(X_train_sm, np.ravel(y_train_sm))

In [None]:
# Class predictions of the refit model on the resampled training set
y_pred_train_sm = logreg.predict(X_train_sm)

In [None]:
# Accuracy on the resampled training set
accuracy_score(y_train_sm, y_pred_train_sm)

In [None]:
# Per-class precision, recall and F1 on the resampled training set
print(classification_report(y_true=y_train_sm, y_pred=y_pred_train_sm))

In [None]:
# Confusion matrix on the resampled training set (rows: actual class, columns: predicted class)
confusion_matrix(y_train_sm,y_pred_train_sm)