# Stroke Prediction 

According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.

--Dataset Copyrights: https://www.kaggle.com/fedesoriano

In [None]:
import warnings 
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reading the dataset

data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
data.head()

In [None]:
data.info()

In [None]:
# Null value percentage in bmi column:

data['bmi'].isnull().sum() / len(data) * 100

In [None]:
# Since the null value percentage is too low, we can drop the records.

data['bmi'].value_counts()

In [None]:
# data['bmi'].median()

In [None]:
# Keep only the records whose bmi value is present.
data = data[data['bmi'].notna()]

In [None]:
# data['bmi'] = data['bmi'].fillna(data['bmi'].median())

In [None]:
data.info()

In [None]:
data['smoking_status'].value_counts()

In [None]:
len(data[data['smoking_status'] == 'Unknown']) / len(data) * 100

#### Since the null values are already categorized as 'Unknown', we will keep it as such and proceed.

In [None]:
data.head()

In [None]:
data['work_type'].value_counts()

In [None]:
data['Residence_type'].value_counts()

In [None]:
data['ever_married'].value_counts()

In [None]:
data['gender'].value_counts()

In [None]:
# Only one record has gender 'Other', so we drop it; keeping it would create
# one extra dummy variable during model building.
# The filter is an explicit `!=` comparison instead of negating the mask with
# unary minus, which is not a logical NOT on plain NumPy boolean arrays.

data = data[data['gender'] != 'Other']

In [None]:
data['gender'].value_counts()

In [None]:
data['stroke'].value_counts()

In [None]:
data['hypertension'].value_counts()

#### We can see that the column age has values which will be easy to handle if made into buckets

In [None]:
data['age'].describe()

In [None]:
data.age.loc[(data.age > 100)]

In [None]:
data.age.loc[(data.age < 0)].count()

In [None]:
data.age.loc[(data.age > 80)].count()

In [None]:
data.age.loc[(data.age > 90)].count()

In [None]:
data.age.loc[(data.age < 10) & (data.stroke == 1)].count()

In [None]:
# Distribution of age: histogram with a KDE overlay.
# displot is used instead of the deprecated distplot, matching the other
# distribution plots in this notebook.
sns.displot(data.age, kde=True)
plt.show()

In [None]:
bins = [0, 20, 40, 60, 80, 1000]
labels = ['0-20', '20-40', '40-60', '60-80', '80+']
data['AgeGroup'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)

In [None]:
data.head()

In [None]:
data['AgeGroup'].value_counts()

In [None]:
data.drop('age', axis=1, inplace=True)

In [None]:
data.head()

In [None]:
sns.displot(data.avg_glucose_level)
plt.show()

In [None]:
data.avg_glucose_level.describe()

#### As we can see, the minimum blood sugar level is around 55 which is completely normal. Hence we will create category accordingly.

In [None]:
bins = [0, 100, 150, 200, 250, 1000]
labels = ['< 100', '100-150', '150-200', '200-250', '250+']
data['GlucoseLevelRange'] = pd.cut(data['avg_glucose_level'], bins=bins, labels=labels, right=False)

In [None]:
data.head()

In [None]:
data.GlucoseLevelRange.value_counts()

In [None]:
data.drop('avg_glucose_level', axis=1, inplace=True)

In [None]:
data.head()

In [None]:
# We can drop the id column as it will be of no use.

data.drop('id', axis=1, inplace=True)

In [None]:
data.head()

In [None]:
# Handling bmi:

data['bmi'].describe()

- According to https://www.nhlbi.nih.gov/health/educational/lose_wt/BMI/bmi_tbl.pdf, BMI is divided into 4 categories.

- Let's check our value ranges


In [None]:
sns.displot(data.bmi)
plt.show()

In [None]:
# Box plot of bmi to make outliers visible.
# The series is passed as x= explicitly: the meaning of the first positional
# argument of boxplot differs between seaborn releases.
sns.boxplot(x=data.bmi)
plt.show()

In [None]:
len(data[data['bmi'] > 65])

- It is clearly seen that few values are crossing the general range.
- But according to sources our maximum BMI value present(97.6) is a possible one. Hence we won't remove the values.

In [None]:
data.head()

In [None]:
bins = [0, 19, 25, 30, 40, 1000]
labels = ['Underweight', 'Normal', 'Overweight', 'Obese', 'ExtObese']
data['BMIGroup'] = pd.cut(data['bmi'], bins=bins, labels=labels, right=False)

In [None]:
data.drop('bmi', axis=1, inplace=True)

In [None]:
data.head()

#### Since our dataset is cleaned, we can proceed with preparing the data.

### Data Preparation:


In [None]:
data.info()

In [None]:
var_list = ['ever_married']
# Defining the map function
def binary_map(x):
    """Encode a Yes/No column as 1/0.

    x is expected to be a pandas Series of 'Yes'/'No' strings (it is applied
    column-wise through DataFrame.apply). Values with no entry in the lookup
    are returned as NaN by Series.map.
    """
    yes_no_codes = {'Yes': 1, "No": 0}
    return x.map(yes_no_codes)

# Applying the function
data[var_list] = data[var_list].apply(binary_map)

In [None]:
data.tail()

In [None]:
# Creating dummy variables for all categorical variables:

dummy1 = pd.get_dummies(data[['gender', 'work_type', 'Residence_type', 'smoking_status', 'AgeGroup', 'GlucoseLevelRange', 'BMIGroup']], drop_first=True)

In [None]:
dummy1.head()

In [None]:
data = pd.concat([data, dummy1], axis=1)

In [None]:
data.head()

In [None]:
strokedf = data.copy()

In [None]:
strokedf = strokedf.drop(['gender', 'work_type', 'Residence_type', 'smoking_status', 'AgeGroup', 'GlucoseLevelRange', 'BMIGroup'], axis=1)

In [None]:
strokedf.head()

In [None]:
strokedf.info()

In [None]:
strokedf.isnull().sum()

### Test-Train-Split:

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Assigning all the feature variables to X:
X = strokedf.drop(['stroke'], axis=1)

In [None]:
# Assigning the target variable to y:
y = strokedf['stroke']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Splitting data into Test & Train set:

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

### Since we don't have any continuous variables, we can skip the feature scaling part.

In [None]:
stroke = (sum(strokedf['stroke'])/len(strokedf['stroke'].index))*100
stroke

#### We have a 4.3% Stroke rate

In [None]:
strokedf.stroke.value_counts()

In [None]:
# We can reduce the non-stroke sample number to balance the ratio:

# NOTE(review): sample() without n/frac draws a single row, and shuffled_data
# is not read anywhere else in the visible notebook - confirm it is still needed.
shuffled_data = strokedf.sample(random_state=4)
# Keep every stroke record ...
stroke_yes = strokedf.loc[strokedf['stroke'] == 1]
# ... and a reproducible random subset of 1500 non-stroke records.
stroke_no = strokedf.loc[strokedf['stroke'] == 0].sample(n= 1500,random_state= 101)

# NOTE(review): the train/test split is taken from strokedf, not from this
# rebalanced frame; in the visible notebook norm_strokedf only feeds the rate
# check, the count plot and the correlation heatmap - confirm this is intended.
norm_strokedf = pd.concat([stroke_yes, stroke_no])

In [None]:
norm_strokedf.head()

In [None]:
stroke1 = (sum(norm_strokedf['stroke'])/len(norm_strokedf['stroke'].index))*100
stroke1

In [None]:
# Class balance of the undersampled frame.
# The column is passed as x= because current seaborn releases do not accept it
# positionally alongside data=.
sns.countplot(x='stroke', data=norm_strokedf, palette="colorblind")
plt.title('Stroke Analysis')
plt.show()

### Let's see the correlation:


In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Let's see the correlation matrix 
plt.figure(figsize = (20,10))        # Size of the figure
sns.heatmap(norm_strokedf.corr(),annot = True)
plt.show()

In [None]:
norm_strokedf.columns

In [None]:
# Lets remove some highly correlated variables:

X_test = X_test.drop(['work_type_children','work_type_Private','BMIGroup_Overweight', 'BMIGroup_Normal', 'AgeGroup_20-40', 'smoking_status_never smoked'], axis=1)
X_train = X_train.drop(['work_type_children','work_type_Private','BMIGroup_Overweight', 'BMIGroup_Normal', 'AgeGroup_20-40', 'smoking_status_never smoked'], axis=1)

In [None]:
plt.figure(figsize = (20,10))
sns.heatmap(X_train.corr(),annot = True)
plt.show()

### Model Building:



In [None]:
import statsmodels.api as sm

In [None]:
# Logistic regression model
logmod1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logmod1.fit().summary()

### Feature Selection using RFE:

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination with logreg as the estimator, keeping 10 features.
# n_features_to_select is passed by keyword: current scikit-learn releases
# no longer accept it as a positional argument.
rfe = RFE(logreg, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logmod2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logmod2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

##### Creating a dataframe with the actual stroke flag and the predicted probabilities

In [None]:
y_train_pred_final = pd.DataFrame({'Stroke':y_train.values, 'Stroke_Prob':y_train_pred})
y_train_pred_final['pID'] = y_train.index
y_train_pred_final.head()

##### Creating new column 'predicted' with 1 if Stroke_Prob > 0.3 else 0

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Stroke_Prob.map(lambda x: 1 if x > 0.3 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Stroke, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Stroke, y_train_pred_final.predicted))

#### Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Dropping variable with high p-value:
# col is a pandas Index, so drop() takes only the label(s); Index.drop has no
# axis argument (a second positional value would be read as `errors`).

col = col.drop('ever_married')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logmod3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logmod3.fit()
res.summary()

In [None]:
# Dropping variable with high p-value:
# col is a pandas Index, so drop() takes only the label(s); Index.drop has no
# axis argument (a second positional value would be read as `errors`).

col = col.drop('smoking_status_formerly smoked')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logmod4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logmod4.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

##### Creating a dataframe with the actual stroke flag and the predicted probabilities

In [None]:
y_train_pred_final = pd.DataFrame({'Stroke':y_train.values, 'Stroke_Prob':y_train_pred})
y_train_pred_final['pID'] = y_train.index
y_train_pred_final.head()

##### Creating new column 'predicted' with 1 if Stroke_Prob > 0.3 else 0

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Stroke_Prob.map(lambda x: 1 if x > 0.3 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Stroke, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Stroke, y_train_pred_final.predicted))

#### Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train[col].columns
vif['VIF'] = [variance_inflation_factor(X_train[col].values, i) for i in range(X_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

### Sensitivity and Specificity

In [None]:
TP = confusion[1,1] # True Positive 
TN = confusion[0,0] # True Negatives
FP = confusion[0,1] # False Positives
FN = confusion[1,0] # False Negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting stroke when the patient does not have a stroke
print(FP/ float(TN+FP))

In [None]:
# positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

### Plotting the ROC Curve

In [None]:
def draw_roc(actual, probs):
    """Plot the ROC curve of `probs` against `actual`, with the AUC in the legend.

    actual: true labels, passed to metrics.roc_curve / metrics.roc_auc_score.
    probs:  predicted scores for the same observations.

    Shows the figure with plt.show() and returns None.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)
    legend_text = 'ROC curve (area = %0.2f)' % area

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)
    # Dashed black diagonal from (0, 0) to (1, 1) for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Stroke, y_train_pred_final.Stroke_Prob, drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Stroke, y_train_pred_final.Stroke_Prob)

##### The curve looks good

### Finding Optimal Cutoff Point

Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.Stroke_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity of the train-set predictions at each
# probability cutoff, collected into one table.
from sklearn.metrics import confusion_matrix
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])

# Confusion-matrix indexing used below (same as earlier in the notebook):
#   [0,0] true negatives   [0,1] false positives
#   [1,0] false negatives  [1,1] true positives

num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Stroke, y_train_pred_final[i])
    total1 = cm1.sum()
    # Diagonal = correctly classified records.
    accuracy = cm1.trace() / total1

    # Row 0 holds the actual negatives, row 1 the actual positives.
    speci = cm1[0, 0] / cm1[0].sum()
    sensi = cm1[1, 1] / cm1[1].sum()
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### From the curve above, 0.08 is the optimum point to take it as a cutoff probability.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Stroke_Prob.map( lambda x: 1 if x > 0.08 else 0)

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Stroke, y_train_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Stroke, y_train_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # True Positive 
TN = confusion2[0,0] # true Negatives
FP = confusion2[0,1] # False Positives
FN = confusion2[1,0] # False Negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting stroke when the patient does not have a stroke
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

#### We will also check the Precision-Recall for our model and make final call on which method to go with


## Precision and Recall

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Stroke, y_train_pred_final.predicted )
confusion

#### Precision
TP / (TP + FP)

In [None]:
confusion[1,1]/(confusion[0,1]+confusion[1,1])

#### Recall
TP / (TP + FN)

In [None]:
confusion[1,1]/(confusion[1,0]+confusion[1,1])

##### Using sklearn metrics

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
precision_score(y_train_pred_final.Stroke, y_train_pred_final.predicted)

In [None]:
recall_score(y_train_pred_final.Stroke, y_train_pred_final.predicted)

### Precision and Recall tradeoff

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# y_train_pred_final.Stroke, y_train_pred_final.predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Stroke, y_train_pred_final.Stroke_Prob)

In [None]:
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.show()

###  Making predictions on the Test set

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

Making predictions on the test set

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Putting pID to index
y_test_df['pID'] = y_test_df.index

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Stroke_Prob'})

In [None]:
# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Stroke_Prob.map(lambda x: 1 if x > 0.08 else 0)

In [None]:
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.stroke, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.stroke, y_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the confusion matrix computed on the test-set predictions.
TP = confusion2[1,1] # True Positives
TN = confusion2[0,0] # True Negatives
FP = confusion2[0,1] # False Positives
FN = confusion2[1,0] # False Negatives

#### We go with the Sensitivity-Specificity evaluation, as Precision-Recall was not well suited to our dataset

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
X_test.columns.values

In [None]:
res.summary()