In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read and check data

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory and preview
# the first rows.
stroke_csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke_data = pd.read_csv(stroke_csv_path)
stroke_data.head()

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
stroke_data.info()

# Some visual analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Overlay the age distributions of rows with stroke == 0 ("Healthy") and
# stroke == 1 ("Stroke") as filled kernel-density estimates.
# `fill=True` replaces the deprecated `shade=True` keyword and matches the
# `fill` usage in the other plots of this notebook.
sns.kdeplot(data = stroke_data[stroke_data["stroke"]==0],x = "age",fill = True,label = "Healthy")
sns.kdeplot(data = stroke_data[stroke_data["stroke"]==1],x = "age",fill = True,label = "Stroke")
plt.legend()


## Conclusions:
* Most stroke patients are roughly above 40 years of age and married.
* The chance of getting a stroke is lower for people below the age of 25.
* People below the age of 30 don't get married that often

In [None]:

# Overlay the average-glucose-level distributions of rows with stroke == 0
# ("Healthy") and stroke == 1 ("Stroke") as filled kernel-density estimates.
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(data = stroke_data[stroke_data['stroke'] == 0], x = 'avg_glucose_level', fill = True,   label = "Healthy")
sns.kdeplot(data = stroke_data[stroke_data['stroke'] == 1], x = 'avg_glucose_level', fill = True,   label = "Stroke")
plt.legend()

In [None]:
# Joint density of the stroke label (x) and age (y), one filled contour set per
# residence type. `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(x = stroke_data.stroke,y = stroke_data.age,hue = stroke_data.Residence_type,fill = True,alpha = 0.5)

## Conclusion:
* It doesn't matter whether you live in an urban or a rural area.

In [None]:
# Joint density of the stroke label (x) and age (y), one filled contour set per
# work type, drawn on its own figure-level grid.
sns.displot(
    data=stroke_data,
    x='stroke',
    y='age',
    hue='work_type',
    kind='kde',
    fill=True,
    alpha=0.5,
)

## Conclusion
* People who work in government jobs do not appear to be prone to stroke.

In [None]:
# Count smoking-status categories separately for rows without (h) and with (s)
# a stroke, then draw one bar chart per group in a two-row layout.
h = stroke_data.loc[stroke_data['stroke'] == 0, 'smoking_status'].value_counts()
s = stroke_data.loc[stroke_data['stroke'] == 1, 'smoking_status'].value_counts()

panels = [(h, "healthy", "green"), (s, "stroke", "magenta")]
for row, (counts, group_label, bar_color) in enumerate(panels, start=1):
    plt.subplot(2, 1, row)
    plt.bar(counts.index, height=counts.values, width=0.2, label=group_label, color=bar_color)
    plt.legend()



#sns.histplot(data = stroke_data,x = 'stroke',y = 'age',hue = 'smoking_status')

In [None]:
# Bivariate view of age (x) against gender (y), coloured by the stroke label.
sns.displot(data = stroke_data,x = 'age',y = 'gender',hue = 'stroke')

In [None]:
# Average-glucose-level distributions for stroke == 0 ("Healthy") and
# stroke == 1 ("Stroke") as filled kernel-density estimates.
# `fill=True` replaces the deprecated `shade=True` keyword.
# NOTE(review): this cell repeats an earlier glucose-level plot of this notebook.
sns.kdeplot(data = stroke_data[stroke_data['stroke'] == 0], x = 'avg_glucose_level', fill = True,   label = "Healthy")
sns.kdeplot(data = stroke_data[stroke_data['stroke'] == 1], x = 'avg_glucose_level', fill = True,   label = "Stroke")
plt.legend()

In [None]:
# Joint density of BMI (x) and age (y), one filled contour set per stroke label.
sns.kdeplot(data=stroke_data, x='bmi', y='age', hue='stroke', fill=True, alpha=0.8)

# Conclusion text drawn at data coordinates (-5, -70) with the font settings below.
conclusion_text = "Conclusion \n    For people who had a stroke, the bmi seems to have a major role for people aged 40-80 with single dominating peak \n   Younger people tend to have lower bmi and older people tend to have bmi in the range 25-35 indicated by the double peak."
conclusion_font = {
    'color': "red",
    'fontfamily': "serif",
    'fontsize': 14,
    'fontweight': 5,
    'linespacing': 1.5,
}
plt.text(-5, -70, conclusion_text, conclusion_font)

## Conclusion:
* For people who had a stroke, the bmi seems to have a major role for people aged 40-80 with single dominating peak
* Younger people tend to have lower bmi and older people tend to have bmi in the range 25-35 indicated by the double peak.

In [None]:
# Joint density of age (x) and average glucose level (y), one filled contour
# set per stroke label.
sns.kdeplot(data=stroke_data, x='age', y='avg_glucose_level', hue='stroke', fill=True, alpha=0.8)

# Conclusion text drawn at data coordinates (-20, -150) with the font settings below.
conclusion_text = "Conclusion \n Aged people with irregular blood glucose(high or low) tend to be prone to stroke"
conclusion_font = {
    'color': "red",
    'fontfamily': "serif",
    'fontsize': 14,
    'fontweight': 5,
    'linespacing': 1.5,
}
plt.text(-20, -150, conclusion_text, conclusion_font)

In [None]:
# Joint density of BMI (x) and average glucose level (y), one filled contour
# set per stroke label.
sns.kdeplot(data = stroke_data,x = 'bmi',y = 'avg_glucose_level',hue = 'stroke',fill = True,alpha = 0.8)


In [None]:
# Pairwise correlation of the numeric columns, then the column of correlations
# with the stroke label.
# The frame is restricted to numeric dtypes first: on pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead. On older pandas the result is the same as a bare .corr().
correlation_matrix = stroke_data.select_dtypes(include = "number").corr()
correlation_matrix["stroke"]

# Cleaning the data

In [None]:
# Keep an independent copy of the target column, then build the feature frame
# by removing the target together with the bmi, Residence_type and id columns.
stroke_labels = stroke_data["stroke"].copy()
dropped_columns = ["bmi", "Residence_type", "id", "stroke"]
stroke_data_drop = stroke_data.drop(columns=dropped_columns)

In [None]:
# Preview the first rows of the reduced feature frame.
stroke_data_drop.head()

## One hot encoding the categorical variables

In [None]:
import sklearn
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the four categorical feature columns on their own and show the
# encoded result as the cell output.
stroke_data_cat = stroke_data_drop[
    ["gender", "ever_married", "work_type", "smoking_status"]
]
encoder = OneHotEncoder()
encoder.fit(stroke_data_cat)
stroke_1hot = encoder.transform(stroke_data_cat)
stroke_1hot

In [None]:
# Category values learned by the fitted encoder, one array per encoded column.
encoder.categories_

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Column groups: categorical columns are one-hot encoded, numeric columns are
# standardised.
cat_attribs = ["gender", "ever_married", "work_type", "smoking_status"]
num_attributes = ["age", "hypertension", "heart_disease", "avg_glucose_level"]

# Numeric branch: a single standard-scaling step.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

# Full preprocessing: numeric branch first, then the one-hot encoded categoricals.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attributes),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

# Fit the transformers on the feature frame and produce the model-ready matrix.
stroke_prepared = full_pipeline.fit_transform(stroke_data_drop)

# Machine Learning Classifier

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 shuffle splitter, so both parts keep the stroke ratio.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

# Only one train/test partition is used downstream. As before, it is the last
# of the five generated splits; the earlier ones are discarded.
*_, (train_index, test_index) = sss.split(stroke_prepared, stroke_labels)

# The splitter yields positional indices, so the labels are selected with .iloc
# rather than label-based [] lookup (which only agrees for a default RangeIndex).
X_train, X_test = stroke_prepared[train_index], stroke_prepared[test_index]
y_train, y_test = stroke_labels.iloc[train_index], stroke_labels.iloc[test_index]

# Report the partition sizes instead of dumping every raw index.
print("TRAIN:", len(train_index), "TEST:", len(test_index))

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with five hidden layers (1000-500-500-100-10 units),
# logistic activations and the Adam solver, fitted on the training split.
clf = MLPClassifier(
    hidden_layer_sizes=(1000, 500, 500, 100, 10),
    activation='logistic',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    max_iter=200,
    shuffle=True,
    random_state=1,
    verbose=False,
)
clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions from 3-fold cross-validation on the training split:
# each sample is predicted by a copy of clf that was fitted on the other folds.
y_pred = cross_val_predict(clf,X_train,y_train,cv = 3)
# NOTE(review): mean_squared_error is imported here but not used in the visible cells.
from sklearn.metrics import mean_squared_error
#clf.score(X_test,y_test)


In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes, for the
# cross-validated MLP predictions on the training split.
confusion_matrix(y_train,y_pred)

## Confusion Matrix
The confusion matrix shows that the classifier is not doing its job at all. This is because the data is highly skewed: 95% of the data falls under the "non-stroke" category and only 5% of the data is labelled "stroke". This makes it possible for even the dumbest classifier to score highly, since all it has to do is classify everyone as a "non-stroke" candidate to reach 95% accuracy. This also shows that prediction accuracy is not a suitable measure for this type of problem.


The confusion matrix, on the other hand, gives us a clearer picture. The model correctly classifies 3889 samples as "non-stroke" ($\textit{True-Negative}$) but it also classifies the 199 "stroke" cases as "non-stroke" ($\textit{False-Negative}$). Note that both the "non-stroke" cases classified as "stroke" ($\textit{False-Positive}$) and the "stroke" cases classified correctly ($\textit{True-Positive}$) are zero. This emphasises the point discussed above.

This gives us reason to either develop more powerful models or manipulate the data in such a way as to get a better model out of it. A powerful model with such small data will be prone to overfitting. Therefore we need to augment our data to get a better model.

## Using a different model to try classification
* From the sklearn, we use the stochastic gradient descent model to try and improve our prediction

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent (fixed seed).
sgd_clf = SGDClassifier(random_state = 2)
# Out-of-fold class predictions from 3-fold cross-validation on the training split.
y_sgd_pred = cross_val_predict(sgd_clf,X_train,y_train,cv=3)
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_train,y_sgd_pred)


In [None]:
# Out-of-fold decision-function scores (not class labels) for the SGD
# classifier; these continuous scores are what the threshold analysis uses.
y_scores = cross_val_predict(sgd_clf,X_train,y_train,cv=3,method = "decision_function")

## Precision and Recall
* Precision is the accuracy of the positive predictions: how many of the predicted ***Positives*** are actually ***Positive*** for "stroke"? Precision is given as:

$$\frac{\text{True Positive}}{\text{True Positive} + \text{False Positive}}$$

* Recall is how many of the actual "stroke" cases are classified as "stroke" by the model.
Note that the actual "stroke" cases comprise both the ***True Positives*** and the ***False Negatives***:
$$\frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}}$$


In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the SGD scores at every candidate threshold.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

# Both curves carry one more point than `thresholds`, so the final point is
# dropped before plotting against the thresholds.
curves = ((precisions, 'b--', "Precision"), (recalls, 'g-', "Recall"))
for values, line_format, curve_label in curves:
    plt.plot(thresholds, values[:-1], line_format, label=curve_label)
plt.legend()
plt.show()

In [None]:
# Precision plotted directly against recall (one point per threshold).
plt.plot(recalls,precisions)
plt.xlabel("Recall",fontsize = 14)
plt.ylabel("Precision",fontsize = 14)

We see that because of the disparity in the data, as the ***Recall*** increases, there is a drastic change in ***Precision***. Since this model might be used for diagnostic purposes, we can afford to err on the side of caution. That is, we can afford to have ***false positives*** for "stroke", but with ***false negatives*** we miss actual cases and someone's life may be in danger. Therefore we can change the threshold of the model so that the precision of the model takes a hit but we make sure that most of the people prone to "stroke" are identified. Let's say we want the Recall value at 90% and see how the model behaves.

In [None]:
# precision_recall_curve returns recall in non-increasing order (its thresholds
# increase), so `np.argmax(recalls >= 0.90)` is always index 0: the lowest
# threshold, which flags every sample as positive. The threshold wanted here is
# the HIGHEST one that still reaches 90% recall, i.e. the last index that
# satisfies the condition. `recalls` has one more entry than `thresholds`
# (a trailing 0), hence the [:-1].
idx_90_recall = np.flatnonzero(recalls[:-1] >= 0.90)[-1]
threshold_90_recall = thresholds[idx_90_recall]
threshold_90_recall

In [None]:
# Boolean predictions: True where the decision score reaches the chosen threshold.
y_train_pred_90 = (y_scores >= threshold_90_recall)
y_train_pred_90

In [None]:
from sklearn.metrics import recall_score,precision_score

# Recall and precision of the thresholded predictions. Both values are printed
# explicitly: a notebook cell only displays its last expression, so the recall
# value was previously computed and silently discarded.
recall_90 = recall_score(y_train,y_train_pred_90)
precision_90 = precision_score(y_train,y_train_pred_90)
print("Recall:    {:.3f}".format(recall_90))
print("Precision: {:.3f}".format(precision_90))

We see that our precision takes a hard hit. We might as well classify everyone as "stroke" prone. This is a very bad model and negates the whole purpose of building a model in the first place. Let's explore more avenues to come up with a reasonable model.

In [None]:
from sklearn.metrics import roc_curve
# ROC curve of the SGD decision scores on the training split.
# Note: this rebinds `thresholds` to the ROC thresholds.
fpr,tpr,thresholds = roc_curve(y_train,y_scores)
plt.tight_layout()
plt.plot(fpr,tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")

The higher the recall (TPR), the more false positives (FPR) the classifier produces. A good classifier stays as far away from the diagonal line as possible (towards the top-left corner).

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
# Random forest with default hyper-parameters and a fixed seed.
forest_clf = RFC(random_state = 2)
# Out-of-fold class predictions from 5-fold cross-validation on the training split.
y_train_pred_forest = cross_val_predict(forest_clf,X_train,y_train,cv=5)
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_train,y_train_pred_forest)

In [None]:
# Out-of-fold class probabilities from 5-fold CV; the last column is used as
# the score of the positive ("stroke") class.
y_proba_forest = cross_val_predict(forest_clf,X_train,y_train,cv=5,method = "predict_proba")
y_scores_forest = y_proba_forest[:,-1]

# The ROC thresholds are not used, so they get their own name instead of being
# overwritten by the precision/recall thresholds right away.
fpr_forest,tpr_forest,roc_thresholds_forest = roc_curve(y_train,y_scores_forest)
prec_forest,rec_forest,thresholds_forest = precision_recall_curve(y_train,y_scores_forest)

# Figure 1: precision and recall against the decision threshold. Both curves
# carry one more point than the thresholds, hence the [:-1].
plt.plot(thresholds_forest,prec_forest[:-1],'b--',label = "precision")
plt.plot(thresholds_forest,rec_forest[:-1],'g-',label = "recall")
plt.xlabel("threshold")
plt.legend()  # the labels above are only drawn when a legend is requested

# Figure 2: ROC curve with a dashed diagonal reference line.
plt.figure()
plt.plot(fpr_forest,tpr_forest)
plt.plot([0,1],[0,1],'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate (Recall)")

In [None]:
from sklearn.metrics import roc_auc_score #area under the roc curve
# Area under the ROC curve for the cross-validated random-forest scores.
roc_auc_score(y_train,y_scores_forest)

The area under the ROC curve gives us the performance of the classifier. A perfect classifier will have a value of 1 and a random classifier will have a value of 0.5, as indicated by the diagonal line above. The Random Forest classifier performs slightly better than the previous model.


# Data Augmentation and Training.
Using Data Augmentation from [Nikunj Malpani's](https://www.kaggle.com/nikunjmalpani/stroke-prediction-step-by-step-guide) notebook.

In [None]:
# Oversample with SMOTE (from imbalanced-learn) and plot the resulting class counts.
from imblearn.over_sampling import SMOTE
sampler = SMOTE(random_state = 42)
# NOTE(review): the sampler is fitted on the FULL prepared dataset (all rows),
# not only on a training split. Any evaluation split drawn from X, y afterwards
# can contain synthetic points derived from rows on the other side of the split.
X = stroke_prepared
y = stroke_labels
# From here on X and y are rebound to the resampled features and labels.
X,y= sampler.fit_resample(X,y.values.ravel())
# Wrap the resampled labels in a frame so the class counts can be plotted.
y_cat = pd.DataFrame({'stroke':y})
sns.countplot(data = y_cat, x = 'stroke', y= None)
plt.show()

In [None]:
# Split the ORIGINAL (un-augmented) data first and oversample only the training
# part. Resampling before the split lets SMOTE interpolate between rows that
# end up on opposite sides of it, so the test set would no longer be held out
# and its scores would be inflated. The test set keeps the real class balance.
# As in the earlier split, the last of the five generated partitions is kept.
*_, (train_index, test_index) = sss.split(stroke_prepared, stroke_labels)
print("TRAIN:", len(train_index), "TEST:", len(test_index))

# Positional indexing on a plain array avoids label-based Series lookup.
labels_array = stroke_labels.to_numpy()
X_test, y_test = stroke_prepared[test_index], labels_array[test_index]
X_train, y_train = sampler.fit_resample(stroke_prepared[train_index], labels_array[train_index])

# NOTE(review): cross-validating on already-oversampled rows can still be
# optimistic, because synthetic points may share neighbours across folds.
# Resampling inside each fold (e.g. an imblearn Pipeline) would avoid that.
y_pred_clf = cross_val_predict(clf,X_train,y_train,cv = 3)
confusion_matrix(y_train,y_pred_clf)

Let's now observe how our earlier models perform on the new dataset.

In [None]:
# Out-of-fold class probabilities of the MLP from 3-fold cross-validation.
y_scores_clf = cross_val_predict(clf,X_train,y_train,cv=3,method = "predict_proba")

In [None]:
y_scores = y_scores_clf[:,-1]
fpr,tpr,thresholds = roc_curve(y_train,y_scores)
prec,rec,thresholds = precision_recall_curve(y_train,y_scores)
plt.plot(thresholds,prec[:-1],'b--',label = "precision")
plt.plot(thresholds,rec[:-1],'g-',label = "recall")
plt.xlabel("threshold")
plt.figure()
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'k--')
a_roc = roc_auc_score(y_train,y_scores)
plt.text(0.6,0.2,"A_roc = {:3f}".format(a_roc),{'fontsize': 16})

In [None]:
# Out-of-fold decision-function scores of the SGD classifier (3-fold CV).
# Note: this rebinds `y_scores`.
y_scores = cross_val_predict(sgd_clf,X_train,y_train,cv=3,method = "decision_function")

In [None]:
fpr,tpr,thresholds = roc_curve(y_train,y_scores)
prec,rec,thresholds = precision_recall_curve(y_train,y_scores)
plt.plot(thresholds,prec[:-1],'b--',label = "precision")
plt.plot(thresholds,rec[:-1],'g-',label = "recall")
plt.xlabel("threshold")
plt.figure()
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'k--')
a_roc = roc_auc_score(y_train,y_scores)
plt.text(0.6,0.2,"A_roc = {:3f}".format(a_roc),{'fontsize':16})

In [None]:
# Out-of-fold class probabilities of the random forest (5-fold CV); the last
# column is kept as the positive-class score. Note: this rebinds `y_scores`.
y_proba_forest = cross_val_predict(forest_clf,X_train,y_train,cv=5,method = "predict_proba")
y_scores = y_proba_forest[:,-1]


In [None]:
# Curves for whatever scores are currently bound to `y_scores`.
fpr,tpr,thresholds = roc_curve(y_train,y_scores)
# `thresholds` is rebound here to the precision/recall thresholds.
prec,rec,thresholds = precision_recall_curve(y_train,y_scores)

# Figure 1: precision and recall against the decision threshold. Both curves
# carry one more point than the thresholds, hence the [:-1].
plt.plot(thresholds,prec[:-1],'b--',label = "precision")
plt.plot(thresholds,rec[:-1],'g-',label = "recall")
plt.xlabel("threshold")
plt.legend()  # the labels above are only drawn when a legend is requested

# Figure 2: ROC curve with a dashed diagonal reference line and the AUC value.
plt.figure()
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'k--')
a_roc = roc_auc_score(y_train,y_scores)
# "{:.3f}" gives three decimals; the former "{:3f}" only set a minimum width.
plt.text(0.6,0.2,"A_roc = {:.3f}".format(a_roc),{'fontsize':16})

# Test Data Predictions
Since the Random Forest was the best model among all the three models tested we will use that model for Test Data prediction and evaluate our scores.

In [None]:
# Evaluate on the held-out test split: fit the forest on the training data and
# score the test rows with that fitted model. Cross-validating on
# (X_test, y_test) would instead train fresh models on parts of the test set
# and never evaluate what was learned from the training data.
forest_clf.fit(X_train,y_train)
y_proba_forest_test = forest_clf.predict_proba(X_test)
# Last column = score of the positive class.
y_scores = y_proba_forest_test[:,-1]

In [None]:
# Test-set curves for whatever scores are currently bound to `y_scores`.
fpr,tpr,thresholds = roc_curve(y_test,y_scores)
# `thresholds` is rebound here to the precision/recall thresholds.
prec,rec,thresholds = precision_recall_curve(y_test,y_scores)

# Figure 1: precision and recall against the decision threshold. Both curves
# carry one more point than the thresholds, hence the [:-1].
plt.plot(thresholds,prec[:-1],'b--',label = "precision")
plt.plot(thresholds,rec[:-1],'g-',label = "recall")
plt.xlabel("Threshold")
plt.ylabel("Precission/Recall")
plt.legend()  # the labels above are only drawn when a legend is requested

# Figure 2: ROC curve with a dashed diagonal reference line and the AUC value.
plt.figure()
plt.plot(fpr,tpr)
plt.plot([0,1],[0,1],'k--')
a_roc = roc_auc_score(y_test,y_scores)
# "{:.3f}" gives three decimals; the former "{:3f}" only set a minimum width.
plt.text(0.5,0.2,"A_roc = {:.3f}".format(a_roc),{'fontsize':16})
plt.title("Test Prediction A_ROC score")

# Conclusion
The Random Forest Classifier has excellent performance for the test data.