In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Packages
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

In [None]:
# Read the stroke-prediction CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
data.head()

### Data Cleaning

In [None]:
# Let's find which columns contain null values:
# info() lists every column with its non-null count and dtype.
data.info()

#### As you can see, `bmi` is the only column with null values: 5110 - 4909 = 201 of them.

### Drop rows with NaN values in the bmi column

In [None]:
# Discard every record whose `bmi` is missing, then re-check the
# non-null counts to confirm that no nulls remain.
data = data.dropna(axis='index', subset=['bmi'])
data.info()

## Engineering Features

### Numerical Features

In [None]:
# Keep the three continuous measurements together with the target so their
# relationships can be inspected in the plots that follow.
Attri = data.loc[:, ['age', 'avg_glucose_level', 'bmi', 'stroke']]
Attri.head()

In [None]:
# Pairwise scatter plots with KDE curves on the diagonal; the lower triangle
# is additionally overlaid with 4-level density contours in dark grey.
g = sns.pairplot(data=Attri, diag_kind='kde')
g.map_lower(sns.kdeplot, color='.2', levels=4)

In [None]:
# Correlation matrix of the numerical columns and the target,
# drawn as an annotated heat map.
cor = Attri.corr()
plt.figure(figsize=(7, 4))
sns.heatmap(data=cor, cmap=plt.cm.Reds, annot=True)
plt.show()

#### Based on the numerical feature charts:
    1) Age is the feature most correlated with stroke.
    2) People older than 60 have a high risk of having a stroke.
    3) A BMI between 25 and 35 is associated with a high risk of having a stroke.
    4) An average blood glucose level between 50 and 100 mg/l is associated with a high risk of having a stroke.

#### Categorical Features

In [None]:
# Names of the categorical columns (plotted below and one-hot encoded later).
cat = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

In [None]:
# One bar chart per categorical column showing how often each level occurs.
for column in cat:
    level_counts = data[column].value_counts()
    ax = level_counts.plot.bar()
    ax.set_title(column)
    ax.grid()
    plt.show()

### Drop the 'Other' value from the gender column

In [None]:
# Remove the rows whose gender is 'Other'.
# The frame is rebound to a filtered copy instead of being mutated with
# `inplace=True`: `data` is itself the result of an earlier filter, and
# in-place edits on such frames can raise SettingWithCopyWarning.
data = data.loc[data['gender'] != 'Other'].copy()
data['gender'].value_counts()

In [None]:
# Re-inspect row count, dtypes and non-null counts after the cleaning steps above.
data.info()

### Feature Selection

#### One Hot Encoding for Categorical Features

In [None]:
#cat is the column name on the above for categorical feature
df = data
data_feature = pd.get_dummies(df, columns=cat)
data_feature.head()

In [None]:
# Drop `work_type_Never_worked`: in the work-type chart above it is the level
# with the fewest records.
# Drop `id` as well: it is only a running patient number.
# drop() returns a new frame and leaves `data_feature` untouched, so this cell
# can be re-run safely (pop() removed the columns in place and raised KeyError
# on a second run).
Feature = data_feature.drop(columns=['work_type_Never_worked', 'id'])
Feature.head()

In [None]:
# List the column names of the encoded frame; the feature matrix below is
# selected from these names.
Feature.columns

### Normalization/Standardization

In [None]:
# Feature matrix: the three numerical columns followed by the one-hot
# indicator columns, in a fixed order.
X = Feature.loc[:, [
    'age', 'avg_glucose_level', 'bmi',
    'gender_Female', 'gender_Male',
    'hypertension_0', 'hypertension_1',
    'heart_disease_0', 'heart_disease_1',
    'ever_married_No', 'ever_married_Yes',
    'work_type_Govt_job', 'work_type_Private',
    'work_type_Self-employed', 'work_type_children',
    'Residence_type_Rural', 'Residence_type_Urban',
    'smoking_status_Unknown', 'smoking_status_formerly smoked',
    'smoking_status_never smoked', 'smoking_status_smokes',
]]
X.shape

In [None]:
# Target vector. Select `stroke` as a 1-D Series rather than a one-column
# DataFrame: scikit-learn estimators expect y of shape (n_samples,) and emit a
# DataConversionWarning when given a column vector.
y = Feature['stroke']
y.head()

In [None]:
# Standardize every feature column; X becomes the transformed array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test rows influence the scaling statistics. Consider
# fitting on the training partition only.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X[:5]

In [None]:
# Wrap the standardized matrix in a DataFrame and draw one KDE curve per
# feature column. The frame was previously built but never used; it is now
# the object that is plotted, and the axes are labelled.
stan_data = pd.DataFrame(X)
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(data=stan_data, ax=ax, legend=False)
ax.set(title='Distribution of standardized features', xlabel='Standardized value')
plt.show()

### Creating test and train data coming from the dataset

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
# NOTE(review): the split is not stratified on the target. Confirm that the
# class balance of the two partitions is acceptable.
partitions = train_test_split(X, y, random_state=4, test_size=0.3)
X_train, X_test, y_train, y_test = partitions
for label, features, target in (('Train set:', X_train, y_train),
                                ('Test set:', X_test, y_test)):
    print(label, features.shape, target.shape)

### Confusion Matrix Function

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
from sklearn.metrics import f1_score
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    A new 5x4-inch figure is opened on every call, so callers do not need to
    create a figure beforehand.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        The confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows that sum to zero are left as all zeros instead of
        producing NaN.
    title : str, default 'Confusion matrix'
        Title of the figure.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows stay at 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    plt.figure(figsize=(5,4))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; anything else (normalized values or a
    # float matrix) gets two decimals, because the 'd' format rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so the axis labels are included in the fit.
    plt.tight_layout()

## MODELS

 ### Random Forest Model

In [None]:
# Random forest: 10 trees, depth capped at 10, fixed seed.
RF = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0)
RFF = RF.fit(X_train, y_train)

# Score the fitted model on the training and the held-out partitions.
RFM_Train = RF.score(X_train, y_train)
RFM_Test = RF.score(X_test, y_test)
print('Random Forest Train Score: ', RFM_Train, sep='')
print('Random Forest Test Score: ', RFM_Test, sep='')

# Class predictions for the test partition; preview the first five.
RFM_pred = RF.predict(X_test)
print(RFM_pred[:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('Random Forest weighted F1-score: ' + str(f1_score(y_test, RFM_pred, average='weighted')))

# Compute confusion matrix
RFM_matrix = confusion_matrix(y_test, RFM_pred)
np.set_printoptions(precision=2)

print (classification_report(y_test, RFM_pred))

# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(RFM_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='Random Forest Confusion matrix')

In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

# Default settings apart from the fixed seed.
LRC = LogisticRegression(random_state=0)
LRCF = LRC.fit(X_train, y_train)

# Score the fitted model on the training and the held-out partitions.
LRC_Train = LRC.score(X_train, y_train)
LRC_Test = LRC.score(X_test, y_test)
print('Logistic Regression Classifier Train Score: ', LRC_Train, sep='')
print('Logistic Regression Classifier Test Score: ', LRC_Test, sep='')

# Class predictions for the test partition; preview the first five.
LRC_pred = LRC.predict(X_test)
print(LRC_pred[:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('Logistic Regression weighted F1-score: ' + str(f1_score(y_test, LRC_pred, average='weighted')))
# Compute confusion matrix
LRC_matrix = confusion_matrix(y_test, LRC_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, LRC_pred))
# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(LRC_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='Logistic Regression Confusion matrix')

### Support Vector Machine (SVM)

In [None]:
from sklearn import svm

# Support vector classifier with the library's default settings.
SVM = svm.SVC()
SVMF = SVM.fit(X_train, y_train)

# Score the fitted model on the training and the held-out partitions.
SVM_Train = SVM.score(X_train, y_train)
SVM_Test = SVM.score(X_test, y_test)
print('Support Vector Machine Train Score: ', SVM_Train, sep='')
print('Support Vector Machine Test Score: ', SVM_Test, sep='')

# Class predictions for the test partition; preview the first five.
SVM_pred = SVM.predict(X_test)
print(SVM_pred[:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('SVM weighted F1-score: ' + str(f1_score(y_test, SVM_pred, average='weighted')))
# Compute confusion matrix
SVM_matrix = confusion_matrix(y_test, SVM_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, SVM_pred))
# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(SVM_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='SVM Confusion matrix')

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 10 closest training rows.
neigh = KNeighborsClassifier(n_neighbors=10)
KNNF = neigh.fit(X_train, y_train)

# Get the score of the KNN classifier on both the train and the test partition.
KNN_Train = neigh.score(X_train, y_train)
KNN_Test = neigh.score(X_test, y_test)
print('KNN Classifier Train Score: ' + str(KNN_Train))
# The label previously read "KNN Classifier Machine Test Score", a leftover
# from the SVM cell.
print('KNN Classifier Test Score: ' + str(KNN_Test))

# Class predictions for the test partition; preview the first five.
KNN_pred = neigh.predict(X_test)
print(KNN_pred[0:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('KNN weighted F1-score: ' + str(f1_score(y_test, KNN_pred, average='weighted')))
# Compute confusion matrix
KNN_matrix = confusion_matrix(y_test, KNN_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, KNN_pred))
# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(KNN_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='K-Neighbor Confusion matrix')

### Naive Bayes Classifier 

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with the library's default settings.
NBC = GaussianNB()
NBCF = NBC.fit(X_train, y_train)

# Score the fitted model on the training and the held-out partitions.
NBC_Train = NBC.score(X_train, y_train)
NBC_Test = NBC.score(X_test, y_test)
print('Naive Bayes Classifier Train Score: ', NBC_Train, sep='')
print('Naive Bayes Classifier Test Score: ', NBC_Test, sep='')

# Class predictions for the test partition; preview the first five.
NBC_pred = NBC.predict(X_test)
print(NBC_pred[:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('Naive Bayes weighted F1-score: ' + str(f1_score(y_test, NBC_pred, average='weighted')))
# Compute confusion matrix
NBC_matrix = confusion_matrix(y_test, NBC_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, NBC_pred))
# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(NBC_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='Naive Bayes Confusion matrix')

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 estimators of depth 1, learning rate 1.0, fixed seed.
GBC = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
GBCF = GBC.fit(X_train, y_train)

# Score the fitted model on the training and the held-out partitions.
GBC_Train = GBC.score(X_train, y_train)
GBC_Test = GBC.score(X_test, y_test)
print('Gradient Boosting Classifier Train Score: ', GBC_Train, sep='')
print('Gradient Boosting Classifier Test Score: ', GBC_Test, sep='')

# Class predictions for the test partition; preview the first five.
GBC_pred = GBC.predict(X_test)
print(GBC_pred[:5])

In [None]:
# Weighted F1 score. It is printed explicitly because only a cell's final
# expression is displayed automatically; the bare call was silently discarded.
print('Gradient Boosting weighted F1-score: ' + str(f1_score(y_test, GBC_pred, average='weighted')))
# Compute confusion matrix
GBC_matrix = confusion_matrix(y_test, GBC_pred)
np.set_printoptions(precision=2)

print (classification_report(y_test, GBC_pred))

# Plot non-normalized confusion matrix.
# plot_confusion_matrix opens its own figure, so no plt.figure() call is
# needed here (it only left an empty figure behind).
plot_confusion_matrix(GBC_matrix, classes=['No Stroke','Have Stroke'],normalize= False,  title='Gradient Boosting Classifier Confusion matrix')

### Conclusion:
Based on the 6 models, the precision for the no-stroke class is above 95% in every case. For predicting that a patient has a stroke, however, the highest precision is the Random Forest model's 43%, and the second highest is only 17%, from the Gradient Boosting classifier.
    
Therefore the most suitable model among the 6 is the Random Forest Classifier, with: <br>
    Having Stroke: 43% Precision<br>
    Not Having Stroke: 96% Precision<br>
    F1-Score: 98%<br>
    Accuracy: 96%<br>
    