In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Silence every warning category for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE(review): no category is given, so this filter also hides deprecation /
# future warnings that may point at real problems — consider narrowing it.
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# importing useful libraries
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

#import helper modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## **Exploratory Data Analysis (EDA)**

In [None]:
# Read the stroke-prediction CSV from the Kaggle input directory into a DataFrame
stroke_df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

stroke_df.head() # preview the first 5 rows of the dataset

In [None]:
stroke_df.info() # column names, non-null counts and dtypes of the raw dataset

In [None]:
stroke_df.describe() # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [None]:
stroke_df.isnull().sum() # number of missing values in each column of the dataset

In [None]:
# bmi is the only column reported with missing values; fill those gaps with
# the most frequent bmi value (mode() may return several values on a tie, in
# which case the first one is used).
most_common_bmi = stroke_df['bmi'].mode().iloc[0]
stroke_df['bmi'] = stroke_df['bmi'].fillna(most_common_bmi)

In [None]:
stroke_df.isnull().sum().sum() # total number of missing values left in the whole dataset

In [None]:
# Class balance of the label column: print the raw counts, then plot them.
print(stroke_df['stroke'].value_counts())

# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x=stroke_df['stroke'])

# There is a huge gap between the number of patients with a stroke and those
# without one, which can bias the model's predictions or make it perform
# poorly if the data is used directly without any rebalancing.

In [None]:
# Replace each text-valued column with the integer codes of its categories.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
for column in categorical_columns:
    stroke_df[column] = stroke_df[column].astype('category').cat.codes

In [None]:
stroke_df.info() # column dtypes and non-null counts after imputation and encoding

In [None]:
# Wide canvas so the annotated cells of the heatmap stay readable
plt.figure(figsize=(25, 10))

# Pairwise correlation of every column, drawn as an annotated heatmap
feature_correlations = stroke_df.corr()
sns.heatmap(feature_correlations, annot=True)

In [None]:
# Pairwise scatter plots of the columns (KDE curves on the diagonal), coloured
# by the stroke label, to inspect each feature's distribution and relationships
sns.pairplot(stroke_df, kind = 'scatter', diag_kind= 'kde',hue = 'stroke')

In [None]:
# In the correlation plot, age and ever_married look highly correlated, so the
# ever_married column is removed here. The id column is NOT dropped at this
# point: it is still needed for the submission file and is removed from the
# feature matrices in a later cell.
s_df = stroke_df.drop(columns='ever_married')

### Preprocessing

In [None]:
# Separate the features from the target column
X = s_df.drop(columns='stroke')  # all columns except the target column
y = s_df['stroke']               # the target column

# Hold out 30% of the rows as the test set.
# stratify=y keeps the rare stroke class at the same proportion in both
# splits; with a plain random split on such an imbalanced target the test set
# can end up with too few positives for the metrics below to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(X_train.shape)
print(X_test.shape)


**SMOTE - Synthetic Minority Over-sampling Technique**

In [None]:
# dealing with the imabalnce dataset with imblearn library (SMOTE)

from imblearn.over_sampling import SMOTE
#SMOTE is an oversampling technique that generates synthetic samples
#from the dataset which increases the predictive power for minority classes.

smote = SMOTE() 

# call the smote module only on the training sample
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print(X_smote.shape)

sns.countplot(y_smote) #plotting to see the data distribution of the target after using SMOTE


In [None]:
testing = X_test['id'] #taking ID column for the purpose of submission

In [None]:
# drop the id column from both the train and test set
X_smote = X_smote.drop(columns =['id'])# could have been .drop('id', axis = 1) if columns= wasn't set
X_test = X_test.drop(columns =['id'])

In [None]:
# Min-max scaling of the feature matrices

from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the resampled training data
# only, then rescale the training data with those ranges.
scaler = MinMaxScaler()
scaler.fit(X_smote)
X_smote = scaler.transform(X_smote)

# The test data is only transformed (never fitted), reusing the training ranges.
X_test = scaler.transform(X_test)

## **Building Models**

**Logistic Regression model**

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so the model is created and trained on
# the resampled training data in a single expression.
log = LogisticRegression().fit(X_smote, y_smote)

# predicted labels for the held-out test set
pred = log.predict(X_test)

In [None]:
# evaluation metrics from the sklearn library
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    precision_recall_curve,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# text report of the logistic model's test-set performance
clf_log = classification_report(y_true=y_test, y_pred=pred)
print(clf_log)

In [None]:
# confusion matrix of the logistic model's test-set predictions
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

### Voting Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# the two base sub-models, both seeded for reproducibility
dt = DecisionTreeClassifier(random_state=42)   # base decision tree model
rdf = RandomForestClassifier(random_state=42)  # base random forest model

# soft-voting ensemble built from the decision tree and the random forest
base_models = [('tree', dt), ('rdf', rdf)]
voting = VotingClassifier(estimators=base_models, voting='soft')

# fit the classifier on the resampled training data
voting.fit(X_smote, y_smote)

In [None]:
# fitted copies of the two sub-models, in the order they were registered
# (decision tree first, random forest second)
dtc, rd = voting.estimators_

# test-set score of the ensemble, then of each individual sub-model
for fitted_model in (voting, dtc, rd):
    print(fitted_model.score(X_test, y_test))

In [None]:
# classification report comparing the voting model's test-set predictions
# with the y_test values
voting_pred = voting.predict(X_test)
clf_v = classification_report(y_test, voting_pred)
print(clf_v)

In [None]:
# confusion matrix of the voting model's test-set predictions
cv = confusion_matrix(y_true=y_test, y_pred=voting.predict(X_test))
cv

In [None]:
# classification report comparing the random forest sub-model's test-set
# predictions with the y_test values
rd_pred = rd.predict(X_test)
clf_r = classification_report(y_test, rd_pred)
print(clf_r)

In [None]:
# confusion matrix of the random forest sub-model's test-set predictions
cr = confusion_matrix(y_true=y_test, y_pred=rd.predict(X_test))
cr

### XGBoost model

In [None]:
# build the XGBoost classifier and train it on the resampled training data
# (fit() returns the fitted estimator itself)
xg = xgb.XGBClassifier().fit(X_smote, y_smote)

# predicted labels for the held-out test set
xgpred = xg.predict(X_test)


In [None]:
# classification report comparing the xgboost predictions with the y_test values
clf_x = classification_report(y_true=y_test, y_pred=xgpred)
print(clf_x)

In [None]:
#confusion matrix for the predictions using the xgboost model
cx = confusion_matrix(y_test, xgpred)
cx

In [None]:
# probability of the positive class (second column) for every test row
xgprd = xg.predict_proba(X_test)[:,1]

# ROC curve points and the area under the curve for the xgboost model.
# NOTE(review): the *_log names are kept as-is although the scores come from
# the xgboost model, not the logistic one.
fpr_log, tpr_log, _ = roc_curve(y_test, xgprd)
roc_auc_log = auc(fpr_log, tpr_log)

# draw the ROC curve on an explicit Axes object
sns.set_style("white")
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr_log, tpr_log, color='darkorange',
        label='ROC curve (area = %0.2f)' % roc_auc_log)
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance-level diagonal

ax.set_xlim([0.0, 1.0])   # value range on the x axis
ax.set_ylim([0.0, 1.05])  # value range on the y axis

ax.set_xlabel('False Positive Rate', fontsize=18, labelpad=10)
ax.set_ylabel('True Positive Rate', fontsize=18)

# nudge the title slightly above its default position
ax.set_title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
ax.legend(loc="lower right", fontsize=13)
plt.show()

In [None]:
# Pair each test-set id with its xgboost prediction, write the two columns to
# submission.csv (without the index), then load the file back for display.
submission_columns = {'Id': testing, 'Stroke': xgpred}
pd.DataFrame(submission_columns).to_csv('submission.csv', index=False)
submission = pd.read_csv('submission.csv')
submission