In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly
plotly.offline.init_notebook_mode(connected = True)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as tts,RandomizedSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,plot_confusion_matrix

In [None]:
#data loading
st=pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
st.head()

In [None]:
#checking number of rows and columns of dataset
st.shape

In [None]:
st.info()

## TREATING MISSING VALUES

In [None]:
st.isnull().sum()

***BMI*** *column has 201 missing values.*

In [None]:
#creating a copy of original dataset for treating missing values
st_copy=st.copy(deep=True)

In [None]:
# Encode marital status as 0/1, then one-hot encode the remaining categorical
# columns (dropping the first level of each) so the frame is fully numeric.
married_flag=st_copy['ever_married'].replace({'Yes':1,'No':0})
st_copy=pd.get_dummies(st_copy.assign(ever_married=married_flag),drop_first=True)

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(st_copy.corr(),annot=True)

*The **'bmi'** feature does not show a high correlation with any other variable in the dataset, and its missingness does not appear to depend on the missing values themselves. Therefore the missing values in bmi are treated as* ***Missing Completely At Random (MCAR).***

In [None]:
st['gender'].value_counts(normalize=True)

*Since the proportion of records with gender **Other** is very small **(0.0196%)**, we can replace it with Female (the most frequently occurring value).*

In [None]:
st['gender']=st['gender'].replace('Other','Female')

In [None]:
#dropping missing data 
st=st.dropna()

In [None]:
#dropping unnecessary columns
# Re-assign instead of mutating in place, and tolerate an already-removed 'id'
# column so this cell can be re-run without raising KeyError.
st=st.drop(columns='id',errors='ignore')

## OBSERVING CATEGORICAL VARIABLES

In [None]:
#function to observe values in each categorical feature
def value_viz(feature,title):
    """Build a pie chart showing how the values of one column of ``st`` are distributed.

    feature: name of the column of the global ``st`` frame to chart.
    title: title displayed on the chart.
    Returns the figure created by ``px.pie``.
    """
    pie_chart=px.pie(st,names=feature,title=title)
    return pie_chart

In [None]:
value_viz('gender','Distribution Of Gender')

In [None]:
# NOTE(review): identical to the preceding cell (gender pie chart); candidate for removal.
value_viz('gender','Distribution Of Gender')

In [None]:
value_viz('hypertension','Distribution of people with High Blood Pressure')

In [None]:
value_viz('heart_disease','Distribution of People having Heart Disease')

In [None]:
value_viz('ever_married','Distribution of people who are married')

In [None]:
value_viz('work_type','Distribution of people\'s work type')

In [None]:
value_viz('Residence_type','Distribution of where people live')

In [None]:
value_viz('smoking_status','Distribution of people who smoke')

In [None]:
value_viz('stroke','Distribution of people having stroke')

*It can be seen that it is an imbalanced dataset having people without stroke as 95.7% and remaining 4.26% having stroke*

## OBSERVING CONTINUOUS VARIABLES

In [None]:
plt.figure(figsize=(20,5))
sns.histplot(st['age'])
plt.xticks(range(0,100,10))
plt.title("Distribution of Age")

* *Most of the individuals of the dataset are of age 40 and above.*

* *age is normally distributed.*

In [None]:
plt.figure(figsize=(20,5))
sns.histplot(st['avg_glucose_level'])
plt.title('Distribution of Average Glucose Level')
plt.xticks(range(0,300,25))

* *Most of the individuals have 75-100 average glucose levels.*

* *avg_glucose_level has right skewed distribution.*

In [None]:
plt.figure(figsize=(20,5))
sns.histplot(st['bmi'])
plt.title('Distribution of Body Mass Index')
plt.xlabel('BMI in kg/m2')
plt.xticks(range(0,100,10))

* *Most of the individuals in the dataset have a BMI between 20 and 30 kg/m².*
 
* *bmi is also normally distributed with some outliers*

## OUTLIER DETECTION

In [None]:
plt.figure(figsize=(20,5))
sns.boxplot(x='age',data=st)

No outliers in age feature.

In [None]:
plt.figure(figsize=(20,5))
sns.boxplot(x='bmi',data=st)

In [None]:
plt.figure(figsize=(20,5))
sns.boxplot(x='avg_glucose_level',data=st)

*Outliers are present in both bmi and avg_glucose_level.*

## OUTLIER REMOVAL

***STEPS-***

* *We first find the outliers in each of the feature.*
* *Then we find the smallest value among the outliers (the point at which the outliers start).*
* *Then values at or above this minimum are replaced by the median (as it is not affected by outliers).*

In [None]:
#function to find outliers
def iqr_outliers(df,factor=1.5):
    """Return the values of a numeric Series that fall outside the IQR fences.

    df: pandas Series of numeric values (despite the name, a single column).
    factor: multiple of the inter-quartile range added above Q3 and subtracted
        below Q1 to build the fences; defaults to 1.5, the previous fixed value.
    Returns a list of the outlying values in their original order. Values
    exactly on a fence are not outliers.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3-q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    # Boolean mask instead of a Python-level loop. NaN compares False on both
    # sides, so missing values are never reported (same as the loop version).
    outside = (df > upper_fence) | (df < lower_fence)
    return df[outside].tolist()

In [None]:
d=iqr_outliers(st['bmi'])

In [None]:
#finding minimum of outliers in bmi
d.sort()
d[0]

In [None]:
e=iqr_outliers(st['avg_glucose_level'])

In [None]:
#finding minimum of outliers in avg_glucose_level
e.sort()
e[0]

In [None]:
#median imputation in bmi
# Cutoff at and above which bmi values are treated as outliers.
# NOTE(review): presumably the smallest bmi outlier (d[0]) shown above - confirm it still matches the data.
BMI_OUTLIER_MIN=47.6
# The median is taken before any value is overwritten.
med=st.bmi.median()
# One masked assignment replaces every value at or above the cutoff, instead of
# calling Series.replace once per outlying row.
st.loc[st.bmi>=BMI_OUTLIER_MIN,'bmi']=med

In [None]:
#median imputation in avg_glucose_level
# Cutoff at and above which avg_glucose_level values are treated as outliers.
# NOTE(review): presumably the smallest avg_glucose_level outlier (e[0]) shown above - confirm it still matches the data.
GLUCOSE_OUTLIER_MIN=168.68
# The median is taken before any value is overwritten.
med=st.avg_glucose_level.median()
# One masked assignment replaces every value at or above the cutoff, instead of
# calling Series.replace once per outlying row.
st.loc[st.avg_glucose_level>=GLUCOSE_OUTLIER_MIN,'avg_glucose_level']=med

In [None]:
st.shape #201 rows with missing bmi were dropped by dropna(); outliers were replaced by the median, so that step removes no rows

In [None]:
st.describe()

***'bmi'*** *has a maximum value of **47.5** and **'avg_glucose_level'** has a maximum value of **168.15**. Therefore it can be seen that all the upper outliers in both features have been replaced.*

## ENCODING VARIABLES

In [None]:
y=st.stroke
X=st.drop('stroke',axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode each categorical predictor; the single encoder is refitted on
# every column in turn, so afterwards it holds the classes of the last column only.
le=LabelEncoder()
for column in ['gender','ever_married','work_type','Residence_type','smoking_status']:
    X[column]=le.fit_transform(X[column])

In [None]:
X.head()

## FEATURE SELECTION

In [None]:
#splitting the original dataset
X_train_or,X_test_or,Y_train_or,Y_test_or=tts(X,y,test_size=0.25,random_state=27)

In [None]:
#separating numerical and categorical for feature selection
numerical=X_train_or[['age','avg_glucose_level','bmi']]
categorical=X_train_or.drop(columns=numerical.columns)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every categorical predictor against the target with the chi-squared
# statistic; k='all' keeps all of them so each score can be printed.
test = SelectKBest(score_func=chi2, k='all')
test.fit(categorical, Y_train_or)
for column, score in zip(categorical.columns, test.scores_):
    print('Feature %s: %f' % (column, score))

***hypertension and heart_disease are the most important categorical features.***

In [None]:
from sklearn.feature_selection import RFE
# Fixed seed so the tree, and therefore the selected features, is the same on every run.
model = DecisionTreeClassifier(random_state=25)
# n_features_to_select is passed by keyword: newer scikit-learn releases reject
# it as a positional argument, while the keyword form works on older releases too.
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(numerical, Y_train_or)
print("Num Features: %s" % (fit.n_features_))
# support_ is a boolean mask aligned with numerical.columns, printed next to it.
print("Selected Features: %s %s" % (fit.support_,numerical.columns))
print("Feature Ranking: %s" % (fit.ranking_))

***avg_glucose_level and bmi are the most important numerical features.***

In [None]:
# Keep only the chosen predictors, in the same column order for train and test.
selected_columns=['hypertension','heart_disease','avg_glucose_level','bmi']
X_train_or=X_train_or[selected_columns]
X_test_or=X_test_or[selected_columns]

## SCALING DATA(ORIGINAL DATASET)

In [None]:
#using standard scaler to scale training data and applying it to testing data
sc=StandardScaler()
X_train_scaled_or=sc.fit_transform(X_train_or)
X_test_scaled_or=sc.transform(X_test_or)

## MODEL BUILDING AND EVALUATION(ORIGINAL DATASET)

*Since it is an imbalanced dataset we will use Random Forest Classifier to see the predictions on original dataset.*

In [None]:
rf=RandomForestClassifier(random_state=25)
# Fit on the scaled training matrix: predictions and the confusion matrix use
# X_test_scaled_or, so fitting on the unscaled X_train_or would train and
# evaluate the model on two different feature scales.
rf.fit(X_train_scaled_or,Y_train_or)
pred=rf.predict(X_test_scaled_or)
plot_confusion_matrix(rf,X_test_scaled_or,Y_test_or,cmap=plt.cm.Blues,normalize='all')
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported interchanged.
print(classification_report(Y_test_or,pred))

* *Accuracy is 96% but the model is unable to correctly classify people who suffer from a stroke.*

* *The model classifies people who have a stroke as people who don't have a stroke.*

* *This is due to imbalance of target variable as majority of values consists of people who don't suffer from a stroke and therefore the model learns that.*

## HANDLING IMBALANCED DATA

In [None]:
st['stroke'].value_counts()

***Since most instances in the dataset are negative for stroke (4700), a model built on this dataset may classify a person who had a stroke as a person who did not (known as a False Negative).***

*So to avoid this handling of imbalanced dataset is mandatory in classification models.*

In [None]:
#splitting before applying smote
X_train,X_test,Y_train,Y_test=tts(X,y,test_size=0.25,random_state=27)

In [None]:
# Restrict both splits to the selected predictors, using one shared column list.
smote_columns=['avg_glucose_level','bmi','hypertension','heart_disease']
X_train=X_train[smote_columns]
X_test=X_test[smote_columns]

In [None]:
#using SMOTE to generate synthetic examples in target variables 
# Over-sample with SMOTE first, then under-sample; both steps use their default
# sampling settings and a fixed seed.
# NOTE(review): with default settings the classes are presumably already balanced
# after SMOTE, leaving the under-sampler nothing to remove - confirm.
over = SMOTE(random_state=27)
under = RandomUnderSampler(random_state=27)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# Only the training split is resampled; X_test / Y_test are not passed in here.
X_train_balanced, Y_train_balanced = pipeline.fit_resample(X_train, Y_train)

In [None]:
Y_train_balanced.value_counts()

*Hence we can see that now the stroke feature is balanced in training dataset containing equal sets of both 0 and 1 after using SMOTE.*

## SCALING DATA(AFTER SMOTE)

In [None]:
sc=StandardScaler()
X_train_scaled=sc.fit_transform(X_train_balanced)
X_test_scaled=sc.transform(X_test)

## MODEL BUILDING AND EVALUATION(AFTER SMOTE)

In [None]:
#function to fit models
def model(model):
    """Fit a classifier on the balanced, scaled training data and report test results.

    model: an unfitted estimator exposing ``fit`` and ``predict``.
    Reads the globals X_train_scaled, Y_train_balanced, X_test_scaled and Y_test.
    Side effects: draws a confusion matrix normalised over all test samples and
    prints the classification report. Returns None.
    """
    mod=model
    mod.fit(X_train_scaled,Y_train_balanced)
    mod_pred=mod.predict(X_test_scaled)
    plot_confusion_matrix(mod,X_test_scaled,Y_test,cmap=plt.cm.Blues,normalize='all')
    # classification_report takes (y_true, y_pred); with the arguments swapped,
    # precision and recall are reported interchanged.
    print(classification_report(Y_test,mod_pred))

***LOGISTIC REGRESSION***

In [None]:
model(LogisticRegression(random_state=25)) 

***DECISION TREE CLASSIFIER***

In [None]:
model(DecisionTreeClassifier(random_state=25))

***KNN CLASSIFIER***

In [None]:
model(KNeighborsClassifier())

***XGBOOST CLASSIFIER***

In [None]:
model(XGBClassifier(use_label_encoder=False,random_state=25))

***RANDOM FOREST CLASSIFIER*** 

In [None]:
model(RandomForestClassifier(random_state=25)) 

***MORE TECHNIQUES TO IMPROVE CLASSIFICATION WILL BE EXPLORED IN NEXT UPDATE.***