# **Importing Libraries and Data**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

# Data exploration

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
#Pair Plot
sns.pairplot(df.drop('id',axis=1),hue="stroke",plot_kws={'alpha':0.3},diag_kind='hist',diag_kws={'multiple':'dodge','bins':5})

In [None]:
#Label histogram
sns.histplot(df['stroke'])

In [None]:
#Skewed Data !!!

### Age

In [None]:
#Histogram
g = sns.histplot(x= 'age',hue='stroke',data=df,bins=5,multiple='dodge')

In [None]:
#It is obvious that older people are more exposed to the risk of stroke, let's split age column and compute the proportion of positive labels per segment

In [None]:
#Splitting function
def age_split(col):
    """Map an age in years to an ordinal age-bucket label.

    Buckets (upper bounds inclusive): '1' for age <= 18, '2' for (18, 30],
    '3' for (30, 40], '4' for (40, 50], '5' for (50, 60], '6' for age > 60.

    Parameters
    ----------
    col : float
        A single age value (one element of the ``age`` column).

    Returns
    -------
    str or None
        The bucket label, or ``None`` when the age is missing (NaN/None).
    """
    age = col
    # A missing age compares False against every bound; report it explicitly
    # instead of letting it slip into a bucket.
    if pd.isna(age):
        return None
    # Bounds are checked in ascending order, so the first bound that is
    # >= age identifies the bucket (no need to repeat the lower bound).
    for label, upper in (('1', 18), ('2', 30), ('3', 40), ('4', 50), ('5', 60)):
        if age <= upper:
            return label
    # Open-ended last bucket: ages above 85 used to fall through and yield None.
    return '6'

In [None]:
# Bucket every patient by age, then tabulate stroke outcomes per bucket.
df['Age_Category'] = df['age'].apply(age_split)
strokes_by_age = df.groupby('Age_Category')['stroke']
# Name the count column explicitly: pandas >= 2.0 labels the value_counts()
# result 'count' instead of 'stroke', which breaks a later age_2['stroke'] lookup.
age_2 = strokes_by_age.value_counts().to_frame(name='stroke')
# Share of each outcome within its age bucket (aligned on the same MultiIndex).
age_2['proportion'] = strokes_by_age.value_counts(normalize=True).round(4)
age_2

In [None]:
#Plot the stroke proportion of every age category except the youngest ('1')
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
# Keep the stroke == 1 rows (index level 1) outside age category '1' (index level 0).
is_stroke = age_2.index.get_level_values(1)==1
not_youngest = age_2.index.get_level_values(0)!='1'
stroke_share = age_2.loc[is_stroke & not_youngest, 'proportion']
# Take the x labels from the data itself: a hard-coded ['3','4','5','6'] only
# matches while exactly those four categories contain strokes.
ax.plot(list(stroke_share.index.get_level_values(0)), stroke_share.values,
        marker='x')
plt.xlabel('Age category')
plt.ylabel('Proportion of strokes')
plt.title('Proportion of strokes per age category')

In [None]:
#According to the data, out of 100 people older than 60, approximately 14 had a stroke

### Smoking status

In [None]:
#Barplot
sns.histplot(x='smoking_status',data=df,hue='stroke',multiple='dodge')

In [None]:
#No conclusions can be drawn from the previous plot, so let's use the same approach as for Age.


In [None]:
##Computing the proportion of positive labels per smoking status
strokes_by_smoking = df.groupby('smoking_status')['stroke']
# Name the count column explicitly: pandas >= 2.0 labels the value_counts()
# result 'count' instead of 'stroke', which breaks a later smoke_['stroke'] lookup.
smoke_ = strokes_by_smoking.value_counts().to_frame(name='stroke')
# Share of each outcome within its smoking status (aligned on the same MultiIndex).
smoke_['proportion'] = strokes_by_smoking.value_counts(normalize=True)
smoke_

In [None]:
#Redoing Barplot using proportions
# Take labels and heights from the same filtered rows so they cannot get out of step
# (the full list of statuses may be longer than the list of statuses with a stroke).
stroke_share = smoke_.loc[smoke_.index.get_level_values(1)==1, 'proportion']
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=list(stroke_share.index.get_level_values(0)),
            y=stroke_share.values,order=['formerly smoked','smokes','never smoked','Unknown'])

In [None]:
#From that plot we can say that smoking status might be related to the stroke proportion.
#Still, one might think that older people simply had more time to smoke, enjoy smoking and then quit, HAHA. Let's look at the relation between Age and Smoking Status

### Age & Smoking_Status

In [None]:
#Box Plot
sns.boxplot(x='smoking_status',y='age',data=df,order=['formerly smoked','smokes','never smoked','Unknown'])

In [None]:
#As expected, Formerly_smoked category has a higher age average... Let's look deeper using proportions by Age_Category & Smoking_status

In [None]:
# Stroke outcome counts and shares for every (Age_Category, smoking_status) pair.
strokes_by_age_and_smoking = df.groupby(['Age_Category','smoking_status'])['stroke']
# Name the count column explicitly: pandas >= 2.0 labels the value_counts()
# result 'count' instead of 'stroke', which breaks a later age_smoke['stroke'] lookup.
age_smoke = strokes_by_age_and_smoking.value_counts().to_frame(name='stroke')
# Share of each outcome within its (age category, smoking status) group.
age_smoke['proportion'] = strokes_by_age_and_smoking.value_counts(normalize=True).round(4)
age_smoke

In [None]:
#Plotting proportions per Age_Category & smoking_status; the 'All smoking status' line ignores the smoking_status information
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

# Baseline: stroke share per age category, youngest category ('1') excluded.
overall = age_2.loc[(age_2.index.get_level_values(1)==1)&(age_2.index.get_level_values(0)!='1'), 'proportion']
# The baseline's categories define the shared x axis, instead of a hard-coded
# ['3','4','5','6'] that only fits while exactly those categories contain strokes.
buckets = list(overall.index.get_level_values(0))
ax.plot(buckets, overall.values, label ='All smoking status',marker='x')

has_stroke = age_smoke.index.get_level_values(2)==1
for status in ['formerly smoked','smokes','never smoked']:
    of_status = age_smoke.index.get_level_values(1)==status
    # Drop the (status, stroke) levels and line the values up with the shared
    # categories; a category without strokes for this status becomes a gap (NaN).
    share = age_smoke.loc[has_stroke & of_status, 'proportion'].droplevel([1,2]).reindex(buckets)
    ax.plot(buckets, share.values, label =status,marker='x')

plt.legend()
plt.xlabel('Age category')
plt.ylabel('Proportion of strokes')
plt.title('Proportion of strokes per age category & smoking status')

In [None]:
#For Formerly_smoked status the proportion of stroke is higher for all age categories. For other smoking status, the effect tends to vary among age categories.

### Gender

In [None]:
#Countplot
sns.countplot(x='gender',hue='stroke',data=df)

In [None]:
#Stroke share within each gender
patients_per_gender = df.groupby('gender')['id'].count()
# One row per (gender, stroke) pair holding the number of patients.
gender_ = df.groupby(['gender','stroke'])['id'].count().to_frame()
# Dividing aligns on the shared 'gender' index level.
gender_['proportion'] = (gender_['id'] / patients_per_gender).round(4)
gender_

In [None]:
#Slight difference between Male and Female in terms of stroke proportion.
#Keep in mind we are dealing with skewed data: the overall proportion of strokes is only approximately 4%...

In [None]:
#Boxplot (gender vs Age)
sns.boxplot(x='gender',y='age',data=df)

In [None]:
#Slight difference between Male and Female in term of Age average

### Ever_married

In [None]:
#Count plot
# The column is passed as x= : seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='ever_married',hue='stroke',data=df)

In [None]:
#Stroke share within each ever_married status
patients_per_status = df.groupby('ever_married')['id'].count()
# One row per (ever_married, stroke) pair holding the number of patients.
married_ = df.groupby(['ever_married','stroke'])['id'].count().to_frame()
# Dividing aligns on the shared 'ever_married' index level.
married_['proportion'] = married_['id'].div(patients_per_status)
married_

In [None]:
# Bar chart of the stroke share per ever_married status.
# Labels and heights come from the same stroke == 1 rows so they stay paired.
married_share = married_.loc[married_.index.get_level_values(1)==1, 'proportion']
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=list(married_share.index.get_level_values(0)),
            y=married_share.values)

In [None]:
#WOW, rethinking marriage is a must LOL!! 

In [None]:
#Boxplot (ever_married vs age)
sns.boxplot(x='ever_married',y='age',data=df)

In [None]:
#fortunately, it's the age effect...

### Heart Disease & Hypertension

In [None]:
#box plot showing average age of people having Heart disease/hypertension, both or none 
sns.boxplot(x=df['hypertension']+df['heart_disease'],y=df['age'])

In [None]:
#Let's focus on people older than 50 (Age_Category = 5 & 6).

In [None]:
#Stroke share for every (Age_Category, heart_disease, hypertension) combination
disease_keys = ['Age_Category','heart_disease','hypertension']
patients_per_combo = df.groupby(disease_keys)['id'].count()
# One row per (combination, stroke) pair holding the number of patients.
age_disease = df.groupby(disease_keys + ['stroke'])['id'].count().to_frame()
# Dividing aligns on the three shared index levels.
age_disease['proportion'] = age_disease['id'] / patients_per_combo


In [None]:
# Copy each index level into a regular column, in index order
# (the Age_Category level is stored under the name 'age').
for level_position, column_name in enumerate(['age','heart_disease','hypertension','stroke']):
    age_disease[column_name] = age_disease.index.get_level_values(level_position)

In [None]:
#Plotting proportions for people older than 50 (Age_Category = 5, 6)
# Actually restrict the table to the two oldest categories; without this
# filter every age category was drawn despite the stated intent.
older = age_disease[age_disease['age'].isin(['5','6'])]
f = sns.FacetGrid(data=older,col='heart_disease',row='hypertension',hue='stroke')
# An explicit order keeps the bars in the same position in every facet.
f.map(sns.barplot,'age','proportion',order=['5','6'])

In [None]:
#The orange color represents the proportion of stroke. It's clear that heart disease and hypertension increase the probability of experiencing strokes

### BMI & avg_glucose_level

In [None]:
#Joint plot
sns.jointplot(x='bmi',y='avg_glucose_level',data=df,hue='stroke',alpha=0.2)

### Correlation 

In [None]:
#Heatmap
plt.figure(figsize=(10,6))
# Correlate the numeric columns only: since pandas 2.0 DataFrame.corr() raises
# on string columns (gender, work_type, ...) instead of silently skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

# Data Preprocessing

### Missing Data

In [None]:
#heatmap
sns.heatmap(df.isnull())

In [None]:
#Some bmi values are missing

### Fill BMI Column

In [None]:
#Correlation matrix shows that bmi is correlated with age (corr = 0.33), hypertension (corr = 0.17) and average glucose level (corr = 0.18).
#We can create a linear model using these three parameters to predict missing BMI or just use mean per Age_Category.

In [None]:
#Mean BMI per Age_Category & Hypertension & Stroke
# Lookup table with one row per (Age_Category, stroke, hypertension) group and its mean bmi.
# NOTE(review): grouping by `stroke` means values imputed from this table are derived
# from the prediction target — confirm this leakage is acceptable for the model below.
dg = df.groupby(['Age_Category','stroke','hypertension'])['bmi'].mean().reset_index()
dg.head()

In [None]:
#Let's use this table to fill the missing values,
def fill_bmi(cols):
    """Return a row's BMI, imputing it from the group means in ``dg`` when missing.

    Parameters
    ----------
    cols : sequence
        Row values in the order (bmi, stroke, Age_Category, hypertension),
        e.g. one row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    float
        The original BMI when it is present. Otherwise the mean BMI of the
        matching (stroke, Age_Category, hypertension) row of the module-level
        table ``dg``, rounded to one decimal, or NaN when ``dg`` has no such row.
    """
    # Unpack by iteration: this keeps the positional contract while avoiding
    # integer indexing (cols[0]) on a labelled Series, which pandas deprecates.
    bmi, stroke, age_category, hypertension = cols
    if not pd.isna(bmi):
        return bmi
    group_mean = dg[(dg['stroke'] == stroke)
                    & (dg['Age_Category'] == age_category)
                    & (dg['hypertension'] == hypertension)]['bmi']
    # No matching group: leave the value missing rather than crash in float().
    if group_mean.empty:
        return np.nan
    return round(float(group_mean.iloc[0]), 1)

In [None]:
df['filled_bmi'] = df[['bmi','stroke','Age_Category','hypertension']].apply(fill_bmi,axis=1)

### Get dummy variables for categorical data

In [None]:
# One-hot encode each categorical feature, dropping the first level of each.
dum1, dum2, dum3, dum4, dum5 = (
    pd.get_dummies(df[categorical], drop_first=True)
    for categorical in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
)

In [None]:
df = pd.concat([df,dum1,dum2,dum3,dum4,dum5],axis=1)

In [None]:
#New correlation matrix
plt.figure(figsize=(20,10))
# Keep numeric and boolean columns only: since pandas 2.0 DataFrame.corr() raises
# on the remaining string columns, and get_dummies() returns bool columns there.
sns.heatmap(df.select_dtypes(include=['number','bool']).corr(),annot=True)

# k-nearest neighbors model

In [None]:
# Features: everything except the identifier, the raw categorical columns,
# Age_Category, the original (unfilled) bmi and the target itself.
non_feature_columns = ['id','gender','ever_married','work_type','Residence_type','smoking_status','Age_Category','stroke','bmi']
X = df.drop(columns=non_feature_columns)
# Target: the stroke label.
y = df['stroke']

In [None]:
#Split data
# stratify keeps the (rare) stroke rate identical in both parts, and a fixed seed
# makes every number reported below reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

### KNN using SMOTE 

In [None]:
#SMOTE on training Set
# Only the training split is resampled; X_test / y_test are left untouched.
smote = SMOTE(sampling_strategy=0.7)
# X_trainS / y_trainS hold the resampled features and labels returned by fit_resample.
X_trainS , y_trainS = smote.fit_resample(X_train,y_train)

In [None]:
#Choosing K value by repeated stratified hold-out validation
#Result Dataframe
report = pd.DataFrame(index = ['avg accuracy','avg sensitivity','avg precision','avg specifity','avg negative-precision','f-score'])
#K values
K = list(range(1,21))
#Number of random fit/validation splits averaged for every K
n_rounds = 5


def _ratio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator/denominator if denominator else 0.0


for i in K :
    #Pipeline for scaling Data and fitting Knn model
    pipe = Pipeline([('Scaler',StandardScaler()),('Classifier',KNeighborsClassifier(n_neighbors=i))])
    arr = np.zeros(6)

    for j in range(n_rounds) :
        # Split the ORIGINAL training data and oversample only the fitting part.
        # Validating on rows drawn from the already-resampled set scores the model
        # on synthetic points interpolated from its own training neighbours, which
        # inflates every metric and favours K=1.
        X_fit, X_cv, y_fit, y_cv = train_test_split(X_train, y_train, test_size=0.16, stratify=y_train)
        X_fit, y_fit = smote.fit_resample(X_fit, y_fit)
        pipe.fit(X_fit,y_fit)
        predictions = pipe.predict(X_cv)

        #Computing metrics from the confusion matrix (rows = truth, columns = prediction)
        TN, FP, FN, TP = confusion_matrix(y_cv, predictions, labels=[0,1]).ravel()
        acc = _ratio(TP+TN, TP+TN+FP+FN)
        sens = _ratio(TP, TP+FN)
        prec = _ratio(TP, TP+FP)
        spec = _ratio(TN, TN+FP)
        neg_prec = _ratio(TN, TN+FN)
        # Every ratio is guarded, so an empty cell gives 0 instead of NaN.
        f_score = _ratio(2*prec*sens, prec+sens)
        arr = arr + np.array([acc, sens, prec, spec, neg_prec, f_score])

    arr = arr/n_rounds
    #Adding result to Dataframe
    report[str(i)] = arr

In [None]:
#Report average accuracy, sensitivity, precision, specificity and predictive value of the negative class
report.head()

In [None]:

#Maximum values are :
summary_max = pd.DataFrame([report.max(axis=1),report.idxmax(axis=1)],index=['Value','K value'])
summary_max

In [None]:
#K=1: fit on the SMOTE-resampled training set, evaluate on the test set
pipe_c = Pipeline(steps=[
    ('Scaler',StandardScaler()),
    ('Classifier',KNeighborsClassifier(n_neighbors=1)),
])
pipe_c.fit(X_trainS,y_trainS)
predictions_test = pipe_c.predict(X_test)
# Confusion matrix, a blank separator, then the per-class report.
print(confusion_matrix(y_test,predictions_test), '\n', classification_report(y_test,predictions_test), sep='\n')

In [None]:
#K=2: fit on the SMOTE-resampled training set, evaluate on the test set
pipe_c = Pipeline(steps=[
    ('Scaler',StandardScaler()),
    ('Classifier',KNeighborsClassifier(n_neighbors=2)),
])
pipe_c.fit(X_trainS,y_trainS)
predictions_test = pipe_c.predict(X_test)
# Confusion matrix, a blank separator, then the per-class report.
print(confusion_matrix(y_test,predictions_test), '\n', classification_report(y_test,predictions_test), sep='\n')

In [None]:
#Baseline for comparison: K=1 fitted on the original (not resampled) training set
pipe_c = Pipeline(steps=[
    ('Scaler',StandardScaler()),
    ('Classifier',KNeighborsClassifier(n_neighbors=1)),
])
pipe_c.fit(X_train,y_train)
predictions_test = pipe_c.predict(X_test)
# Confusion matrix, a blank separator, then the per-class report.
print(confusion_matrix(y_test,predictions_test), '\n', classification_report(y_test,predictions_test), sep='\n')

In [None]:
#SMOTE did help in improving the f1 score of the positive class, which is the minority in our data...

### KNN with modified majority rule (probability threshold)

In [None]:
#For this part, SMOTE won't be used

In [None]:
# Search over (K, threshold): average validation metrics of KNN when a sample is
# labelled positive as soon as its predicted stroke probability reaches the threshold.
metric_names = ['avg accuracy','avg sensitivity','avg precision','avg specifity','avg negative precision','f-score']
#K values
K = range(1,50)
#Threshold values
Treshhold = [0.1,0.2,0.3,0.4,0.5]
#Number of random fit/validation splits averaged for every K
n_rounds = 5


def _ratio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator/denominator if denominator else 0.0


# Collected as {column label: averaged metrics}; building the frame once avoids
# inserting 245 columns one at a time.
columns = {}
for i in K :
    #pipeline for scaling and fitting model
    pipe = Pipeline([('Scaler',StandardScaler()),('Classifier',KNeighborsClassifier(n_neighbors=i))])
    totals = {t: np.zeros(6) for t in Treshhold}

    for j in range(n_rounds) :
        # stratify guarantees stroke cases on both sides of every split.
        X_fit, X_cv, y_fit, y_cv = train_test_split(X_train, y_train, test_size=0.16, stratify=y_train)
        # The fitted model does not depend on the threshold, so fit once per split
        # and score all thresholds on the same probabilities.
        pipe.fit(X_fit,y_fit)
        positive_proba = pipe.predict_proba(X_cv)[:,1]

        for t in Treshhold :
            #Selecting threshold and predicting labels
            predictions = (positive_proba>=t).astype(int)
            #Computing metrics from the confusion matrix (rows = truth, columns = prediction)
            TN, FP, FN, TP = confusion_matrix(y_cv, predictions, labels=[0,1]).ravel()
            acc = _ratio(TP+TN, TP+TN+FP+FN)
            sens = _ratio(TP, TP+FN)
            prec = _ratio(TP, TP+FP)
            spec = _ratio(TN, TN+FP)
            neg_prec = _ratio(TN, TN+FN)
            # Guarded ratios: "no positive predicted" gives 0 instead of NaN.
            f_score = _ratio(2*prec*sens, prec+sens)
            totals[t] = totals[t] + np.array([acc, sens, prec, spec, neg_prec, f_score])

    for t in Treshhold :
        columns[f'({i},{t})'] = totals[t]/n_rounds

report2 = pd.DataFrame(columns, index=metric_names)

In [None]:
#Maximum values are :
summary_max2 = pd.DataFrame([report2.max(axis=1),report2.idxmax(axis=1)],index=['Value','(K value,threshhold)'])
summary_max2

In [None]:
#K=49 & Threshold = 0.1
pipe_2 = Pipeline([('Scaler',StandardScaler()),('Classifier',KNeighborsClassifier(n_neighbors=49))])
pipe_2.fit(X_train,y_train)
probabilities_test = pipe_2.predict_proba(X_test)
# Threshold the positive-class column directly: the result is already 1-D with one
# entry per test row, so no reshape to a hard-coded length is needed. `>=` matches
# the rule applied while searching for the threshold.
predictions_test = (probabilities_test[:,1] >= 0.1).astype(int)
print(confusion_matrix(y_test,predictions_test))
print('\n')
print(classification_report(y_test,predictions_test))

In [None]:
#Better F1-score for positive class with a loss of accuracy ...