In [None]:
#-------------------------IMPORT LIBRARIES-------------------

import pandas as pd
import numpy as np
import seaborn as sns
import plotly as py
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")


In [None]:
#--------------------------------reading and analyzing data-----------------------------

# Load the stroke-prediction CSV (the relative path matches the Kaggle input layout).
dataset = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
dataset.info()

print('-----------------------------------\n')

print(dataset.isnull().sum())
#201 null values in 'bmi'

print('-----------------------------------\n')

#replacing null values of 'bmi' with mean
# The filled column is assigned back explicitly. An in-place call on
# `dataset.bmi` is chained assignment: with pandas Copy-on-Write it only
# changes a temporary Series and the NaNs would stay in `dataset`.
dataset['bmi'] = dataset['bmi'].fillna(dataset['bmi'].mean())

print(dataset.isnull().sum())
#no null values

print('-----------------------------------\n')

#describing the dataset
print(dataset.describe())

print('-----------------------------------\n')

print(dataset.stroke.value_counts())
#imbalanced

print('-----------------------------------\n')


In [None]:
#-------------------------------removing outliers and redundant columns----------------------------

#redundant
# 'id' is removed from the working frame.
dataset = dataset.drop(columns='id')

#----------outliers for bmi----------
sns.boxplot(y='bmi', data=dataset)
plt.title('Boxplot of bmi')
plt.show()

# Walk the upper tail: percentiles 99.0 through 100.0 in steps of 0.1.
for step in np.arange(0, 1.1, 0.1):
    pct = 99 + step
    bmi_at_pct = np.percentile(dataset.bmi, pct)
    print(f'The {pct}th percentile of BMI is: {bmi_at_pct}')

#99.9% of people have BMI less than 65
# Rows with bmi above 65 are dropped by index label.
extreme_bmi_rows = dataset[dataset.bmi > 65].index
dataset = dataset.drop(index=extreme_bmi_rows)

#-----------outliers for avg glucose level----------
sns.boxplot(y='avg_glucose_level', data=dataset)
plt.title('Boxplot of avg_glucose_level')
plt.show()

# Same upper-tail walk for the glucose column; nothing is removed here.
for step in np.arange(0, 1.1, 0.1):
    pct = 99 + step
    glucose_at_pct = np.percentile(dataset.avg_glucose_level, pct)
    print(f'The {pct}th percentile of Average Glucose Level is: {glucose_at_pct}')

#close values

#----------outliers for age-------------
sns.boxplot(y='age', data=dataset)
plt.title('Boxplot of age')
plt.show()

#no outliers

In [None]:
#----------------determining the types of columns----------------------
# Boolean mask over the columns: True where the column dtype is "object".
categorical = dataset.dtypes.eq("object")
categorical_list = [column for column, is_object in categorical.items() if is_object]

print("Categorical variables:")
print(categorical_list)

print('-----------------------------------\n')

# Only float64 columns are selected here; integer columns are not included.
numerical = dataset.dtypes.eq("float64")
numerical_list = [column for column, is_float in numerical.items() if is_float]

print("Numerical variables:")
print(numerical_list)

print('-----------------------------------\n')

In [None]:
#----------------DATA COPY FOR VISUALIZATION--------------------

# Work on a copy so the readable labels written below do not alter `dataset`.
StrokeAnalysis = dataset.copy()

# Replace the coded values with display labels. Anything other than the
# matched value falls into the second label, as an if/else would.
StrokeAnalysis['hypertension'] = np.where(StrokeAnalysis['hypertension'] == 1, 'Hypertension', 'No Hypertension')
StrokeAnalysis['heart_disease'] = np.where(StrokeAnalysis['heart_disease'] == 1, 'Heart Disease', 'No Heart Disease')
StrokeAnalysis['stroke'] = np.where(StrokeAnalysis['stroke'] == 1, 'Suffered Stroke', 'Never Suffered Stroke')
StrokeAnalysis['ever_married'] = np.where(StrokeAnalysis['ever_married'] == 'Yes', 'Married', 'Unmarried')


#NO HYPERTENSION VS HYPERTENSION & NO HEART DISEASE VS HEART DISEASE

# One row, two panels: one pie chart per column.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))

for ax, column in zip(axes, ['hypertension', 'heart_disease']):
    # Count once per column. Labels and counts are read from the result as
    # whole sequences, because an integer key on this string-labelled Series
    # is not a reliable positional lookup (the fallback is deprecated).
    counts = StrokeAnalysis[column].value_counts()
    label = counts.index.tolist()
    value = counts.tolist()

    # The explode tuple has two entries, matching the two categories per column.
    ax.pie(value, labels=label, autopct='%1.2f%%', colors=['darkslategrey', 'paleturquoise'],
           shadow=True, explode=(0.1, 0.2))
    ax.set_title("{} VS {}".format(label[0], label[1]))

fig.tight_layout(pad=0.4)

#The proportion of patients with hypertension and heart disease is very low.

In [None]:
#MARRIED VS UNMARRIED & URBAN VS RURAL

# One row, two panels: one pie chart per column.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))

for ax, column in zip(axes, ['ever_married', 'Residence_type']):
    # Count once per column. Labels and counts are taken as whole sequences
    # instead of `value_counts()[j]`: an integer key on a string-labelled
    # Series is not a reliable positional lookup (the fallback is deprecated).
    counts = StrokeAnalysis[column].value_counts()
    label = counts.index.tolist()
    value = counts.tolist()

    # The explode tuple has two entries, matching the two categories per column.
    ax.pie(value, labels=label, autopct='%1.2f%%', colors=['darkslategrey', 'paleturquoise'],
           shadow=True, explode=(0.1, 0.2))
    ax.set_title("{} VS {}".format(label[0], label[1]))

fig.tight_layout(pad=0.4)

#Married patients far outnumber unmarried ones. The proportions of rural and urban patients are almost equal.

In [None]:
#WORK TYPE

plt.subplot(1, 1, 1)

# Count each work type once and reuse the result for labels and slice sizes.
work_type_counts = StrokeAnalysis['work_type'].value_counts()
label = work_type_counts.index
value = work_type_counts.values

# Five colours are supplied, one per slice in value_counts order.
work_type_colors = ['deepskyblue', 'steelblue', 'lightslategrey', 'skyblue', 'crimson']
plt.pie(
    value,
    labels=label,
    autopct='%1.2f%%',
    colors=work_type_colors,
    shadow=True,
    explode=None,
)
plt.title("Work Types")

plt.tight_layout(pad=0.4)

#--------------------visualization of the distribution of numerical columns------------------

# One 50-bin histogram per column in numerical_list, each on its own figure.
for column in numerical_list:
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.hist(dataset[column], bins=50)
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    ax.set_title("{} Distribution with Histogram".format(column))
    plt.show()

#-----------------------------visualization of stroke vs other columns----------------------------

# Every bar is labelled with its share of all rows in `dataset`.
total_rows = len(dataset)

for column in categorical_list:
    ax = sns.countplot(x=column, hue="stroke", data=dataset)
    ax.set_title("Effect of {} on Stroke".format(column))

    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2
        share = round(bar_height / total_rows * 100, 2)
        # The label sits 8 points above the top centre of the bar.
        ax.annotate(
            f'{share} %',
            xy=(bar_center_x, bar_height),
            ha='center',
            va='center',
            size=13,
            xytext=(0, 8),
            textcoords='offset points',
        )
    plt.show()

#----------------------------------heatmap correlation----------------------------------

plt.figure(figsize = (8,6))
# Correlate the numeric columns only. `dataset` still holds string columns at
# this point, and DataFrame.corr() on pandas >= 2.0 raises on them instead of
# skipping them.
numeric_corr = dataset.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="Purples")
plt.show()


# Feature matrix = every column except the last; target vector = the last column.
x, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

In [None]:
#-------------------ENCODING------------------------

# Features and target are rebuilt from `dataset` by column name, so this cell
# can be re-run safely and does not depend on hard-coded column positions.
features = dataset.drop(columns='stroke')
y = dataset['stroke'].values

#label encoding of binary columns (ever_married, residence_type)
# Done by name before one-hot encoding, so the result cannot land on the wrong
# column if the number of one-hot categories changes.
le = LabelEncoder()
for binary_column in ['ever_married', 'Residence_type']:
    features[binary_column] = le.fit_transform(features[binary_column])

#one-hot encoding of categorical data (gender, work_type, smoking status)
# sparse_threshold=0 forces a dense result, so np.array() always yields a
# regular 2-D array. The remaining columns pass through in their original order.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['gender', 'work_type', 'smoking_status'])],
                       remainder='passthrough', sparse_threshold=0)
x = np.array(ct.fit_transform(features))

print('Shape of X: ', x.shape)
print('Shape of Y: ', y.shape)

print('-----------------------------------\n')

#--------------------------------splitting the dataset into the training set and test set-------------------------

# stratify=y keeps the class ratio in both parts. random_state is fixed so the
# split, and every metric computed from it, is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, stratify=y, random_state=0)

print("x_train shape: ",x_train.shape)
print("x_test shape: ",x_test.shape)
print("y_train shape: ",y_train.shape)
print("y_test shape: ",y_test.shape)

print('-----------------------------------\n')

#----------------SCALING----------------

# The scaler is fitted on the training part only and then applied to the test part.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


In [None]:
#--------------------MODEL SELECTION-------------------------

# [display name, estimator] pairs; every estimator that accepts a seed uses random_state=0.
# SGDClassifier uses loss='log_loss': the old spelling 'log' was removed in
# scikit-learn 1.3 (the new name needs scikit-learn >= 1.1).
models = [['Logistic Regreesion', LogisticRegression(random_state=0)],
          ['SGD Classifier', SGDClassifier(loss='log_loss', n_jobs=-1, random_state=0)],
          ['SVM', SVC(random_state=0)],
          ['KNeighbors Classifier', KNeighborsClassifier()],
          ['GaussianNB', GaussianNB()],
          ['BernoulliNB', BernoulliNB()],
          ['Decision Tree Classifier', DecisionTreeClassifier(random_state=0)],
          ['Random Forest Classifier', RandomForestClassifier(random_state=0)]]

# One record per model, turned into a DataFrame once after the loop.
model_scores = []

for name, model in models:

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    #Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    #Hold-out accuracy, computed once and reused for printing and the summary table
    accuracy = accuracy_score(y_test, y_pred)
    #K-Fold Validation (10 folds on the training part)
    accuracies = cross_val_score(estimator = model, X = x_train, y = y_train, cv = 10)
    #Precision Score
    # zero_division=0 makes the "no positive predictions" case an explicit 0.
    precision = precision_score(y_test, y_pred, zero_division=0)
    #Recall Score
    recall = recall_score(y_test, y_pred)
    #F1 Score
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(name,':',"\n")
    print("Confusion matrix:\n ")
    print(cm,"\n")
    print('Accuracy Score: ',accuracy)
    print("\nK-Fold Validation Mean Accuracy: {:.2f} %".format(accuracies.mean()*100),"\n")
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100),"\n")
    print('Precision: {:.2f}'.format(precision),"\n")
    print('Recall: {:.2f}'.format(recall),"\n")
    print('F1: {:.2f}'.format(f1),"\n")
    print('-----------------------------------\n')

    model_scores.append({
        'Model': name,
        'Accuracy Score': accuracy*100,
        'K-Fold Mean Accuracy': accuracies.mean()*100,
        'Std. Deviation': accuracies.std()*100,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
    })


# Summary table, best hold-out accuracy first (ties broken by mean CV accuracy).
df = pd.DataFrame(model_scores, columns= ['Model', 'Accuracy Score', 'K-Fold Mean Accuracy', 'Std. Deviation', 'Precision', 'Recall', 'F1'])
df = df.sort_values(by= ['Accuracy Score', 'K-Fold Mean Accuracy'], ascending= False)
print(df)


In [None]:
#----------------sunburst: distribution of stroke by gender----------------

# Row count per (gender, stroke) pair, excluding the "Other" gender value.
dst_st_gen = (
    dataset.query('gender != "Other"')
    .groupby(['gender', 'stroke'])
    .size()
    .reset_index(name='count')
)

# Relabel by value, not by row position. This stays correct if a
# gender/stroke combination is missing and avoids writing strings into an
# integer column in place.
dst_st_gen['stroke'] = dst_st_gen['stroke'].map({0: "didn't have a stroke", 1: "had a stroke"})

fig = px.sunburst(dst_st_gen, path = ['gender', 'stroke'], values = 'count', color = 'gender',
                 color_discrete_map = {'Female': '#e381bc', 'Male': '#81a8e3'}, width = 700, height = 700)

# The title is drawn as an annotation placed above the plot area.
fig.update_layout(annotations = [dict(text = 'Distribution of stroke by gender',
                                      x = 0.5, y = 1.1, font_size = 22, showarrow = False,
                                      font_family = 'Arial Black',
                                      font_color = 'black')])

fig.update_traces(textinfo = 'label + percent parent')

fig.show()