# **TABLE OF CONTENTS**

**0 PROLOGUE**
 
**1 IMPORTING LIBRARIES**

**2 DATA DESCRIPTION AND DATA CLEANING**

*2.1 Import Data*

*2.2 Data types*

*2.3 Missing values*

*2.4 Duplicates*

**3 VISUALIZATION**

**4 PREDICTION**


> **0 PROLOGUE**

In this work, exploratory data analysis is carried out on World Health Organization (WHO) stroke data.

Features

* id: unique identifier
* gender: "Male", "Female" or "Other"
* age: age of the patient
* hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* ever_married: "No" or "Yes"
* work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* Residence_type: "Rural" or "Urban"
* avg_glucose_level: average glucose level in blood
* bmi: body mass index
* smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"
* stroke: 1 if the patient had a stroke or 0 if not


> **1 IMPORTING LIBRARIES**

1. Library **pandas** will be required to work with data in tabular representation.

2. Library **numpy** will be required to round the data in the correlation matrix.

3. Libraries **matplotlib** and **seaborn** will be required for data visualization.


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


> **2 DATA DESCRIPTION AND DATA CLEANING**

In this block, exploratory data analysis is carried out: data types, missing values, duplicates, and the relationships between variables are described.

**2.1 Import Data**

In [None]:
# Read the stroke dataset from the Kaggle input directory.
df_stroke = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df = df_stroke.copy() # work on a copy so the frame as loaded stays untouched
df.head() # preview the first five rows

**2.2 Describe Data**

In [None]:
# List the column names of the working frame.
df.columns

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row.
df.describe().transpose()

In [None]:
# Remove the `id` column from the working frame in place.
df.drop(columns=['id'], inplace=True)

**2.3 Missing Data**

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing BMI values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated by pandas, and leaves `df` unchanged under Copy-on-Write.
df["bmi"] = df["bmi"].fillna(df["bmi"].mean())

In [None]:
# Fold the "Other" gender category into "Male", leaving two gender values.
df["gender"] = df["gender"].replace({"Other": "Male"})

In [None]:
# Number of distinct gender values remaining in the frame.
df["gender"].nunique()

**3. VISUALIZATION**

In [None]:
# Horizontal bar chart of how often each work type occurs.
# The title is Turkish for "Frequencies of the working class".
work_type_frequencies = df["work_type"].value_counts()
frequency_axes = work_type_frequencies.plot.barh()
frequency_axes.set_title("Çalışan Sınıfının Frekansları");

In [None]:
# Work-type counts, one panel (and one colour) per gender.
sns.displot(data=df, x="work_type", col="gender", hue="gender", shrink=1);

In [None]:
# Mean of every numeric column per work type.
# numeric_only=True is required because the frame still holds string columns
# (e.g. gender, smoking_status); without it recent pandas raises a TypeError.
df.groupby(["work_type"]).mean(numeric_only=True)

In [None]:
# Histogram of average glucose level (bins 10 units wide), one panel per work type.
sns.displot(data=df, x="avg_glucose_level", col="work_type", hue="work_type", binwidth=10);

In [None]:
# BMI against average glucose level, coloured by stroke outcome.
sns.relplot(data=df, x = "avg_glucose_level", y="bmi", hue = "stroke" );

In [None]:
# Three pie charts in the top row of a 3x3 grid: work type, gender, stroke.
plt.figure(1, figsize=(20, 10))
the_grid = GridSpec(3, 3)

# Eight evenly spaced colours from the 'Spectral' colormap for the work-type pie.
cmap = plt.get_cmap('Spectral')
colors = [cmap(position) for position in np.linspace(0, 1, 8)]
gender_colors = ["hotpink", "b"]
stroke_colors = ["g", "r"]

# Count each column once; the index of the counts provides the slice labels.
hyper_values = df['work_type'].value_counts()
label_hyper = hyper_values.index
gender_values = df['gender'].value_counts()
label_gender = gender_values.index
stroke_values = df['stroke'].value_counts()
label_stroke = stroke_values.index

# (grid column, title, slice sizes, slice labels, slice colours)
pie_specs = [
    (0, 'Work Type of Pie', hyper_values, label_hyper, colors),
    (1, 'Gender of Pies', gender_values, label_gender, gender_colors),
    (2, 'Stroke Rate', stroke_values, label_stroke, stroke_colors),
]
for grid_column, pie_title, sizes, slice_labels, slice_colors in pie_specs:
    plt.subplot(the_grid[0, grid_column], aspect=1, title=pie_title)
    hyper_pie = plt.pie(sizes, labels=slice_labels, autopct='%1.1f%%', shadow=True, colors=slice_colors)

In [None]:
# Horizontal bar chart of the number of patients per work type.
# The counts are computed here so the cell does not rely on variables left
# behind by another cell. `data=` is not passed because x and y are already
# vectors (one entry per work type), whose length differs from that of `df`.
work_type_counts = df['work_type'].value_counts()
sns.barplot(x=work_type_counts, y=work_type_counts.index, palette='Spectral');

In [None]:
# Histogram of average glucose level with the two genders stacked on top of each other.
sns.histplot(data=df, x="avg_glucose_level", hue="gender", multiple="stack");


In [None]:
# Age distribution of stroke and non-stroke patients as overlaid density curves.
fig1 = plt.figure()
ax1 = fig1.add_subplot(1, 1, 1)
# The title is drawn as text in data coordinates, to the upper left of the curves.
ax1.text(-20,0.045,'Age & Stroke Relationship',fontsize=18,fontweight='bold', fontfamily='monospace')
# `fill` replaces the deprecated `shade`. No `hue` is passed: seaborn ignores
# `color` whenever `hue` is set, so each group gets an explicit label instead.
sns.kdeplot(data=df[df.stroke==1],x='age',fill=True,ax = ax1,color='lightcoral',alpha=1,label='Stroke')
sns.kdeplot(data=df[df.stroke==0],x='age',fill=True,ax = ax1,color='palegreen',alpha=0.5,label='No stroke')
ax1.legend();


In [None]:
# Average glucose level distribution of stroke and non-stroke patients.
fig1 = plt.figure()
ax2 = fig1.add_subplot(1, 1, 1)
# The title is drawn as text in data coordinates, to the upper left of the curves.
ax2.text(-20,0.020,'Glucose Level & Stroke Relationship',fontsize=18,fontweight='bold', fontfamily='monospace')
# `fill` replaces the deprecated `shade`. No `hue` is passed: seaborn ignores
# `color` whenever `hue` is set, so each group gets an explicit label instead.
sns.kdeplot(data=df[df.stroke==0],x='avg_glucose_level',fill=True,ax = ax2,color='palegreen',alpha=0.5,label='No stroke')
sns.kdeplot(data=df[df.stroke==1],x='avg_glucose_level',fill=True,ax = ax2,color='Red',alpha=0.6,label='Stroke')
ax2.legend();

In [None]:
# BMI distribution of stroke and non-stroke patients as overlaid density curves.
fig1 = plt.figure()
ax3 = fig1.add_subplot(1, 1, 1)
# The title is drawn as text in data coordinates, to the upper left of the curves.
ax3.text(-10,0.10,'BMI & Stroke Relationship',fontsize=18,fontweight='bold', fontfamily='monospace')
# `fill` replaces the deprecated `shade`. No `hue` is passed: seaborn ignores
# `color` whenever `hue` is set, so each group gets an explicit label instead.
sns.kdeplot(data=df[df.stroke==0],x='bmi',fill=True,ax = ax3,color='yellow',alpha=0.5,label='No stroke')
sns.kdeplot(data=df[df.stroke==1],x='bmi',fill=True,ax = ax3,color='lightcoral',alpha=0.6,label='Stroke')
ax3.legend();

In [None]:
# Density of the hypertension flag for stroke and non-stroke patients.
fig1 = plt.figure()
ax4 = fig1.add_subplot(1, 1, 1)
# `fill` replaces the deprecated `shade`. No `hue` is passed: seaborn ignores
# `color` whenever `hue` is set, so each group gets an explicit label instead.
sns.kdeplot(data=df[df.stroke==0],x='hypertension',fill=True,ax = ax4,color='Purple',alpha=0.5,label='No stroke')
sns.kdeplot(data=df[df.stroke==1],x='hypertension',fill=True,ax = ax4,color='lightcoral',alpha=0.6,label='Stroke')
ax4.legend()
fig1.suptitle('Hypertension & Stroke Relationship', fontsize=14, fontweight='bold');

In [None]:
# Density of the heart-disease flag for stroke and non-stroke patients.
fig5 = plt.figure()
ax5 = fig5.add_subplot(1, 1, 1)
# `fill` replaces the deprecated `shade`. No `hue` is passed: seaborn ignores
# `color` whenever `hue` is set, so each group gets an explicit label instead.
sns.kdeplot(data=df[df.stroke==0],x='heart_disease',fill=True,ax = ax5,color='palegreen',alpha=0.5,label='No stroke')
sns.kdeplot(data=df[df.stroke==1],x='heart_disease',fill=True,ax = ax5,color='lightcoral',alpha=0.6,label='Stroke')
ax5.legend()
fig5.suptitle('Heart Disease & Stroke Relationship', fontsize=14, fontweight='bold');

In [None]:
# Two stacked scatter plots of BMI against average glucose level:
# the upper one is coloured and styled by gender, the lower one by stroke outcome.
scatter_axes = {"x": "avg_glucose_level", "y": "bmi"}

plt.subplot(2, 1, 1)
plt.title('Stroke Sample Distribution Based On Bmi And Glucose Level')
ax1 = sns.scatterplot(data=df, hue="gender", style="gender", **scatter_axes)

plt.subplot(2, 1, 2)
plt.title('Stroke')
ax2 = sns.scatterplot(data=df, hue="stroke", style="stroke", **scatter_axes)

plt.tight_layout()
plt.show()

In [None]:
# BMI against average glucose level, coloured by stroke outcome, one panel per gender.
# NOTE(review): `dashes` and `markers` control how levels of the `style`
# variable are drawn, and no `style` is assigned here — confirm they are needed.
sns.relplot(data=df, x="avg_glucose_level", y="bmi",hue="stroke", dashes=False, markers=True,col="gender");

In [None]:
# Patient counts per hypertension flag, split by stroke outcome.
sns.countplot(x = "hypertension", data=df,hue="stroke")

In [None]:
# Patient counts per gender, split by stroke outcome.
sns.countplot(x = "gender", data=df,hue="stroke");

In [None]:
# Patient counts per heart-disease flag, split by stroke outcome.
sns.countplot(x = "heart_disease", data=df,hue="stroke");

In [None]:
# Three stacked panels comparing healthy and stroke patient counts by
# gender, work type and smoking status.
fig = plt.figure(figsize = (15,15),dpi = 40)

ax1 = fig.add_subplot(3,1,1)
ax2 = fig.add_subplot(3,1,2)
ax3 = fig.add_subplot(3,1,3)

stroke_col = '#fe346e'
healthy_col = '#2c003e'

title_args = {'font':'Serif', 'weight':'bold','color': 'black', 'size':24}
font_dict = {'size':16, 'family':'Serif', 'color':'black', 'weight':'bold'}
health_dict = {'font':'Serif', 'color': '#2c003e', 'size':15, 'weight':'bold'}
dash_dict = {'font':'Serif', 'color': 'black', 'size':15,'weight':'bold'}
stroke_dict = {'font':'Serif', 'color': '#fe346e', 'size':15,'weight':'bold'}


def _counts_by_outcome(column):
    """Return (healthy, stroke) value counts of `column` in one shared category order.

    The two groups are counted separately, so their value_counts() can come
    back in different orders, and a category with no cases is missing from
    its series. Both series are reindexed onto the same categories (healthy
    order first, then any stroke-only category, zero where absent) so that
    bar i of each series refers to the same category.
    """
    healthy = df.loc[df['stroke'] == 0, column].value_counts()
    stroke = df.loc[df['stroke'] == 1, column].value_counts()
    categories = healthy.index.union(stroke.index, sort=False)
    return (healthy.reindex(categories, fill_value=0),
            stroke.reindex(categories, fill_value=0))


#ax1 : gender

healthy_gen, stroke_gen = _counts_by_outcome('gender')
positions = np.arange(len(healthy_gen))

# Thin bar for healthy patients, thicker bar for stroke patients, same position.
ax1.barh( positions , width = healthy_gen.values, height = 0.2, color = healthy_col)
ax1.barh( positions , width = stroke_gen.values, height = 0.5, color = stroke_col)

# Fix the tick positions before labelling them so every label sits on its bar.
ax1.set_yticks(positions)
ax1.set_yticklabels(healthy_gen.index, **font_dict)
ax1.axes.get_yaxis().set_visible(True)
ax1.axes.get_xaxis().set_visible(True)
ax1.spines['bottom'].set_visible(True)
ax1.spines['left'].set_visible(True)
# Title and legend are drawn as text in data coordinates.
ax1.text(-5,2, 'Gender Risk',**title_args)
ax1.text(0,1.35, 'Healthy',**health_dict)
ax1.text(790,1.35, '|',**dash_dict)
ax1.text(870,1.35, 'Stroke',**stroke_dict)


#ax2 : work-type

healthy_gen, stroke_gen = _counts_by_outcome('work_type')
positions = np.arange(len(healthy_gen))

ax2.bar( positions , height = healthy_gen.values, width = 0.2, color = healthy_col)
ax2.bar( positions , height = stroke_gen.values, width = 0.5, color= stroke_col)

# Display labels are looked up per category rather than listed in an assumed
# order, so each label always matches the bar drawn at its position.
work_type_labels = {'Private':'Private', 'Self-employed':'Self-Employed', 'children':'Children',
                    'Govt_job':'Gov-Job', 'Never_worked':'Never worked'}
ax2.set_xticks(positions)
ax2.set_xticklabels([work_type_labels.get(category, category) for category in healthy_gen.index], **font_dict)


ax2.text(-3,1600, 'Employment Risk',**title_args)
ax2.text(-0.45,2950, 'Healthy',**health_dict)
ax2.text(0.18,2950, '|',**dash_dict)
ax2.text(0.25,2950, 'Stroke',**stroke_dict)


#ax3-smoking

healthy_gen, stroke_gen = _counts_by_outcome('smoking_status')
positions = np.arange(len(healthy_gen))

ax3.bar( positions, height = healthy_gen.values, width = 0.2,color = healthy_col)
ax3.bar( positions , height = stroke_gen.values, width = 0.5,color= stroke_col)

smoking_labels = {'never smoked':'Never Smoked', 'Unknown':'Unknown',
                  'formerly smoked':'Formaly Smoked', 'smokes':'Smokes'}
ax3.set_xticks(positions)
ax3.set_xticklabels([smoking_labels.get(category, category) for category in healthy_gen.index], **font_dict)

ax3.text(-2.5,1000, 'Smoking Status And Risk',**title_args)
ax3.text(-0.4,1900, 'Healthy',**health_dict)
ax3.text(0.095,1900, '|',**dash_dict)
ax3.text(0.18,1900, 'Stroke',**stroke_dict)


> **4.PREDICTION**

In [None]:
# Encode the categorical features as small integer codes.
# A signed dtype is required: the mappings use negative codes (-1, -2), which
# an unsigned 8-bit cast would silently wrap around to 255 and 254.
df['gender'] = df['gender'].replace({'Male':0,'Female':1,'Other':-1}).astype(np.int8)
df['Residence_type'] = df['Residence_type'].replace({'Rural':0,'Urban':1}).astype(np.int8)
df['work_type'] = df['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':-1,'Never_worked':-2}).astype(np.int8)

In [None]:
# Feature matrix and target used by the classifiers below.
X  = df[['gender','age','hypertension','heart_disease','work_type','avg_glucose_level','bmi']]
y = df['stroke']

# creating dataset split for prediction
from sklearn.model_selection import train_test_split
# 80-20 split, stratified on the target so that the train and test sets keep
# the same proportion of stroke and non-stroke patients.
X_train, X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

# Checking split
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

In [None]:
# Preview the first five rows of the test features.
X_test.head(5)

In [None]:
>>> from sklearn.svm import SVC
>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import cross_val_score
>>> from sklearn.datasets import make_classification
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.pipeline import Pipeline
svc_pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

X_train.fillna(X_train.mean(), inplace=True)
y_train.fillna(X_train.mean(),inplace=True)
svc_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
X_test.fillna(X_test.mean(), inplace=True)
y_test.fillna(X_test.mean(),inplace=True)
svc_pipe.score(X_test, y_test)


In [None]:
# Standardise the features, then fit a random forest (seeded for reproducibility).
rf_pipe = Pipeline([('scaler', StandardScaler()), ('rf',RandomForestClassifier(random_state=42))])
rf_pipe.fit(X_train,y_train)
# Mean accuracy on the held-out test split.
rf_pipe.score(X_test,y_test)

In [None]:
# Standardise the features, then fit a logistic regression classifier.
logistic_reg_pipe = Pipeline(steps = [('scale',StandardScaler()),('LR',LogisticRegression(random_state=42))])
logistic_reg_pipe.fit(X_train,y_train)
# Mean accuracy on the held-out test split.
logistic_reg_pipe.score(X_test,y_test)

**The Cross-Validation of Classification and Regression models**

In [None]:
# 10-fold cross-validation of the random forest pipeline on the full X, y.
rf_cv=cross_val_score(rf_pipe,X,y,cv=10)


In [None]:
# 10-fold cross-validation of the SVC pipeline (scaler + SVC) on the full X, y.
svc_cv = cross_val_score(svc_pipe,X,y,cv=10)


In [None]:
# 10-fold cross-validation of the logistic regression pipeline on the full X, y.
logistic_reg_cv =cross_val_score(logistic_reg_pipe,X,y,cv=10)


In [None]:
# Print the per-fold cross-validation scores of each model, one model per line.
cv_results = (
    ('Random Forest :', rf_cv),
    ('SVM  :', svc_cv),
    ('Logistic Regression  :', logistic_reg_cv),
)
for model_label, fold_scores in cv_results:
    print(model_label, fold_scores)