# **Library Import**

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.svm import SVC
warnings.filterwarnings("ignore")

# **Data Preprocessing & Visualization**

In [None]:
# Read the stroke CSV, then discard its first column by position.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
full_table = pd.read_csv(csv_path)
raw_data = full_table.iloc[:, 1:]

In [None]:
# Show the loaded frame; the cell's last expression is rendered as its output.
raw_data

In [None]:
# Print the column summary (dtypes and non-null counts) to spot missing values.
raw_data.info()

**The summary above shows that the `bmi` column contains missing values (NAs).** <br><br/>
So we fill each NA with the BMI value at the peak of the estimated probability density (i.e. the most frequent BMI) of the record's age interval.<br><br/>
We also drop the rows whose gender is "Other".

In [None]:
# Keep every row whose gender is not 'Other' (same rows the in-place drop kept).
keep_rows = raw_data['gender'] != 'Other'
raw_data = raw_data.loc[keep_rows].copy()

Add an age-group label to each record.

In [None]:
# Labels for the five 20-year age bands delimited by `age_bin`.
age_interval = ['Teenager','Adult','Middleage','Elderly','Old']
age_bin = [0,20,40,60,80,100]

# pd.cut uses right-closed intervals by default: (0, 20], (20, 40], ...
age_set = pd.cut(raw_data['age'], bins=age_bin, labels=age_interval)

# `assign` overwrites an existing `age_interval` column instead of appending a
# second one, so re-running this cell leaves the column layout unchanged
# (the previous pd.concat(axis=1) duplicated the column on every re-run).
raw_data = raw_data.assign(age_interval=age_set)
raw_data

In [None]:
# Filled KDE of BMI, one curve per age interval.
sns.displot(
    data=raw_data,
    x="bmi",
    hue="age_interval",
    kind="kde",
    fill=True,
)

**Fill the NAs with the BMI value of highest density within each age group, keeping one data frame per age group.**

In [None]:
def fill_bmi_with_group_mode(group):
    """Return a copy of `group` whose missing `bmi` values are filled.

    The fill value is the most frequent BMI after truncating the observed
    values to integers (np.bincount + np.argmax).

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single age interval; must contain a `bmi` column.

    Returns
    -------
    pd.DataFrame
        Independent copy of `group` with NaNs in `bmi` replaced. If the group
        has no observed BMI at all, the copy is returned unchanged.
    """
    filled = group.copy()
    observed = filled['bmi'].dropna()
    if observed.empty:
        # np.argmax on an empty bincount would raise; nothing to infer from.
        return filled
    most_common_bmi = np.argmax(np.bincount(observed.astype(np.int32)))
    filled['bmi'] = filled['bmi'].fillna(most_common_bmi)
    return filled


# One imputed frame per age label, in the same order as `age_interval`.
frame_list = [
    fill_bmi_with_group_mode(raw_data[raw_data['age_interval'] == label])
    for label in age_interval
]

# Bind the per-group names to the imputed frames. Previously they were created
# as empty DataFrames and never updated, because the loop only rebound the
# slots of `frame_list`.
Teenager_data, Adult_data, Middleage_data, Elderly_data, Old_data = frame_list

frame_list[3]

In [None]:
# Stroke counts split by marital status, hypertension and heart disease.
# Set the style before the axes are created so all three panels pick it up.
sns.set_style(style="darkgrid")

# Create exactly the three panels that are drawn; mixing plt.subplots() with
# plt.subplot() can leave an extra full-figure axes behind the panels.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

medical_panels = [
    ("ever_married", "Paired"),
    ("hypertension", "crest"),
    ("heart_disease", "ocean"),
]
for ax, (column, palette) in zip(axes, medical_panels):
    # The column is passed as `x=`: newer seaborn releases no longer accept
    # it as a positional argument.
    sns.countplot(x=column, data=raw_data, palette=palette, hue="stroke", ax=ax)

In [None]:
# Stroke counts split by gender, work type and residence type.
# Use a bare figure (same size as before) and add only the panels that are
# drawn, so no extra full-figure axes is left behind them.
fig = plt.figure(figsize=(20, 10))

demographic_panels = [
    ("gender", "Paired"),
    ("work_type", "crest"),
    ("Residence_type", "ocean"),
]
for position, (column, palette) in enumerate(demographic_panels, start=1):
    # Same 2x3 grid slots (top row) as the original layout.
    ax = fig.add_subplot(2, 3, position)
    # The column is passed as `x=`: newer seaborn releases no longer accept
    # it as a positional argument.
    sns.countplot(x=column, data=raw_data, palette=palette, hue='stroke', ax=ax)

In [None]:
# Stack the five per-age-group frames (positions 0-4 of frame_list) row-wise.
group_frames = list(frame_list[:5])
data = pd.concat(group_frames, axis=0)
data

In [None]:
# KDE distribution plots of each continuous feature, split by stroke outcome.
# `ff` (plotly.figure_factory) is imported in the notebook's import cell.
group_labels = ['0', '1']

# (column, figure title, x-axis title) for each plot, in display order.
distplot_specs = [
    ('age', 'Age & Stroke Distribution', "Age Distribution"),
    ('bmi', 'BMI & Stroke Distribution', "BMI Distribution"),
    ('avg_glucose_level', 'Avg Glucose Level & Stroke Distribution', "Avg_Glucose_Level Distribution"),
]

for column, title, xaxis_title in distplot_specs:
    # One series per outcome, ordered to match `group_labels`.
    values_by_outcome = [
        data.loc[data["stroke"] == 0, column],
        data.loc[data["stroke"] == 1, column],
    ]
    fig = ff.create_distplot(values_by_outcome, group_labels, curve_type='kde', colors=['orange', 'darkblue'])
    fig.update_layout(title_text=title, xaxis_title=xaxis_title, yaxis_title="Frequency")
    fig.show()

**Convert the categorical columns to numeric codes.**

In [None]:
# Integer code tables, keyed by the column they encode.
# NOTE: Series.map turns values missing from the table into NaN, so running
# this cell a second time on already-encoded data would blank these columns.
category_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'ever_married': {'Yes': 1, 'No': 0},
    'work_type': {'Private': 3, 'Self-employed': 4, 'Govt_job': 2, 'children': 1, 'Never_worked': 0},
    'Residence_type': {'Urban': 2, 'Rural': 1},
    'smoking_status': {'formerly smoked': 1, 'never smoked': 2, 'smokes': 3, 'Unknown': 0},
    'age_interval': {'Teenager': 1, 'Adult': 2, 'Middleage': 3, 'Elderly': 4, 'Old': 5},
}
for column_name, codes in category_codes.items():
    data[column_name] = data[column_name].map(codes)
data

In [None]:
# Balance the classes by undersampling: keep every stroke case and draw an
# equally sized random sample of non-stroke cases.
shuffled_data = data.sample(frac=1,random_state=4)
stroke = shuffled_data.loc[shuffled_data["stroke"] == 1]
normal_pool = shuffled_data.loc[shuffled_data["stroke"] == 0]

# Derive the sample size from the data instead of hard-coding 249, so the two
# classes stay balanced if the input changes. It is capped at the pool size
# because sampling without replacement cannot exceed it.
n_normal = min(len(stroke), len(normal_pool))
normal = normal_pool.sample(n=n_normal,random_state=50)

merged_data = pd.concat([stroke,normal],axis=0)
merged_data

In [None]:
# Pearson correlation heatmap of the balanced data set.
# `make_subplots` and `go` are imported in the notebook's import cell.

# Blue (low) -> red (high) colorscale. Plotly expects stops in ascending order,
# starting at 0 and ending at 1; the (value, colour) pairs are unchanged from
# the previous descending list.
colorscale_1 = [
    [0.0, "#313695"],
    [0.1111111111111111, "#74ADD1"],
    [0.2222222222222222, "#ACE4FF"],
    [0.3333333333333333, "#E3FFFF"],
    [0.4444444444444444, "#F3F0E2"],
    [0.5555555555555556, "#FEE090"],
    [0.6666666666666666, "#FDAE61"],
    [0.7777777777777778, "#F46D43"],
    [0.8888888888888888, "#D73027"],
    [1.0, "#A50026"],
]

# Cast to float first: `age_interval` is categorical, and whether `corr`
# keeps or drops a non-numeric column depends on the pandas version.
corr_matrix = merged_data.astype(float).corr(method='pearson')

fig = make_subplots(rows=1, cols=1)
fig.add_trace(
    go.Heatmap(
        x=corr_matrix.columns,
        y=corr_matrix.index,
        z=corr_matrix.values,
        name='pearson',
        xgap=1,
        ygap=1,
        colorscale=colorscale_1,
    ),
    row=1, col=1,
)
fig.update_layout(height=800, width=900)
fig.show()

In [None]:
# Feature columns used by the models.
columns = ['gender','age','hypertension','heart_disease','avg_glucose_level','bmi','smoking_status','age_interval']
x = merged_data[columns]

# Select the target by name rather than by position: `iloc[:, -2]` silently
# picks a different column as soon as the column layout changes.
y = merged_data['stroke']
x

In [None]:
# Hold out 20% of the balanced data for testing (fixed seed for repeatability).
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=206,
)
x_train, x_test, y_train, y_test = split_parts
x_train

# **Support Vector Machine**

In [None]:
# RBF-kernel SVM: hold-out evaluation followed by repeated random-split CV.
# `SVC` is imported in the notebook's import cell.
svc = SVC(kernel='rbf')
svc.fit(x_train,y_train)
y_predict = svc.predict(x_test)
print("SVM Accuracy:",metrics.accuracy_score(y_test,y_predict))
print("SVM F1 score:",metrics.f1_score(y_test,y_predict))

# Five random 80/20 splits. A fresh, unfitted estimator is passed to each call
# so that `svc` above stays the fitted hold-out model.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=206)
cv_accuracy = cross_val_score(SVC(kernel='rbf'),x,y,cv=cv)
# scoring='f1' is the binary F1 of the positive class, the same metric as
# metrics.f1_score above, so the hold-out and CV numbers are comparable
# (the previous 'f1_macro' averaged over both classes).
cv_f1 = cross_val_score(SVC(kernel='rbf'),x,y,cv=cv,scoring='f1')
print("SVM 5 folds CV Acc:",cv_accuracy.mean())
print("SVM 5 folds F1 score:",cv_f1.mean())

# **Random Forest**

In [None]:
# Random forest: fit on the training split and score on the hold-out split.
forest_params = dict(
    n_estimators=5,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=163,
)
rdf = RandomForestClassifier(**forest_params)
rdf.fit(x_train, y_train)
y_predict = rdf.predict(x_test)

forest_accuracy = metrics.accuracy_score(y_test, y_predict)
forest_f1 = metrics.f1_score(y_test, y_predict)
print("Random Forest Accuracy:", forest_accuracy)
print("Random Forest F1 score:", forest_f1)