In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn
import pandas_profiling as pp
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, roc_curve,precision_recall_curve, auc,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.impute import KNNImputer

from xgboost import XGBClassifier

from catboost import CatBoostClassifier

In [None]:
# Read the raw stroke-prediction CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
df.head()

In [None]:
# Remove the `id` column from the frame, then preview the result.
df = df.drop(columns=['id'])
df.head()

In [None]:
# Print column names, non-null counts and dtypes of the frame.
df.info()

In [None]:
# Count missing values per column (before imputation).
df.isna().sum()

In [None]:
# Fill missing BMI values with the column mean.
# The result is assigned back to the column instead of using
# `df['bmi'].fillna(..., inplace=True)`: that form operates on a temporary
# Series, emits a chained-assignment warning in pandas 2.x and silently does
# nothing once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Count missing values per column again to confirm the BMI imputation.
df.isna().sum()

In [None]:
# Show the class counts of the `stroke` target.
# Computed directly here: the `sizes` variable is only assigned in the next
# cell, so displaying it by name fails with NameError on a fresh kernel.
df['stroke'].value_counts(sort=True)

In [None]:
# Pie chart of the `stroke` class balance in the raw dataset.
sizes = df['stroke'].value_counts(sort=True)
labels = sizes.index  # class values, in the same order as the counts

colors = ["lightblue", "red"]
explode = (0.05, 0)  # pull the first wedge slightly out of the pie

plt.figure(figsize=(7, 7))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.title('Number of stroke in the dataset')
plt.show()

In [None]:
# NOTE(review): `y_smote` is first assigned by `sm.fit_resample(...)` in the
# SMOTE cell further down the notebook, so on a fresh kernel this cell raises
# NameError. It should be moved below that cell.
y_smote

In [None]:
# Pie chart of the class balance after SMOTE oversampling.
# NOTE(review): `y_smote` is assigned in the SMOTE cell further down the
# notebook; this cell only works once that cell has run and should be moved
# below it.
sizes = y_smote.value_counts(sort = True)
colors = ["lightblue","red"]
explode = (0.05,0)

plt.figure(figsize=(7,7))
# Take the wedge labels from the counts themselves rather than from the global
# `labels`, which another cell rebinds to a four-element list (a length
# mismatch with the two wedges drawn here).
plt.pie(sizes, explode=explode, labels=sizes.index, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90,)

plt.title('Number of stroke in the training set after SMOTE')
plt.show()

In [None]:
#****************
# Work on a deep copy so the encodings applied to `en_df` leave `df` untouched.
en_df = df.copy(deep=True)
en_df.head()

In [None]:
# Preview the original (un-encoded) frame.
df.head()

In [None]:
# Integer codes for `smoking_status`; 'Unknown' is mapped to -1.
smoke_to_int = {'never smoked': 0, 'formerly smoked': 1, 'smokes': 2, 'Unknown': -1}

# Direct dict lookup per row: an unexpected category raises KeyError.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-encoded column raises KeyError as well.
en_df['smoking_status'] = list(map(smoke_to_int.__getitem__, en_df['smoking_status']))
en_df.head()


In [None]:
# Integer codes for `work_type`.
work_to_int = {'Never_worked': 0, 'Private': 1, 'Self-employed': 2, 'Govt_job': 3, 'children': 4}

# Direct dict lookup per row: an unexpected category raises KeyError.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-encoded column raises KeyError as well.
en_df['work_type'] = list(map(work_to_int.__getitem__, en_df['work_type']))
en_df.head()

In [None]:
# Binary flags: 1 when the value equals the named category, 0 for anything else.
# NOTE: the columns are overwritten in place; re-running this cell compares the
# already-encoded integers against strings and turns every value into 0.
en_df['gender'] = en_df['gender'].eq('Female').astype('int64')
en_df['Residence_type'] = en_df['Residence_type'].eq('Urban').astype('int64')
en_df['ever_married'] = en_df['ever_married'].eq('Yes').astype('int64')
en_df.head()

In [None]:
# Preview the original frame again; the encodings were applied to `en_df`, not `df`.
df.head()

In [None]:
#  Using label encoding
#le = LabelEncoder()
#en_df = df.apply(le.fit_transform)
#en_df.head()

In [None]:
# Build a pandas-profiling report for the encoded frame and display it as the cell output.
pp.ProfileReport(en_df)

In [None]:
def plot_hist(col, bins=30, title="",xlabel="",ax=None):
    """Draw a density histogram with a KDE overlay for one column.

    Parameters
    ----------
    col : values to plot (passed straight to seaborn).
    bins : number of histogram bins.
    title : text appended to "Histogram of " for the axes title.
    xlabel : label for the x axis.
    ax : matplotlib Axes to draw on; when omitted the current Axes is used.
    """
    # The original default `ax=None` crashed on `ax.set_title`; fall back to
    # the current Axes instead.
    if ax is None:
        ax = plt.gca()
    # `sns.distplot` is deprecated; this is the documented `histplot`
    # equivalent (density-normalised bars plus a KDE curve).
    sns.histplot(col, bins=bins, kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
    ax.set_title(f'Histogram of {title}',fontsize=20)
    ax.set_xlabel(xlabel)
    


In [None]:
# One histogram panel per continuous feature: BMI, age and average glucose level.
fig, axes = plt.subplots(1,3,figsize=(11,7),constrained_layout=True)
plot_hist(df.bmi,
          title='Bmi',
          xlabel="Level of the BMI",
          ax=axes[0])
plot_hist(df.age,
          bins=30,
          title='Age',
          xlabel='Age',
          ax=axes[1])
# This panel plots `avg_glucose_level`; it was previously labelled
# "Serum Creatinine", which is not a column of this dataset.
plot_hist(df.avg_glucose_level,
          title='Avg Glucose Level',
          xlabel='Average glucose level',
          ax=axes[2])

plt.show()

In [None]:
# Horizontal count plot of work type, split by stroke outcome.
sns.catplot(
    data=df,
    kind="count",
    y="work_type",
    hue="stroke",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Horizontal count plot of smoking status, split by stroke outcome.
sns.catplot(
    data=df,
    kind="count",
    y="smoking_status",
    hue="stroke",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Mean stroke rate per gender, one figure per grouping column.
# `sns.catplot` is figure-level and always creates its own figure, so the
# former `plt.figure(figsize=(17,7))` call only produced an extra empty figure
# and has been removed.
for hue_col in ("heart_disease", "Residence_type", "hypertension"):
    sns.catplot(x="gender", y="stroke", hue=hue_col, palette="pastel", kind="bar", data=df)
plt.show()

In [None]:
# Donut chart: stroke vs. no stroke, split by gender.
len_data = len(df)
# Count each gender explicitly. Previously `len_w` held the number of "Male"
# rows and `len_m` the remainder, so the "healthy" counts of men and women
# were swapped. Rows whose gender is neither "Male" nor "Female" are not part
# of any of the four wedges.
len_m = len(df[df["gender"]=="Male"])
len_w = len(df[df["gender"]=="Female"])

men_stroke = len(df.loc[(df["stroke"]==1)&(df['gender']=="Male")])
men_no_stroke = len_m - men_stroke

women_stroke = len(df.loc[(df["stroke"]==1) & (df['gender']=="Female")])
women_no_stroke = len_w - women_stroke

labels = ['Men with stroke','Men healthy','Women with stroke','Women healthy']
values = [men_stroke, men_no_stroke, women_stroke, women_no_stroke]

fig = go.Figure(data=[go.Pie(labels=labels, values=values,textinfo='label+percent',hole=0.4)])
fig.update_layout(
    title_text="Distribution of stroke EVENT according to their gender")
fig.show()

In [None]:
# List the column names of the original frame.
df.columns

In [None]:
# Feature columns used for the correlation chart and, later, for modelling.
features=['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type',
       'smoking_status']
from matplotlib.offsetbox import AnchoredText

# Pearson correlation of every (encoded) feature with the `stroke` target.
correlation_table = []
for cols in features:
    y = en_df["stroke"]
    x = en_df[cols]
    corr = np.corrcoef(x, y)[1][0]
    # Append the record directly. The original bound it to a variable named
    # `dict`, which shadowed the builtin for the rest of the kernel session.
    correlation_table.append({
        'Features': cols,
        'Correlation coefficient' : corr,
        'Feat_type': 'numerical'
    })
dF1 = pd.DataFrame(correlation_table)

# Horizontal bar chart, strongest positive correlation first.
fig = plt.figure(figsize=(10,6), facecolor='#EAECEE')
ax = sns.barplot(x="Correlation coefficient", y="Features",
                     data=dF1.sort_values("Correlation coefficient", ascending=False),
                     palette='viridis', alpha=0.75)
ax.grid()
#ax.set_title("Correlation of numerical features with Target", fontsize=20, y=1.05)

title =  'Correlation features with target'
sub_title = 'In comparison with categorical features \
\nnumericals are less correlated with target.'

# Figure-level title placed above the axes.
plt.gcf().text(0.05, 1.02, title, fontsize=24)
#plt.gcf().text(0.05, 0.9, sub_title, fontsize=14)

# Sub-title in a rounded box anchored just above the top-left of the axes.
at1 = AnchoredText(sub_title,
                   loc='lower left', frameon=True,
                   bbox_to_anchor=(-0.1, 1.01),
                   bbox_transform=ax.transAxes,
                   #prop=dict(size=8),
                   )
at1.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
ax.add_artist(at1)

In [None]:
# Heatmap of the pairwise correlations between all columns of the encoded frame.
plt.figure(figsize=(16, 8))
sns.heatmap(data=en_df.corr(), cmap="Blues");

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble on the selected features to rank them by importance.
X = en_df[features]
y = en_df['stroke']
forest = ExtraTreesClassifier(random_state=0, n_estimators=250)
forest.fit(X, y)

# Ensemble-level importances, plus their spread across the individual trees.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first

# Print the ranking (positional feature index and its importance).
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Bar chart of the impurity-based importances, in ranked order.
ranked_features = np.array(features)[indices]
plt.figure()
plt.title("Feature importances")
sns.barplot(x=ranked_features, y=importances[indices], palette="deep", yerr=std[indices])
plt.xticks(range(X.shape[1]), ranked_features, rotation=60)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
#en_df_imputed = en_df
#imputer = KNNImputer(n_neighbors=4, weights="uniform")
#imputer.fit_transform(en_df_imputed)

In [None]:
#en_df_imputed.isnull().sum()

In [None]:
# Preview the original (un-encoded) frame.
df.head()

In [None]:
#en_df_imputed.head()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
# ## drop target variable for training
# X = en_df.drop(["stroke"],axis = 1)
# y = en_df.pop("stroke")

# ## data split
# X_train, X_test,y_train,y_test = train_test_split(X, y,test_size=0.2,random_state=1)

# ## SMOTE oversampling
# SMOTE_oversample = SMOTE(random_state=1)
# X_train,y_train = SMOTE_oversample.fit_resample(X_train, y_train.ravel())
# Test-set accuracy of each trained model, appended in training order.
models_score = []

def plot_cm(cm,title):
    """Show a 2x2 confusion matrix as an annotated plotly heatmap.

    Parameters
    ----------
    cm : confusion matrix; iterated row by row to build the cell annotations.
    title : text appended to "Confusion matrix " in the figure title.
    """
    class_names = ['No stroke', 'stroke']
    # Annotation text: every count rendered as a string.
    cell_text = [[str(count) for count in row] for row in cm]

    fig = ff.create_annotated_heatmap(
        cm,
        x=class_names,
        y=class_names,
        annotation_text=cell_text,
        colorscale='deep',
    )
    fig.update_layout(title_text='<i><b>Confusion matrix {}</b></i>'.format(title))

    # Axis captions are added as paper-referenced annotations.
    axis_captions = (
        {'x': 0.5, 'y': -0.10, 'text': "Predicted value"},
        {'x': -0.15, 'y': 0.5, 'text': "Real value", 'textangle': -90},
    )
    for caption in axis_captions:
        fig.add_annotation({
            'font': {'color': "black", 'size': 14},
            'showarrow': False,
            'xref': "paper",
            'yref': "paper",
            **caption,
        })

    # Margins leave room for the y-axis caption; the figure size is fixed.
    fig.update_layout(margin={'t': 50, 'l': 20}, width=750, height=750)

    # Turn the colour bar on for the heatmap trace, then render.
    fig['data'][0]['showscale'] = True
    fig.show()

    

X , y = en_df[features],en_df["stroke"]
#data split to Training Set and Test Set
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Oversample the training split only. SMOTE is stochastic, so it is seeded to
# make the resampled set reproducible across kernel restarts.
sm = SMOTE(random_state=42)
x_smote, y_smote = sm.fit_resample(x_train,y_train)

# Report the class balance after oversampling. The bare
# `y_smote.value_counts()` expression in the middle of the cell was never
# displayed.
print(y_smote.value_counts())
sns.histplot(y_smote)

# labels =en_df['stroke'].value_counts(sort = True).index
# sizes = en_df['stroke'].value_counts(sort = True)

# colors = ["lightblue","red"]
# explode = (0.05,0) 
 
# plt.figure(figsize=(7,7))
# plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90,)

# plt.title('Number of stroke in the dataset')
# plt.show()

# create object model
# Small grid search over split criterion and forest size; keep the best model.
# NOTE(review): candidates are ranked on the test split, so the reported test
# accuracy is optimistic. A validation split or cross-validation would be
# cleaner.
# NOTE(review): the models are fitted on `x_train`/`y_train`; the SMOTE
# resampled `x_smote`/`y_smote` computed above are not used. Confirm intent.
best_score = float("-inf")
best_model = None
# The loop variable was misspelled `critetion`, so `criterion` below was undefined.
for criterion in ['gini', 'entropy']:
    for i in range(10, 301, 20):
        RF_model = RandomForestClassifier(n_estimators=i, criterion=criterion, random_state=42)

        # fit the model
        RF_model.fit(x_train,y_train)

        # model score
        RF_test_score = RF_model.score(x_test,y_test)
        if RF_test_score > best_score:
            # `best_score` was never updated before, so every candidate
            # "won" and the tracked best model was meaningless.
            best_score = RF_test_score
            best_model = RF_model

# Report the best model found, not whichever model was fitted last.
RF_model = best_model
predict_train_RF = RF_model.predict(x_train)
predict_test_RF = RF_model.predict(x_test)

# accuracy score
RF_train_score = RF_model.score(x_train,y_train)
RF_test_score = best_score

cm_model = confusion_matrix(y_test, predict_test_RF)

print("Random Forest model")
print(cm_model)
print('Validation Acuuracy: ',accuracy_score(y_test,predict_test_RF))
print('Training Accuracy: ',accuracy_score(y_train, predict_train_RF))
plot_cm(cm_model,title="Random Forest model") #print Confusion Matrix 

models_score.append(accuracy_score(y_test,predict_test_RF))




#----------------------------------------------



# # create object model
# SVC_model = SVC()

# # fit the model
# SVC_model.fit(x_train,y_train)

# # model score
# predict_train_SVC = SVC_model.predict(x_train)
# predict_test_SVC = SVC_model.predict(x_test)

# # accuracy score
# SVM_train_score = SVC_model.score(x_train,y_train)
# SVM_test_score = SVC_model.score(x_test,y_test)

# cm_model1 = confusion_matrix(y_test, predict_test_SVC)

# print("SVC model")
# print(cm_model1)
# print('Validation Acuuracy: ',accuracy_score(y_test, predict_test_SVC))
# print('Training Accuracy: ',accuracy_score(y_train, predict_train_SVC))
# plot_cm(cm_model1,title="SVC model") #print Confusion Matrix 

# models_score.append(accuracy_score(y_test,predict_test_SVC))



In [None]:
# def plot_cm(cm,title):
#     z = cm
#     x = ['No stroke', 'stroke']
#     y = x
#     # change each element of z to type string for annotations
#     z_text = [[str(y) for y in x] for x in z]

#     # set up figure 
#     fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='deep')

#     # add title
#     fig.update_layout(title_text='<i><b>Confusion matrix {}</b></i>'.format(title),
#                       #xaxis = dict(title='x'),
#                       #yaxis = dict(title='x')
#                      )

#     # add custom xaxis title
#     fig.add_annotation({'font':{'color':"black",'size':14},
#                             'x':0.5,
#                             'y':-0.10,
#                             'showarrow':False,
#                             'text':"Predicted value",
#                             'xref':"paper",
#                             'yref':"paper"})
    
#     fig.add_annotation({'font':{'color':"black",'size':14},
#                             'x':-0.15,
#                             'y':0.5,
#                             'showarrow':False,
#                             'text':"Real value",
#                             'textangle':-90,
#                             'xref':"paper",
#                             'yref':"paper"})


#     # adjust margins to make room for yaxis title
#     fig.update_layout(margin={'t':50, 'l':20},width=750,height=750)
    


#     # add colorbar
#     fig['data'][0]['showscale'] = True
#     fig.show()



# def hist_score(score):
#     models_names = [
#     'Random Forest Classifier',
#     'SVM']

#     plt.rcParams['figure.figsize']=20,8
#     sns.set_style('darkgrid')
#     ax = sns.barplot(x=models_names, y=score, palette = "inferno", saturation =2.0)
#     plt.xlabel('Classifier Models', fontsize = 20 )
#     plt.ylabel('% of Accuracy', fontsize = 20)
#     plt.title('Accuracy of different Classifier Models on test set', fontsize = 20)
#     plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
#     plt.yticks(fontsize = 12)
#     for i in ax.patches:
#         width, height = i.get_width(), i.get_height()
#         x, y = i.get_xy() 
#         ax.annotate(f'{round(height,2)}%', (x + width/2, y + height*1.02), ha='center', fontsize = 'x-large')
#     plt.show()

# def run_exp_on_feature(x_train,y_train,x_test,y_test):
#     #x_train,x_test,y_train,y_test = train_test_split(features,labels, test_size=0.2, random_state=23)
#     models= [['Random Forest Classifier ',RandomForestClassifier()],
#             ['SVM ',SVC()]]
# # n_estimators=estimators, criterion = criterion, random_state = 42
# # for criterion in criterions:
# #   for estimator in estimators:
# #     models.append(randomforest(critertion=criterion, estimators=estimators)

#     models_score = []
#     for name,model in models:

#         model = model
# #         for i in range(10):
#         model.fit(x_train,y_train)
            
#         model_pred = model.predict(x_test)
#         cm_model = confusion_matrix(y_test, model_pred)
#         print(cm_model)
#         models_score.append( (y_test,model.predict(x_test)))

#         print(name)
#         print('Validation Acuuracy: ',accuracy_score(y_test,model.predict(x_test)))
#         print('Training Accuracy: ',accuracy_score(y_train,model.predict(x_train)))
#         print('############################################')
#         plot_cm(cm_model,title=name+"model")
#         fpr, tpr, thresholds = roc_curve(y_test, model_pred)
        
#     return models_score

# models_score = run_exp_on_feature(x_train,y_train,x_test,y_test)

# hist_score(models_score)

In [None]:
from sklearn.model_selection import train_test_split as tts
import tensorflow as tf
from sklearn.metrics import accuracy_score


In [None]:
#SPLITTING THE DATA INTO TRAINING AND TESTING DATA
# Split the full feature matrix `X`. The original passed lowercase `x`, which
# at this point is just the last single column left over from the correlation
# loop. The split is seeded so the ANN results are reproducible.
x_train,x_test,y_train,y_test_ann=tts(X,y,test_size=0.2,random_state=42)

In [None]:
#CREATING ARTIFICIAL NEURAL NETWORK MODEL[ANN]
# Two hidden ReLU layers of 25 units each, then a single sigmoid output unit.
ann = tf.keras.Sequential([
    tf.keras.layers.Dense(units=25, activation='relu'),
    tf.keras.layers.Dense(units=25, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# Adam optimiser with binary cross-entropy loss; accuracy is tracked during training.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# TRAINING ANN MODEL
# Train for 10 epochs and keep the object returned by `fit` in `result`.
result = ann.fit(x=x_train, y=y_train, epochs=10)

In [None]:
#CREATING CONFUSION MATRIX FOR THE ACTUAL AND PREDICTED VALUE
from sklearn.metrics import confusion_matrix
# Turn the sigmoid outputs into hard 0/1 labels at a 0.5 threshold.
# Every prediction yields exactly one label. The former pair of independent
# `if i>0.5` / `if i<0.5` tests appended nothing for an output of exactly 0.5,
# leaving `y_pred` shorter than `y_test_ann`.
y_pred = [1 if p > 0.5 else 0 for p in np.asarray(ann.predict(x_test)).ravel()]
confusion_matrix(y_test_ann,y_pred)

In [None]:
# ACCURACY SCORE FOR TESTING DATA
# Share of test rows whose thresholded ANN prediction matches the true label.
accuracy = accuracy_score(y_true=y_test_ann, y_pred=y_pred)
print('accuracy: ', accuracy)



In [None]:
# Record the ANN test accuracy next to the scores already collected.
# NOTE(review): appending is not idempotent; re-running this cell adds the
# same score to `models_score` again.
models_score.append(accuracy_score(y_test_ann,y_pred))

def hist_score(score, models_names=None):
    """Bar chart comparing the test accuracy of the trained classifiers.

    Parameters
    ----------
    score : sequence of accuracies as fractions in [0, 1], one per model.
    models_names : optional bar labels, same length as `score`. Defaults to
        the two models this notebook trains: Random Forest and ANN. The SVM
        cell is commented out, so the previous hard-coded three-name list did
        not match the two collected scores.

    Raises
    ------
    ValueError : if the number of names and scores differ.
    """
    if models_names is None:
        models_names = ['Random Forest Classifier', 'ANN']
    score = list(score)
    if len(score) != len(models_names):
        raise ValueError(
            f"got {len(score)} scores for {len(models_names)} model names; "
            "pass a matching `models_names` list"
        )

    # The axis label and bar annotations are in percent, so convert the
    # fractional accuracies before plotting.
    percentages = [100 * s for s in score]

    plt.rcParams['figure.figsize']=20,8
    sns.set_style('darkgrid')
    ax = sns.barplot(x=models_names, y=percentages, palette = "inferno", saturation =2.0)
    plt.xlabel('Classifier Models', fontsize = 20 )
    plt.ylabel('% of Accuracy', fontsize = 20)
    plt.title('Accuracy of different Classifier Models on test set', fontsize = 20)
    plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
    plt.yticks(fontsize = 12)
    # Write each bar's value just above its top edge.
    for bar in ax.patches:
        width, height = bar.get_width(), bar.get_height()
        left, bottom = bar.get_xy()
        ax.annotate(f'{round(height,2)}%', (left + width/2, bottom + height*1.02), ha='center', fontsize = 'x-large')
    plt.show()
    
# Plot the collected test accuracies side by side.
hist_score(models_score)

In [None]:
# from imblearn.over_sampling import SMOTE


# X , y = en_df[features],en_df["stroke"] #en_df_imputed[features],en_df_imputed["stroke"]
# x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=23)
# sm = SMOTE()
# X_res, y_res = sm.fit_resample(x_train,y_train)



# def hist_score(score):
#     models_names = [
#     'Random Forest Classifier',
#     'SVM',
#     'ANN']

#     plt.rcParams['figure.figsize']=20,8
#     sns.set_style('darkgrid')
#     ax = sns.barplot(x=models_names, y=score, palette = "inferno", saturation =2.0)
#     plt.xlabel('Classifier Models', fontsize = 20 )
#     plt.ylabel('% of Accuracy', fontsize = 20)
#     plt.title('Accuracy of different Classifier Models on test set', fontsize = 20)
#     plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
#     plt.yticks(fontsize = 12)
#     for i in ax.patches:
#         width, height = i.get_width(), i.get_height()
#         x, y = i.get_xy() 
#         ax.annotate(f'{round(height,2)}%', (x + width/2, y + height*1.02), ha='center', fontsize = 'x-large')
#     plt.show()
    

# def run_exp_on_feature(x_train,y_train,x_test,y_test):
#     #x_train,x_test,y_train,y_test = train_test_split(features,labels, test_size=0.2, random_state=23)
#     models= [['Random Forest Classifier ',RandomForestClassifier()],
#             ['SVM ',SVC()]]

#     models_score = []
#     for name,model in models:

#         model = model
#         model.fit(x_train,y_train)
#         model_pred = model.predict(x_test)
#         cm_model = confusion_matrix(y_test, model_pred)
#         #print(cm_model)
#         models_score.append(accuracy_score(y_test,model.predict(x_test)))

#         fpr, tpr, thresholds = roc_curve(y_test, model_pred)
        
#     return models_score


# models_score = run_exp_on_feature(x_train,y_train,x_test,y_test)

# models_score.append(accuracy_score(y_test_ann,y_pred))

# hist_score(models_score)