<a href="https://colab.research.google.com/github/esragcetnky/Stroke-Prediction/blob/main/Stroke_Prediction_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.Import Libraries

In [None]:
# at first these 3 libraries are enough,
# when trying to build model, i will add other necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw stroke dataset.
# NOTE(review): the relative path looks like a Kaggle input directory —
# adjust it when running the notebook elsewhere.
data = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

# 2.Analyzing and Understanding Dataset

* Let's load the data into a DataFrame.

In [None]:
# Working frame used by every later cell; built from the freshly read `data`.
dataframe = pd.DataFrame(data=data)

In [None]:
# (number of rows, number of columns) of the loaded data
dataframe.shape

* So we have 12 columns and 5110 samples in our dataset.
* One of the 12 columns is the label.

Let's see the names of features

In [None]:
# column labels of the loaded data
dataframe.columns

In [None]:
# Peek at the first five rows to get a feel for the columns and their values.
dataframe.head(n=5)

# 3.Visualize Dataset

* Visualizing the data is really helpful.
* It gives a better understanding of the relations between features.


Matplotlib library will be used for visualization

First, I want to check the distributions.

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so pick whichever one this
# installation provides instead of hard-coding the legacy name.
_darkgrid_style = ('seaborn-v0_8-darkgrid'
                   if 'seaborn-v0_8-darkgrid' in plt.style.available
                   else 'seaborn-darkgrid')
plt.style.use(_darkgrid_style)

## *Histogram Plot*

In [None]:
# Three side-by-side histograms of the continuous features.
fig, (ax0, ax1, ax2) = plt.subplots(nrows=1,
                                    ncols=3,
                                    figsize=(30, 10))

# AGE DISTRIBUTION
ax0.hist(dataframe['age'],
         bins=15,
         color='blue')
# The count axis starts at zero: a truncated axis exaggerates the differences
# between bins and hides any bin that falls below the lower limit.
ax0.set_ylim([0, 500])
ax0.set_xlim([0, 80])
ax0.set_xlabel('Age')
ax0.set_ylabel('Count')
ax0.set_title('Age distribution')

# AVG GLUCOSE LEVEL DISTRIBUTION
ax1.hist(dataframe['avg_glucose_level'],
         bins=25)
ax1.set_xlabel('Average Glucose Level')
ax1.set_ylabel('Count')
ax1.set_xlim([50, 250])
ax1.set_title('Average Glucose Level distribution')

# BMI DISTRIBUTION
# Missing values are dropped explicitly so the binning never depends on how
# the plotting backend treats NaN.
ax2.hist(dataframe['bmi'].dropna(),
         bins=25,
         color='green')
ax2.set_xlabel('Bmi')
ax2.set_ylabel('Count')
ax2.set_ylim([0, 1000])
ax2.set_xlim([10, 60])
ax2.set_title('Bmi distribution')

# plt.show() renders on notebook (non-GUI) backends, where Figure.show()
# only emits a warning.
plt.show()

## *Pie Chart*

In [None]:
# 2x2 grid of pie charts, one per categorical feature.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(nrows=2,
                                             ncols=2,
                                             figsize=(15, 15))


def label_function(val):
    """Build the text shown on a pie wedge.

    `val` is the wedge's share in percent, as supplied through `autopct`.
    The label has the absolute sample count (share scaled by the current
    number of rows in `dataframe`) on the first line and the rounded
    percentage on the second.
    """
    return f'{val / 100 * len(dataframe):.0f}\n{val:.0f}%'


# (axes, column to group by, y-axis caption, optional wedge colours)
pie_specs = [
    (ax0, 'ever_married', 'Ever Married', None),
    (ax1, 'gender', 'Per gender', ['violet', 'lime', 'tomato']),
    (ax2, 'smoking_status', 'Smoking Status', ['tomato', 'gold', 'violet', 'lime']),
    (ax3, 'Residence_type', 'Residence Type', None),
]

for pie_ax, column, caption, wedge_colors in pie_specs:
    # Only pass `colors` when a palette was chosen, so the other charts keep
    # the default colour cycle.
    color_kwargs = {'colors': wedge_colors} if wedge_colors else {}
    dataframe.groupby(column).size().plot(kind='pie',
                                          autopct=label_function,
                                          textprops={'fontsize': 15},
                                          ax=pie_ax,
                                          **color_kwargs)
    pie_ax.set_ylabel(caption, size=15)

# plt.show() renders on notebook (non-GUI) backends, where Figure.show()
# only emits a warning.
plt.show()

# 4.Split and Formatting Data

## *Creating dummy variables*

* A model cannot be trained on string values, which is why I am creating dummy variables.

In [None]:
# creating dummy variables
def dummy_creation(dataset, dummy_categories):
  """Replace categorical columns with one-hot indicator columns.

  For every column name in `dummy_categories`, the column is removed and one
  0/1 indicator column per distinct value is appended at the end of the
  frame. Indicator columns are named after the category value itself (no
  prefix). The input frame is not modified.

  Args:
    dataset: pandas DataFrame containing the columns to encode.
    dummy_categories: iterable of column names to encode, processed in order.

  Returns:
    A DataFrame with the remaining original columns followed by the
    indicator columns.
  """
  for column in dummy_categories:
    # Pin an integer dtype: pandas >= 2.0 defaults to bool indicators, and a
    # frame mixing bool with float columns converts to an object array that
    # cannot be fed to a neural network. uint8 was the pre-2.0 default.
    indicators = pd.get_dummies(dataset[column], dtype=np.uint8)
    dataset = pd.concat([dataset.drop(columns=column), indicators],
                        axis=1)
  return dataset

In [None]:
# One-hot encode every string-valued feature.
dataframe = dummy_creation(
    dataframe,
    [
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ],
)

## *Dropping useless columns*


* I don't need the id column, so I dropped it.

In [None]:
# The id column is not used as a feature.
dataframe = dataframe.drop('id', axis=1)

In [None]:
# show the frame after encoding and dropping the id column
dataframe

* In the gender feature there is one sample with the value 'Other'. I don't think I need it, so I am going to delete this sample.

In [None]:
# Rows whose gender indicator 'Other' is set.
dataframe.loc[dataframe['Other'] == 1]

In [None]:
# Remove the 'Other'-gender sample(s) by value instead of by a hard-coded
# index label: this keeps working if the row order or index changes and does
# not raise KeyError when the cell is run a second time.
dataframe = dataframe[dataframe['Other'] != 1]

* now let's check

In [None]:
# Should now be empty: no row has the 'Other' indicator set.
dataframe.loc[dataframe['Other'] == 1]

* Perfect, now I can drop the 'Other' column; I don't need it.

In [None]:
# The 'Other' indicator is no longer needed once its rows are gone.
dataframe = dataframe.drop('Other', axis=1)

In [None]:
# First five rows after the clean-up above.
dataframe.head(n=5)

## *Checking for null values*

* Before splitting the data, I want to check whether there are null values.

In [None]:
# (is any value missing?, total number of missing values)
dataframe.isna().values.any(), dataframe.isna().sum().sum()

* It looks like there are 201 null values in the dataset; let's see which columns have null values.

In [None]:
# Missing values per column.
count_nan = dataframe.isna().sum()
print(count_nan)

In [None]:
# Number of missing values in the bmi column.
dataframe['bmi'].isna().sum()

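* I am dropping the rows with NaN values. One sample was already removed above, so 5109 - 201 = 4908 samples should remain (check against the shape below).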

In [None]:
# Keep only the rows that have a bmi value.
dataframe = dataframe.dropna(subset=['bmi'])

In [None]:
# Should be False once the bmi gaps are removed.
dataframe.isna().values.any()

In [None]:
# (rows, columns) remaining after dropping the rows with missing bmi
dataframe.shape

## *Heatmap*

* I want to check the correlation between features using a heatmap.

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(dataframe.corr(),
            annot=True,
            linewidths=.5,  # seaborn's documented keyword for the cell borders
            ax=ax)
# plt.show() renders on notebook (non-GUI) backends, where Figure.show()
# only emits a warning.
plt.show()

## Splitting data as train and test dataset

* First, I will separate the target from the features.

In [None]:
# Target vector: the stroke label.
y = dataframe.loc[:, 'stroke']

In [None]:
# number of labels (one per remaining sample)
y.shape

In [None]:
# Labels of the positive (stroke == 1) samples.
y.loc[y == 1]

In [None]:
# Feature matrix: every column except the label.
x = dataframe.drop('stroke', axis=1)

In [None]:
# feature names and (rows, columns) of the feature matrix
x.columns, x.shape

* I am going to split the dataset using 'train_test_split' from sklearn.

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed. Stratifying on the label keeps the ratio of
# stroke to non-stroke samples the same in both parts, which matters when the
# positive class is a small minority (see the `y[y==1]` cell above).
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123,
                                                    stratify=y)

In [None]:
# shapes of the train/test features and labels
x_train.shape, x_test.shape, y_train.shape, y_test.shape

# 5.Build Model

## *Import libraries*

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout,Softmax,MaxPool2D

## *Creating models*

In [None]:
# empty Sequential model; the layers are appended in the following cells
model=Sequential()

## *Adding Layers*

In [None]:
# First hidden layer; its input width equals the number of feature columns.
input_size = x_train.shape[1]
first_hidden = Dense(units=50,
                     activation='relu',
                     input_shape=(input_size,))
model.add(first_hidden)

In [None]:
# Dropout with rate 0.2 after the first hidden layer.
model.add(Dropout(rate=0.2))

In [None]:
# Second hidden layer: 50 ReLU units.
model.add(Dense(50, activation='relu'))

In [None]:
# Dropout with rate 0.2 after the second hidden layer.
model.add(Dropout(rate=0.2))

In [None]:
# Third hidden layer: 50 ReLU units.
model.add(Dense(50, activation='relu'))

In [None]:
# Output layer: a single sigmoid unit for the binary stroke label.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# print the layer stack with output shapes and parameter counts
model.summary()

## *Compile Model*

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
# `metrics` is passed as a list, as documented for Model.compile; a bare
# string is rejected by newer Keras versions.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

## *Train Model*

In [None]:
# Train on the training split: 50 epochs, mini-batches of 16 samples.
model.fit(x=x_train,
          y=y_train,
          epochs=50,
          batch_size=16)

## *Test Model*

In [None]:
# Predicted values for the test split, flattened to a 1-D array.
y_pred = model.predict(x_test).reshape(-1)

In [None]:
# Round each prediction to the nearest integer to get hard class labels.
y_pred = y_pred.round()

In [None]:
# Loss and accuracy on the held-out test split.
loss_value, accuracy_value = model.evaluate(x=x_test, y=y_test)

In [None]:
# Report the test-set metrics, one per line.
print(f'Accuracy of the model: {accuracy_value}',
      f'Loss of the model: {loss_value}',
      sep='\n')

In [None]:
from sklearn.metrics import confusion_matrix , classification_report

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cf)

## *Saving Model*

In [None]:
# Persist the trained model.
# NOTE(review): the target looks like a mounted Google Drive folder in Colab;
# nothing in this notebook mounts it — confirm the drive is mounted first.
# NOTE(review): newer Keras versions reportedly require a '.keras' or '.h5'
# extension on the save path — confirm against the installed version.
model.save('/content/drive/My Drive/Stroke_Prediction')