In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the stroke-prediction CSV into a DataFrame.
df = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# The 'id' column is not needed for the analysis, so remove it.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a plain drop would raise KeyError on the second execution).
df = df.drop(columns='id', errors='ignore')

In [None]:
# Number of missing values in each column.
df.isna().sum()

# **Exploratory Data Analysis**

In [None]:
# Distribution of gender. The column is named via keywords because current
# seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=df, x='gender')

In [None]:
# Distribution of smoking status. The column is named via keywords because
# current seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=df, x='smoking_status')

In [None]:
# Very few people in the dataset have had a stroke.
# The column is named via keywords because current seaborn treats the first
# positional argument as `data`, not as `x`.
sns.countplot(data=df, x='stroke')

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns at this point, and DataFrame.corr()
# raises on them in pandas 2.x, so restrict the input to numeric dtypes.
plt.figure(figsize=(8,6))
sns.heatmap(df.select_dtypes(include='number').corr(), cmap = 'viridis')

In [None]:
# Distribution of heart disease. The column is named via keywords because
# current seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=df, x='heart_disease')

In [None]:
# Distribution of marital status. The column is named via keywords because
# current seaborn treats the first positional argument as `data`, not as `x`.
sns.countplot(data=df, x='ever_married')

# Data Manipulation 

In [None]:
# Fill missing bmi values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on an intermediate object, which triggers
# chained-assignment warnings and leaves df unchanged under copy-on-write.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

In [None]:
# Frequency of each smoking_status category before cleaning.
df['smoking_status'].value_counts()

In [None]:
# Replace the 'Unknown' smoking status with the most frequent category.
# Assign the result back instead of using replace(inplace=True) on the column
# selection, which warns in recent pandas and is a no-op under copy-on-write.
df['smoking_status'] = df['smoking_status'].replace('Unknown', df['smoking_status'].mode()[0])

In [None]:
# Frequency of each smoking_status category after replacing 'Unknown'.
df['smoking_status'].value_counts()

In [None]:
# Preview the frame after the cleaning steps above.
df.head()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# The columns are encoded in the listed order, so the dummy columns are appended
# in the same sequence as encoding them one call at a time.
categorical_columns = [
    'gender',
    'ever_married',
    'Residence_type',
    'work_type',
    'smoking_status',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

# **ANN Classification Model**

In [None]:
# Feature matrix: every column except the target. Target vector: 'stroke'.
y = df['stroke'].values
X = df.drop(columns='stroke').values

In [None]:
#split data into training and testing
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing.
# stratify=y keeps the (small) share of stroke cases the same in both splits, so
# the test set cannot end up with a disproportionate number of positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [None]:
#Scale and Fit the data
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn the min/max scaling from the training rows only, then apply that same
# scaling to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

In [None]:
# Display the dimensions of the scaled training matrix.
X_train.shape

In [None]:
#ANN MODEL
# Fix TensorFlow's global random seed so that weight initialisation and dropout
# are repeatable when the notebook is re-run from a fresh kernel.
tf.random.set_seed(42)

model = Sequential()

# Three hidden ReLU layers (15 -> 10 -> 5 units), each followed by 10% dropout.
model.add(Dense(units = 15, activation = 'relu'))
model.add(Dropout(.1))
model.add(Dense(units = 10, activation = 'relu'))
model.add(Dropout(.1))
model.add(Dense(units = 5, activation = 'relu'))
model.add(Dropout(.1))
# Single sigmoid unit for the binary stroke / no-stroke output.
model.add(Dense(units = 1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop training once the validation loss has not improved for 10 consecutive
# epochs (patience = 10).
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=1,
)

In [None]:
# Train for up to 800 epochs; the early-stopping callback ends training sooner.
# NOTE(review): the test split is also used as validation data here, so the
# early-stopping decision sees the same rows the model is evaluated on later.
model.fit(
    x=X_train,
    y=y_train,
    epochs=800,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[early_stop],
)

In [None]:
# Collect the per-epoch values recorded during training and plot them.
training_log = model.history.history
model_loss = pd.DataFrame(training_log)
model_loss.plot()

In [None]:
#Predictions
# Sequential.predict_classes() no longer exists in current TensorFlow releases.
# Threshold the sigmoid output at 0.5 to get 0/1 labels instead, and flatten the
# (n, 1) result to a 1-D vector for the sklearn metric functions.
prediction = (model.predict(X_test) > 0.5).astype('int32').ravel()

#  Accuracy Reports


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the per-class metrics for the ANN, a blank line, then the confusion matrix.
ann_report = classification_report(y_test, prediction)
ann_confusion = confusion_matrix(y_test, prediction)
print(ann_report, ann_confusion, sep='\n\n')

In [None]:
# The model was not able to predict any of the stroke cases (the dataset contains too few people who have had a stroke)

# **SVM MODEL**

In [None]:
from sklearn.svm import SVC

In [None]:
# Create a support vector classifier with default settings.
# NOTE(review): this rebinds `model`, so the ANN trained above is no longer
# reachable under that name from here on.
model = SVC()

In [None]:
# Fit the SVM on the scaled training data.
model.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the SVM; stored in `predictions` (plural).
predictions = model.predict(X_test)

In [None]:
# Evaluate the SVM. The SVM output lives in `predictions`; `prediction`
# (singular) holds the earlier ANN output, so using it here would simply
# re-print the ANN report.
print(classification_report(y_test,predictions))
print()
print(confusion_matrix(y_test,predictions))

In [None]:
# Same results as ANN 
# Let's try tuning the SVM model with GridSearch

# **GRID SEARCH TUNING**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the SVM's C and gamma parameters; the grid search
# evaluates every combination.
param_grid = {
    'C': [0.1, 1, 10, 1000],
    'gamma': [1, 0.1, 0.01, 0.001],
}
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, verbose=3)

In [None]:
# Run the grid search on the scaled training data.
grid.fit(X_train, y_train)

In [None]:
# Best parameter combination found within param_grid.
grid.best_params_

In [None]:
# The estimator configured with the best parameters from the search.
grid.best_estimator_

In [None]:
# Predict test-set labels with the grid-tuned SVM; stored in `predictions` (plural).
predictions = grid.predict(X_test)

In [None]:
# Evaluate the grid-tuned SVM. Its output lives in `predictions`; `prediction`
# (singular) holds the earlier ANN output, so using it here would simply
# re-print the ANN report.
print(classification_report(y_test,predictions))
print()
print(confusion_matrix(y_test,predictions))

# **Conclusion**

In [None]:
# Summary
# The models cannot reliably identify stroke cases because the dataset contains
# very few of them; print the share of rows with stroke == 1 as a percentage.
stroke_rows = df['stroke'].value_counts()[1]
stroke_percent = stroke_rows / len(df) * 100
print("People with stroke is {} % of the dataset".format(stroke_percent))

# Let Me Know What You Think!

# Thanks!!!