In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the stroke dataset CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Replace every categorical/object column with integer codes.
# The same encoder instance is re-fitted for each column, so after the loop
# `le` only remembers the classes of the last encoded column.
le = LabelEncoder()
columnsToEncode = df.select_dtypes(include=['category', 'object']).columns.tolist()

for column_name in columnsToEncode:
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [None]:
# Preview the first rows of the encoded frame.
df.head(n=5)

In [None]:
# Remove the `id` column so it does not end up among the feature columns.
df = df.drop(columns=['id'])

In [None]:
# Features are every column except the last; the target is the last column
# (presumably the stroke label -- confirm against the CSV header).
x_s, y_s = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing.
# `stratify=y_s` keeps the class ratio of the target the same in both splits,
# which matters because the classes are imbalanced, and `random_state` makes
# the split repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x_s, y_s, test_size=0.2, random_state=42, stratify=y_s
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Use XGBoost


In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default hyper-parameters, trained on the scaled features.
xgboost = XGBClassifier()
xgboost.fit(X=X_train, y=y_train)

In [None]:
# Predict the labels of the held-out rows.
y_pred = xgboost.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels, columns are the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Use GradientBoosting


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier with default hyper-parameters, trained on the scaled features.
gradientboost = GradientBoostingClassifier()
gradientboost.fit(X=X_train, y=y_train)

In [None]:
# Predict the labels of the held-out rows (overwrites the earlier `y_pred`).
y_pred = gradientboost.predict(X=X_test)

In [None]:
# Rows are the true labels, columns are the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Use PCA to project all attributes onto a single principal component.

In [None]:
from sklearn.decomposition import PCA
# Reduce the feature matrix to one principal component.
# The projection is fitted on the training split only and then reused,
# unchanged, to transform the test split.
pca = PCA(n_components=1)
x_train_pca = pca.fit_transform(X_train)
x_test_pca = pca.transform(X_test)

### XGBoost PCA

In [None]:
# Fresh XGBoost model trained on the single PCA component
# (rebinds `xgboost`, replacing the model fitted on the full feature set).
xgboost = XGBClassifier()
xgboost.fit(X=x_train_pca, y=y_train)

In [None]:
# Evaluate the PCA-based XGBoost model on the projected test split.
y_pred_pca = xgboost.predict(X=x_test_pca)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_pca))

### GradientBoosting PCA

In [None]:
# Fresh gradient-boosting model trained on the single PCA component
# (rebinds `gradientboost`, replacing the model fitted on the full feature set).
gradientboost = GradientBoostingClassifier()
gradientboost.fit(X=x_train_pca, y=y_train)

In [None]:
# Evaluate the PCA-based gradient-boosting model on the projected test split.
y_pred_pca = gradientboost.predict(X=x_test_pca)
print(confusion_matrix(y_true=y_test, y_pred=y_pred_pca))

# TensorFlow DNN

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Small fully connected binary classifier: one hidden ReLU layer of 32 units
# followed by a single sigmoid output unit.
# The input width is read from the training matrix instead of being
# hard-coded, so the model still builds if the feature set changes.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs.
# NOTE(review): the test split doubles as the validation set here, so the
# validation curves are not an independent estimate.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
)

In [None]:
import matplotlib.pyplot as plt

# Pull the per-epoch training/validation curves out of the Keras history.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.figure(figsize=(16,8))

# Left panel: accuracy per epoch.
plt.subplot(1,2,1)
plt.plot(epochs, acc, label='acc')
plt.plot(epochs, val_acc, label='val_acc')
plt.title('Accuracy')
plt.xlabel('epoch')
plt.legend()  # without this call the `label=` values are never shown

# Right panel: loss per epoch.
plt.subplot(1,2,2)
plt.plot(epochs, loss, label='loss')
plt.plot(epochs, val_loss, label='val_loss')
plt.title('Loss')
plt.xlabel('epoch')
plt.legend()

plt.show()


In [None]:
# Sigmoid outputs of the network for the test split.
y_pred = model.predict(x=X_test)

In [None]:
# Turn the sigmoid outputs into boolean class predictions with a 0.5 cut-off.
y_pred = y_pred > 0.5

In [None]:
# Show predictions (first column) next to the true labels (second column).
print(np.column_stack((y_pred, np.array(y_test))))

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn expects confusion_matrix(y_true, y_pred): rows are the true
# labels and columns the predictions. Passing them in the other order
# transposes the matrix and makes it inconsistent with the other models' output.
confusion_matrix(y_test, y_pred)


### All of the results above show errors caused by the imbalanced dataset. To address this, let's try oversampling the minority class with SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class with SMOTE.
# A fixed seed makes the synthetic samples reproducible between runs.
sm = SMOTE(random_state=42)
X_over, y_over = sm.fit_resample(x_s,y_s)

In [None]:
import seaborn as sns

# Class counts after oversampling. `y_over` is passed directly as a vector, so
# no `data=` frame is needed (it is the resampled target, not a column of `df`).
sns.countplot(x=y_over)

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Resampling before the split lets synthetic points interpolated from test rows
# leak into training and makes the test set artificially balanced, which
# inflates the metrics computed on it.
X_train, X_test, y_train, y_test = train_test_split(
    x_s, y_s, test_size=0.2, random_state=42, stratify=y_s
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

### Use XGBoost with SMOTE

In [None]:
# Fresh XGBoost model trained on the current `X_train` / `y_train`.
xgboost = XGBClassifier()
xgboost.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split and print the confusion matrix
# (rows: true labels, columns: predictions).
y_pred = xgboost.predict(X=X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

# Scalar quality metrics for the latest `y_pred`.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
):
    print(metric_label, metric_value)

# Per-class precision/recall/F1 table.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['not 1','1']))

In [None]:
# Pair each feature name with the importance score reported by the fitted
# XGBoost model, as a two-column frame ready for plotting.
feature_names = np.array(x_s.columns)
feature_importance = np.array(xgboost.feature_importances_)

feat_imp = pd.DataFrame(
    dict(feature_names=feature_names, feature_importance=feature_importance)
)

In [None]:
# Horizontal bar chart of the feature importances.
plt.figure(figsize=(10, 8))
sns.barplot(data=feat_imp, x='feature_importance', y='feature_names')

## LightGBM


In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default hyper-parameters, trained on the current training split.
lgbm = LGBMClassifier()
lgbm.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split and print the confusion matrix
# (rows: true labels, columns: predictions).
y_pred = lgbm.predict(X=X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Overall accuracy of the LightGBM predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", accuracy)

In [None]:
# Per-class precision/recall/F1 table for the LightGBM predictions.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=['not 1','1']))