In [None]:
!pip3 install -q jupyterthemes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import os
import sys
import time
from jupyterthemes import jtplot
# Apply the jupyterthemes "monokai" plotting style (notebook context, ticks
# enabled) to the figures drawn later in this notebook.
jtplot.style(context="notebook", theme="monokai", ticks=True)

In [None]:
# Show the kernel's working directory; the relative CSV path used when the
# dataset is loaded is resolved against it.
os.getcwd()

In [None]:
# Load the stroke dataset from its path relative to the working directory
# and report its (rows, columns) shape.
csv_path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)
print(f'Shape of dataset : {df.shape}')

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

In [None]:
def calculate_miss(frame=None, top=10):
    """Print the columns with the highest percentage of missing values.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to inspect. When omitted, the notebook-level ``df`` is used,
        so the original zero-argument call keeps working.
    top : int, default 10
        Number of report rows to print.

    Returns
    -------
    None
        The report is printed rather than returned.
    """
    if frame is None:
        # Backward-compatible fallback to the notebook's global frame.
        frame = df
    df_miss = (
        frame.isna().mean().mul(100)           # share of NaN per column, in percent
        .rename_axis("cols")
        .reset_index(name="missing_percent")
        .sort_values(by=["missing_percent"], ascending=False)
    )
    print(df_miss.head(top))
    
# Report the missing-value percentages of the freshly loaded data.
calculate_miss()

In [None]:
# Fill missing BMI values with a fixed, gender-specific number.
# NOTE(review): the origin of 24.5 / 29.1 is not shown in this notebook, and
# rows whose gender is neither "Male" nor "Female" keep their missing BMI.
for gender_label, bmi_fill in (("Male", 24.5), ("Female", 29.1)):
    needs_fill = df["bmi"].isna() & df["gender"].eq(gender_label)
    df.loc[needs_fill, "bmi"] = bmi_fill

In [None]:
# Re-run the missing-value report after the BMI fill to check what is left.
calculate_miss()

In [None]:
# Distribution of patient age. The histogram is drawn on the age values
# themselves; grouping by age first would instead plot how many distinct
# ages share a given row count.
fig, ax = plt.subplots(figsize=(15, 10))
df["age"].hist(ax=ax)
ax.set_title("Age distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Rows for patients younger than 20.
df.loc[df["age"].lt(20)]

In [None]:
# Number of records for each (work_type, stroke) combination.
df.groupby(["work_type", "stroke"]).agg(counts=("id", "count"))

In [None]:
# Record counts per work type, split by stroke label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="work_type", hue="stroke", ax=ax)
ax.legend(loc="best")
plt.show()

In [None]:
# Number of records for each (gender, stroke) combination.
df.groupby(["gender", "stroke"]).agg(counts=("id", "count"))

In [None]:
# Record counts per gender, split by stroke label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="gender", hue="stroke", ax=ax)
ax.legend(loc="best")
plt.show()

In [None]:
# Number of records for each (heart_disease, stroke) combination.
df.groupby(["heart_disease", "stroke"]).agg(counts=("id", "count"))

In [None]:
# Record counts per heart-disease flag, split by stroke label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="heart_disease", hue="stroke", ax=ax)
ax.legend(loc="best")
plt.show()

In [None]:
# Number of records for each (gender, heart_disease) combination.
df.groupby(["gender", "heart_disease"]).agg(counts=("id", "count"))

In [None]:
# Record counts per heart-disease flag, split by gender.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="heart_disease", hue="gender", ax=ax)
ax.legend(loc="best")
plt.show()

In [None]:
# Number of records for each (hypertension, stroke) combination.
df.groupby(["hypertension", "stroke"]).agg(counts=("id", "count"))

In [None]:
# Record counts per hypertension flag, split by stroke label.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data=df, x="hypertension", hue="stroke", ax=ax)
ax.legend(loc="best")
plt.show()

In [None]:
# Number of records for each (ever_married, stroke) combination.
df.groupby(["ever_married", "stroke"]).agg(counts=("id", "count"))

In [None]:
# Distribution of BMI. The histogram is drawn on the BMI values themselves;
# grouping by BMI first would instead plot how many distinct BMI values
# share a given row count.
fig, ax = plt.subplots(figsize=(15, 10))
df["bmi"].hist(ax=ax)
ax.set_title("BMI distribution")
ax.set_xlabel("BMI")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Number of records per stroke label. No legend is requested: the plot has
# no hue, so there are no labelled artists and a legend call would only
# produce a warning and an empty box.
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x="stroke", data=df, ax=ax)
ax.set_title("Number of records per stroke label")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Pull the three continuous columns out as a plain array, and keep an
# independent copy of the full frame for later inspection.
numeric_columns = ["age", "bmi", "avg_glucose_level"]
X_numeric = df.loc[:, numeric_columns].to_numpy()
df_copy = df.copy()

In [None]:
# Columns that will be one-hot encoded: everything except the continuous
# features and the target. The `id` column is dropped too; it is a record
# identifier, and leaving it in would pass it through the encoding step
# untouched and feed it to the models as an unscaled feature.
df_categorical = df.drop(columns=["id", "age", "bmi", "avg_glucose_level", "stroke"])

# Target vector.
y = df["stroke"].values

In [None]:
# Display the copy of the frame taken before the feature matrices were built.
df_copy

In [None]:
# One-hot encode the listed columns; any other column of df_categorical is
# passed through unchanged by get_dummies.
one_hot_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'hypertension',
    'smoking_status',
    'heart_disease',
]
df_one_hot = pd.get_dummies(df_categorical, columns=one_hot_columns)
df_one_hot

In [None]:
# Encoded categorical features as a plain array.
X_categorical = df_one_hot.to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the continuous features.
# NOTE(review): the scaler is fitted on every row before the train/test
# split below, so test rows contribute to the fitted statistics.
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric = scaler.transform(X_numeric)

In [None]:
# Place the scaled numeric columns and the encoded categorical columns side
# by side to form the full feature matrix.
X = np.hstack((X_numeric, X_categorical))
print(f'Shape of data X : {X.shape}')
print(f'Shape of data Y : {y.shape}')

In [None]:
# Hold out 10% of the rows for testing, stratified on the target so both
# parts keep the same stroke ratio. The split is seeded (with the same seed
# the models use) so that scores are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=1234
)

print("Shape of Training Data  -----")
print(f'Shape of X :{X_train.shape}')
print(f'Shape of Y : {y_train.shape}')

print("Shape of Test Data ------")
print(f'Shape of X : {X_test.shape}')
print(f'Shape of Y : {y_test.shape}')

In [None]:
# Fit a seeded logistic regression and report its score on both splits.
logreg = LogisticRegression(random_state=1234).fit(X_train, y_train)

logreg_train_score = logreg.score(X_train, y_train)
logreg_test_score = logreg.score(X_test, y_test)
print(f'Score on Training data : {logreg_train_score}')
print(f'Score on Test data : {logreg_test_score}')

In [None]:
# Fit a seeded decision tree and report its score on both splits.
dtree = DecisionTreeClassifier(random_state=1234).fit(X_train, y_train)

dtree_train_score = dtree.score(X_train, y_train)
dtree_test_score = dtree.score(X_test, y_test)
print(f'Score on Training data : {dtree_train_score}')
print(f'Score on Test data : {dtree_test_score}')

In [None]:
# Fit a seeded random forest and report its score on both splits.
rf = RandomForestClassifier(random_state=1234).fit(X_train, y_train)

rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(f'Score on Training data : {rf_train_score}')
print(f'Score on Test data : {rf_test_score}')

In [None]:
# Print the test-set confusion matrix of each fitted model, in the order
# the models were trained, each followed by a blank separator.
for model_label, fitted_model in (
    ("Logistic Regression", logreg),
    ("Decision Tree", dtree),
    ("Random Forest", rf),
):
    print(f"-------- Confusion Matrix for {model_label} -------------")
    print(confusion_matrix(y_test, fitted_model.predict(X_test)))
    print("\n")



In [None]:
# Print the test-set classification report of each fitted model, in the
# order the models were trained, each followed by a blank separator.
for model_label, fitted_model in (
    ("Logistic Regression", logreg),
    ("Decision Tree", dtree),
    ("Random Forest", rf),
):
    print(f"----- Classification Report for {model_label} ------")
    print(classification_report(y_test, fitted_model.predict(X_test)))
    print("\n")