In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the stroke dataset CSV relative to the notebook's working directory.
STROKE_CSV_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(STROKE_CSV_PATH)

In [None]:
# First look at the data: column dtypes / non-null counts, summary statistics
# and the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print(df.head())

In [None]:
# One-hot encode the categorical columns; each one is replaced by one
# indicator column per category (e.g. "ever_married_No", "work_type_Private").
categorical_columns = [
    "ever_married",
    "smoking_status",
    "Residence_type",
    "work_type",
    "gender",
]
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Count missing values per column (only bmi has missing values).
df.isna().sum()

In [None]:
# Dealing with the missing bmi values: which features indicate/influence bmi?
# Compute the pairwise Pearson correlations and show them as an annotated heatmap.
corr = df.corr(method="pearson")
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, cmap="RdYlGn", ax=ax)

In [None]:
#sns.pairplot(df, hue="bmi")

In [None]:
# Columns whose Pearson correlation with bmi exceeds 0.2 in absolute value
# (bmi itself is included, since its self-correlation is 1).
names = [
    column
    for column, coefficient in corr["bmi"].items()
    if abs(coefficient) > 0.2
]

names

In [None]:
# Histogram of the age distribution, split into 10 bins.
sns.displot(data=df, x="age", bins=10)

In [None]:
# Largest age in the data set.
oldest_age = df["age"].max()
print(oldest_age)

In [None]:
# Create age_class to fill missing bmis more accurately.
# Ages are bucketed into five bands of width 16.4; each band includes its upper
# edge: class 1 = age <= 16.4, class 2 = (16.4, 32.8], class 3 = (32.8, 49.2],
# class 4 = (49.2, 65.6], class 5 = age > 65.6.
# pd.cut assigns every row in one vectorized step and keeps the bands
# non-overlapping (a separate "> 65" check would re-label ages in (65, 65.6]
# from class 4 to class 5).
age_bin_edges = [-np.inf, 16.4, 32.8, 49.2, 65.6, np.inf]
age_bin_labels = [1, 2, 3, 4, 5]
# Cast the categorical result to float so the column is plain numeric
# (rows with a missing age stay NaN).
df["age_class"] = pd.cut(df["age"], bins=age_bin_edges, labels=age_bin_labels).astype(float)

In [None]:
# Preprocessing: separate the target from the features and hold out 30% of the
# rows as a test set.
y = df["stroke"]
# "id" is dropped together with the target: it is presumably just a record
# identifier (verify against the data description) and would otherwise be fed
# to every model as if it were a predictive feature.
X = df.drop(columns=["stroke", "id"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=46)
# Scaler instance used later to standardize the features.
scaler = StandardScaler()


In [None]:
#change class balance by oversampling
#validation/train split (currently disabled)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
#                                                 test_size = 0.2,random_state=22)

In [None]:
# Build the per-group median tables only after the train/test split, so that
# each table is computed from its own split (prevents leakage of information).
# After reset_index() the grouping keys are ordinary columns again.
bmi_group_keys = [
    "age_class",
    "ever_married_No",
    "smoking_status_Unknown",
    "work_type_Private",
    "work_type_children",
]

median_df_train = X_train.groupby(bmi_group_keys).median().reset_index()

median_df_test = X_test.groupby(bmi_group_keys).median().reset_index()

#median_df_val = X_val.groupby(["age_class", "ever_married_No", "smoking_status_Unknown", "work_type_Private","work_type_children"]).median()
#median_df_val  = median_df_val.reset_index()


In [None]:
def get_value(row, df_median):
    """Return the group-median bmi from ``df_median`` that matches ``row``.

    Parameters
    ----------
    row : pandas.Series
        One record; must provide "age_class", "ever_married_No",
        "smoking_status_Unknown", "work_type_Private" and "work_type_children".
    df_median : pandas.DataFrame
        Table of group medians with the five grouping columns above as regular
        columns plus a "bmi" column.

    Returns
    -------
    float
        The "bmi" value of the first row in ``df_median`` whose grouping
        columns all equal those of ``row``. If no such row exists, or its bmi
        is NaN, the median of all "bmi" values in ``df_median`` is returned.
    """
    group_columns = (
        "age_class",
        "ever_married_No",
        "smoking_status_Unknown",
        "work_type_Private",
        "work_type_children",
    )
    # Look the group up in the table that was passed in, not in a global.
    matches = pd.Series(True, index=df_median.index)
    for column in group_columns:
        matches &= df_median[column] == row[column]

    candidates = df_median.loc[matches, "bmi"].dropna()
    if candidates.empty:
        # Unseen group (or a group without any known bmi): fall back to the
        # overall median instead of raising IndexError / returning NaN.
        return df_median["bmi"].median()
    return candidates.iloc[0]

def fill_bmi(df, df_median):
    """Return the "bmi" column of ``df`` as a list with missing values filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "bmi" column plus the columns ``get_value`` needs.
    df_median : pandas.DataFrame
        Group-median table, passed through to ``get_value``.

    Returns
    -------
    list
        One value per row of ``df``, in row order: the row's own bmi when it is
        present, otherwise ``get_value(row, df_median)``.
    """
    # Build the result without assigning into the Series yielded by iterrows():
    # that object may be a view of the frame, so writing to it is not safe.
    return [
        get_value(row, df_median) if np.isnan(row["bmi"]) else row["bmi"]
        for _, row in df.iterrows()
    ]

In [None]:
# Fill the missing bmi values in both splits.
# Work on copies so the assignments below do not write into slices of df.
X_train = X_train.copy()
X_test = X_test.copy()
# Both splits are imputed from the medians of the *training* split: the test
# set must be transformed only with statistics learned from the training data.
X_train.loc[:,"bmi"] = fill_bmi(X_train, median_df_train)
X_test.loc[:,"bmi"] = fill_bmi(X_test, median_df_train)
#X_val.loc[:,"bmi"] = fill_bmi(X_val, median_df_val)

In [None]:
# age_class was only a helper for the bmi imputation; remove it from the features.
X_train = X_train.drop(columns="age_class")
X_test = X_test.drop(columns="age_class")
#X_val = X_val.drop("age_class", axis = 1)

In [None]:
# Check for multicollinearity with the variance inflation factor (VIF).
vif_data = pd.DataFrame()
# Add an intercept column, then cast everything to float: with mixed
# bool/int/float columns `.values` would be an object array, which the
# regression inside variance_inflation_factor may not accept.
X_temp = sm.add_constant(X_train).astype(float)
vif_data["feature"] = X_temp.columns

# calculating VIF for each feature (column i regressed on all other columns)
vif_data["VIF"] = [variance_inflation_factor(X_temp.values, i)
                          for i in range(len(X_temp.columns))]

print(vif_data)
#-> low multicollinearity -> try Logistic Regression

In [None]:
# Number of records per target class, stroke (1) first, then no stroke (0).
# -> the target is imbalanced
for stroke_label in (1, 0):
    print(df.loc[df["stroke"] == stroke_label, "id"].count())

In [None]:
# Standardize features to improve performance.
# The scaler is fitted on the training split only; the test split is
# transformed with those same training means/variances. Re-fitting on the test
# split would scale the two splits differently and use test-set statistics.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Use SMOTE to oversample class 1 in the training data.
# The sampler is named `smote` rather than `sm` so it does not overwrite the
# `statsmodels.api as sm` import that the VIF cell relies on.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Logistic regression trained on the SMOTE-oversampled training data.
# Class 1 (positive) is stroke, class 0 (negative) is no stroke.
log_model_smote = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=44,
)
log_model_smote.fit(x_train_smote, y_train_smote)

In [None]:
# train results
log_smote_pred_train = log_model_smote.predict(x_train_smote)
print(classification_report(y_train_smote,log_smote_pred_train))


# test results
log_smote_pred_test = log_model_smote.predict(X_test)
print(classification_report(y_test,log_smote_pred_test))
print(confusion_matrix(y_test, log_smote_pred_test))

# ROC curve / AUC: roc_curve needs continuous scores to sweep the decision
# threshold, so use the predicted probability of class 1 (column 1) rather than
# the hard 0/1 predictions, which would give a single operating point.
log_smote_proba_test = log_model_smote.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, log_smote_proba_test)
print(metrics.auc(fpr, tpr))
pyplot.plot(fpr, tpr, marker='.', label='log')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# Logistic regression on the original (not oversampled) training data, using
# class_weight="balanced" to compensate for the imbalanced target.
log_model = LogisticRegression(
    solver='liblinear',
    class_weight="balanced",
    max_iter=10,
    random_state=44,
)
log_model.fit(X_train, y_train)

In [None]:
# train results
log_pred_train = log_model.predict(X_train)
print(classification_report(y_train,log_pred_train))


# test results
log_pred_test = log_model.predict(X_test)
print(classification_report(y_test,log_pred_test))
print(confusion_matrix(y_test, log_pred_test))

# ROC curve / AUC: roc_curve needs continuous scores to sweep the decision
# threshold, so use the predicted probability of class 1 (column 1) rather than
# the hard 0/1 predictions, which would give a single operating point.
log_proba_test = log_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, log_proba_test)
print(metrics.auc(fpr, tpr))
pyplot.plot(fpr, tpr, marker='.', label='log')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# k-nearest-neighbours classifier (k = 3) trained on the SMOTE-oversampled data
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train_smote, y_train_smote)

# train results
knn_pred_train = knn_model.predict(x_train_smote)
print(classification_report(y_train_smote,knn_pred_train))


# test results
knn_pred_test = knn_model.predict(X_test)
print(classification_report(y_test,knn_pred_test))

# ROC curve / AUC: roc_curve needs continuous scores to sweep the decision
# threshold, so use the predicted probability of class 1 (column 1) rather than
# the hard 0/1 predictions, which would give a single operating point.
knn_proba_test = knn_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, knn_proba_test)
print(metrics.auc(fpr, tpr))
# label the curve with the model it belongs to
pyplot.plot(fpr, tpr, marker='.', label='knn')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# Random forest on the original (not oversampled) training data; the class
# imbalance is handled with class_weight='balanced_subsample'.
rndf_model = RandomForestClassifier(max_depth = 5, random_state = 45, class_weight='balanced_subsample')
rndf_model.fit(X_train, y_train)

# train results
rndf_pred_train = rndf_model.predict(X_train)
print(classification_report(y_train,rndf_pred_train))


# test results
rndf_pred_test = rndf_model.predict(X_test)
print(classification_report(y_test,rndf_pred_test))
print(confusion_matrix(y_test, rndf_pred_test))
print(y_test.value_counts())

# ROC curve / AUC: roc_curve needs continuous scores to sweep the decision
# threshold, so use the predicted probability of class 1 (column 1) rather than
# the hard 0/1 predictions, which would give a single operating point.
rndf_proba_test = rndf_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, rndf_proba_test)
print(metrics.auc(fpr, tpr))
# label the curve with the model it belongs to
pyplot.plot(fpr, tpr, marker='.', label='random forest')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

In [None]:
# Create Pipe for Tuning of LogReg and RandomForest 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyper-parameter search for the logistic regression, optimizing recall with
# 5-fold cross-validation on the training split.
logReg = LogisticRegression(random_state = 42)
#rndf = RandomForestClassifier()
#pipe = Pipeline([('logReg', logReg),('rndf', RandomForestClassifier())])
param_grid = [
    {'penalty' : ['l1', 'l2'],
    # regularization strength is the parameter that actually changes the fit
    'C' : [0.01, 0.1, 1, 10],
    'class_weight' : ['balanced', {0:0.1, 1:0.8}, {0:0.1, 1:0.9}, {0:0.1, 1:0.3}],
    'solver' : ['liblinear'],
    # max_iter is only an upper bound on solver iterations: one generous cap
    # replaces the 100-value sweep, which multiplied the number of fits by 100
    # for candidates that differed only in that cap.
    'max_iter' : [1000]}]
    #'rndf__n_estimators' : list(range(10,101,10)),
    #'rndf__max_features' : list(range(6,10,5))}]

grid_search_log = GridSearchCV(logReg, param_grid=param_grid, cv = 5, verbose=True, scoring = 'recall')
# fit() returns the fitted search object itself
best = grid_search_log.fit(X_train, y_train)
# report the winning parameter combination and its mean cross-validated recall
print(best.best_params_)
print(best.best_score_)

In [None]:
# Evaluate the best estimator found by the grid search (predict / predict_proba
# on the search object delegate to the refitted best estimator).
best_logReg_train = grid_search_log.predict(X_train)
best_logReg_test = grid_search_log.predict(X_test)

print(classification_report(y_train , best_logReg_train))
print(classification_report(y_test,best_logReg_test))
print(confusion_matrix(y_test, best_logReg_test))
print(y_test.value_counts())

# ROC curve / AUC: roc_curve needs continuous scores to sweep the decision
# threshold, so use the predicted probability of class 1 (column 1) rather than
# the hard 0/1 predictions, which would give a single operating point.
best_logReg_proba_test = grid_search_log.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, best_logReg_proba_test)
print(metrics.auc(fpr, tpr))
pyplot.plot(fpr, tpr, marker='.', label='log')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()

 **Feel free to leave comments! All feedback is greatly appreciated!**