# Import the packages, algorithms and metrics

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, plot_roc_curve, precision_score, recall_score

# Load in data and create dataframe

In [None]:
# Read the stroke dataset into a DataFrame, then report its shape and column names.
df = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
print(df.shape, list(df.columns))

# Checking out and adjusting the data

In [None]:
# Preview the first rows of the data.
# In the 0/1 columns, 0 means "no" and 1 means "yes".
#
# Use the canonical option key. pandas resolves option names with a regex
# search, so the dotted spelling "display.max.columns" only worked because
# "." happens to match the underscore in "display.max_columns".
pd.set_option("display.max_columns", None)
df.head(10)

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# The "bmi" column may not contain nulls, so fill them with the column mean.
#
# Assign the filled column back to the frame instead of calling
# fillna(inplace=True) on df['bmi']: the in-place form mutates an intermediate
# Series (chained assignment), which pandas warns about and which leaves df
# unchanged when copy-on-write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
# Re-count missing values per column after the bmi imputation.
df.isna().sum()

# Visualizing the data

In [None]:
# Bar chart of the number of rows per gender.
df['gender'].value_counts().plot.bar()

In [None]:
# Bar chart of the target distribution (1 = stroke, 0 = no stroke).
df['stroke'].value_counts().plot.bar()

In [None]:
# Bar chart of the number of rows per work type.
df['work_type'].value_counts().plot.bar()

In [None]:
# Keep handles to the three numeric columns used by the histogram cells.
glucose_column, bmi_column, age_column = (
    df[name] for name in ("avg_glucose_level", "bmi", "age")
)

In [None]:
# Histogram of age.
age_column.plot.hist()

In [None]:
# Histogram of average glucose level.
glucose_column.plot.hist()

In [None]:
# Histogram of BMI.
bmi_column.plot.hist()

In [None]:
# Row count for every (gender, stroke) combination.
df.groupby(by=["gender", "stroke"]).size()

In [None]:
# Row count for every (work_type, stroke) combination.
df.groupby(by=["work_type", "stroke"]).size()

In [None]:
# Row count for every (smoking_status, stroke) combination.
df.groupby(by=["smoking_status", "stroke"]).size()

In [None]:
# Row count for every (ever_married, stroke) combination.
df.groupby(by=["ever_married", "stroke"]).size()

# Data Preprocessing (dropping unused columns and encoding categorical columns)

In [None]:
# The "id" column is only a row identifier, so work on a copy without it.
df2 = df.drop(columns='id')
df2.head()

In [None]:
# Convert text columns to numbers for modelling purposes.

# Gender: Male = 1, Female = 0, Other = 2
#
# Map only the "gender" column. A whole-frame replace() would rewrite any cell
# in any column that happens to equal one of these strings and relies on
# pandas silently downcasting the resulting object column to integers.
# Values outside the mapping become NaN, so unexpected labels are visible
# rather than passed through as text.
gender_codes = {'Male': 1, 'Female': 0, 'Other': 2}
df2['gender'] = df2['gender'].map(gender_codes)
df2.head(15)

In [None]:
df2["work_type"].value_counts()

In [None]:
# Work type -> integer category codes.
# Codes follow the sorted category order; per the original author's note this
# yields: Govt job = 0, Never_worked = 1, Private = 2, Self-employed = 3,
# Children = 4.
df2["work_type"] = df2["work_type"].astype("category").cat.codes
df2.head()

In [None]:
df2["smoking_status"].value_counts()

In [None]:
# Smoking status -> integer category codes.
# Codes follow the sorted category order; per the original author's note this
# yields: Unknown = 0, Formerly smoked = 1, Never smoked = 2, Smokes = 3.
df2["smoking_status"] = df2["smoking_status"].astype("category").cat.codes
df2.head(15)

In [None]:
# Marriage status: ever married Yes = 1, No = 0
#
# Map only the "ever_married" column. A whole-frame replace() of 'Yes'/'No'
# would also rewrite any other column containing those strings.
married_codes = {'Yes': 1, 'No': 0}
df2['ever_married'] = df2['ever_married'].map(married_codes)
df2.head(15)

In [None]:
# Residence type: Urban = 1, Rural = 0
#
# Map only the "Residence_type" column instead of replacing across the whole
# frame, so no other column can be affected by the substitution.
residence_codes = {'Urban': 1, 'Rural': 0}
df2['Residence_type'] = df2['Residence_type'].map(residence_codes)
df2.head(15)

# Creating the X and Y variables

In [None]:
# Separate the features from the target: x holds every column except
# "stroke", y holds the "stroke" outcome itself.
x = df2.drop(columns='stroke')
y = df2['stroke']

# Train-Test split

In [None]:
from imblearn.over_sampling import SMOTE

# Split FIRST, then oversample only the training portion.
# Running SMOTE before the split lets synthetic points interpolated from
# test rows end up in the training set (data leakage) and makes the test set
# artificially balanced, so the reported metrics are over-optimistic.
# stratify=y keeps the class ratio the same in both splits, so the test set is
# guaranteed to contain positive cases despite the class imbalance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

# Seed SMOTE so the resampled training set is reproducible between runs.
smote = SMOTE(random_state=0)

# x_oversample / y_oversample are kept as names for any later cell that uses
# them; they now hold the oversampled TRAINING data only.
x_oversample, y_oversample = smote.fit_resample(x_train, y_train)
x_train, y_train = x_oversample, y_oversample

In [None]:
# Logistic regression

In [None]:
#S/O to user: Payton Fisher for help with the code

# Fit a logistic regression and report per-class metrics on the hold-out set.
# The features live on very different scales, which makes the default solver
# prone to stopping at max_iter before converging. Scale the inputs to [0, 1]
# inside a pipeline (so the same scaling is applied at predict time) and allow
# more iterations.
log = Pipeline([
    ('scale', MinMaxScaler()),
    ('model', LogisticRegression(max_iter=1000)),
])
log.fit(x_train, y_train)
y_pred_log = log.predict(x_test)
cr = classification_report(y_test, y_pred_log)
print(cr)

In [None]:
# Hold-out metrics for the logistic regression model.
print('Precision Score: ', round(precision_score(y_test, y_pred_log), 2))
print('Recall Score: ', round(recall_score(y_test, y_pred_log), 2))
print('F1 Score: ', round(f1_score(y_test, y_pred_log), 2))
print('Accuracy Score: ', round(accuracy_score(y_test, y_pred_log), 2))

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the predicted probability of the positive class.
# Passing hard 0/1 predictions collapses the curve to a single threshold.
y_score_log = log.predict_proba(x_test)[:, 1]
print('ROC AUC: ', round(roc_auc_score(y_test, y_score_log), 2))

# Ensemble Model: Gradient Boosting

In [None]:
# Hyper-parameter grid for the gradient boosting search:
#   n_estimators  - how many weak learners are aggregated
#   learning_rate - shrinks the contribution of each individual learner
#                   (how vigorously errors are corrected)
#   subsample     - fraction of rows each learner is fitted on
#   max_depth     - tree depth / complexity of the individual learners
param_grid = {
    'n_estimators': [10, 100, 250],
    'learning_rate': [.01, .1, .25],
    'subsample': [.5, 1],
    'max_depth': [3, 5],
}

# Seed the estimator (subsample < 1 makes it stochastic) so the search is
# reproducible, and use all cores for the 3-fold search over the grid.
grid = GridSearchCV(
    GradientBoostingClassifier(random_state=0), param_grid, cv=3, n_jobs=-1
)

# Fit on the training split only. Searching on the full (x, y) would tune the
# model on rows that are later used for testing and would ignore the
# resampling applied to the training data.
# NOTE(review): x_train is already oversampled, so the CV folds share
# synthetic neighbours; an imblearn Pipeline(SMOTE -> model) inside the grid
# search would give cleaner CV estimates.
grid.fit(x_train, y_train)

# Print parameters, mean, and standard deviation of scores for every candidate.
cv_results = grid.cv_results_
for params, mean_score, std_score in zip(
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    print('\nparams:', params)
    print('mean of accuracies:', mean_score)
    print('std dev of accuracies:', std_score)

# Print the best parameters found by the search.
print('\n***best parameters:', grid.best_params_)
print('best score:', grid.best_score_)

# Store the best estimator (refitted on the whole training split).
best_sgb = grid.best_estimator_

# Evaluate the tuned model on the hold-out set, as for the other models.
y_pred_sgb = best_sgb.predict(x_test)
print(classification_report(y_test, y_pred_sgb))

# Ensemble Model: Random Forest Classifier

In [None]:
# Fit a random forest and report per-class metrics on the hold-out set.
# The forest is seeded (matching the random_state used for the train/test
# split) so the reported numbers are reproducible between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
cr_rf = classification_report(y_test, y_pred_rf)
print(cr_rf)

In [None]:
# Hold-out metrics for the random forest model.
print('Precision Score: ', round(precision_score(y_test, y_pred_rf), 2))
print('Recall Score: ', round(recall_score(y_test, y_pred_rf), 2))
print('F1 Score: ', round(f1_score(y_test, y_pred_rf), 2))
print('Accuracy Score: ', round(accuracy_score(y_test, y_pred_rf), 2))

# ROC AUC must be computed from the predicted probability of the positive
# class; hard 0/1 predictions reduce the curve to a single threshold.
y_score_rf = rf.predict_proba(x_test)[:, 1]
print('ROC AUC: ', round(roc_auc_score(y_test, y_score_rf), 2))

# Let's Start Predicting (Using the logistic regression model)

In [None]:
# Load the new health records that will be scored by the model, report their
# shape and column names, and display the frame.
mydata = pd.read_csv(
    '../input/health-data-for-predictionns/Health_Data_For_Predictions.csv'
)
print(mydata.shape, list(mydata.columns), '\n')
mydata

In [None]:
# Count the missing values in every column of the new data.
mydata.isna().sum()

In [None]:
# The "bmi" column may not contain nulls, so fill them with a mean value.
#
# Two fixes over calling mydata['bmi'].fillna(..., inplace=True):
#  * assign the result back, because the in-place call acts on an intermediate
#    Series (chained assignment) and leaves the frame untouched under
#    copy-on-write;
#  * impute with the TRAINING data's bmi mean (df) so new records are
#    preprocessed with the same statistic the model was trained with; this
#    also works when the new file has few or no bmi values of its own.
mydata['bmi'] = mydata['bmi'].fillna(df['bmi'].mean())

In [None]:
# Re-count missing values per column after the bmi imputation.
mydata.isna().sum()

# Preprocessing the new data (encoding categorical columns)

In [None]:
# Convert text columns to numbers for prediction purposes.

# Gender: Male = 1, Female = 0, Other = 2
#
# Map only the "gender" column. A whole-frame replace() would rewrite any cell
# in any column equal to one of these strings. Labels outside the mapping
# become NaN, so they are visible rather than passed through as text.
gender_codes = {'Male': 1, 'Female': 0, 'Other': 2}
mydata['gender'] = mydata['gender'].map(gender_codes)
mydata.head(15)

In [None]:
# Work type -> integer codes, using the SAME category order as the training
# data (original author's note: Govt job = 0, Never_worked = 1, Private = 2,
# Self-employed = 3, Children = 4).
#
# Deriving the codes from the new data's own categories would renumber them
# whenever a category is missing from the new file (e.g. no "Never_worked"
# rows), silently feeding the model different codes than it was trained on.
# The raw training frame df still holds the original text labels, so its
# categories define the encoding. Labels not seen in training get code -1.
work_type_categories = df["work_type"].astype("category").cat.categories
mydata["work_type"] = pd.Categorical(
    mydata["work_type"], categories=work_type_categories
).codes
mydata.head()

In [None]:
# Smoking status -> integer codes, using the SAME category order as the
# training data (original author's note: Unknown = 0, Formerly smoked = 1,
# Never smoked = 2, Smokes = 3).
#
# Codes built from the new data's own categories shift whenever a category is
# absent from the new file, so the categories are taken from the raw training
# frame df instead. Labels not seen in training get code -1.
smoking_categories = df["smoking_status"].astype("category").cat.categories
mydata["smoking_status"] = pd.Categorical(
    mydata["smoking_status"], categories=smoking_categories
).codes
mydata.head(15)

In [None]:
# Marriage status: ever married Yes = 1, No = 0
#
# Map only the "ever_married" column. A whole-frame replace() of 'Yes'/'No'
# would also rewrite any other column containing those strings.
married_codes = {'Yes': 1, 'No': 0}
mydata['ever_married'] = mydata['ever_married'].map(married_codes)
mydata.head(15)

In [None]:
# Residence type: Urban = 1, Rural = 0
#
# Map only the "Residence_type" column instead of replacing across the whole
# frame, so no other column can be affected by the substitution.
residence_codes = {'Urban': 1, 'Rural': 0}
mydata['Residence_type'] = mydata['Residence_type'].map(residence_codes)
mydata.head(15)

In [None]:
# Build the list of feature columns used for prediction.
#
# Take the names (and their order) from the training feature frame x rather
# than from mydata. If the new file carries extra columns (such as an "id")
# or a different column order, using mydata's own columns would hand the
# model features it was not trained on. Selecting by the training columns
# gives identical results when the schemas already match.
xcols4 = x.columns.to_list()
print(xcols4,'\n')

In [None]:
# Score the new records with the fitted logistic regression model, using only
# the columns listed in xcols4.
new_features = mydata[xcols4]
pred = log.predict(new_features)

In [None]:
# Show the predicted class for each new record
# (1 = stroke predicted, 0 = no stroke predicted).
print(pred)