# Predicting If Someone Will have a Stroke

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook I have built a machine learning model that is able to predict who will have a stroke in the future based on some features. 

### Library imports:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as math
from scipy import stats
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, fbeta_score

from imblearn.over_sampling import SMOTE
np.random.seed(11)

In [None]:
df_ = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',index_col=0)

In [None]:
df_.head()

### All the people who had strokes are at the top of the dataset, so I will reshuffle the data:

In [None]:
# Draw every row once in random order (so the stroke cases are no longer
# grouped at the top) and renumber the rows from 0.
df = df_.sample(frac=1).reset_index(drop=True)
df.head()

In [None]:
df.info()

### Some missing data in the BMI column

In [None]:
def clean_df(path='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'):
    """Load the stroke dataset, shuffle its rows and fill missing BMI values.

    Parameters
    ----------
    path : str
        CSV file to read; its first column is used as the index and then
        discarded when the rows are renumbered. Defaults to the Kaggle input
        location used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The rows in random order, indexed 0..n-1, with every missing ``bmi``
        replaced by the column mean rounded to one decimal place.
    """
    raw = pd.read_csv(path, index_col=0)
    shuffled = raw.sample(n=len(raw)).reset_index(drop=True)
    # Compute the fill value once, from the observed (non-missing) BMI values,
    # rather than once per missing row.
    bmi_fill = round(shuffled["bmi"].mean(), 1)
    shuffled["bmi"] = shuffled["bmi"].fillna(bmi_fill)
    return shuffled

## EDA

- check for distribution of classes for all labels

In [None]:
df = clean_df()
df.head()

In [None]:
sns.pairplot(df, hue="stroke", corner=True, kind="hist", palette="coolwarm")

- We can see that as age increases so does the frequency of strokes; the other features appear to show the same trend.

In [None]:
sns.pairplot(df, hue="stroke", corner=True, kind="reg", palette="coolwarm")

- regression plot shows how a linear line would be fit to the data.
- A low BMI combined with a high age seems to have a high correlation with strokes, and age vs BMI is where the stroke and non-stroke regression lines differ most in gradient.
- could create new feature of age and bmi combined?

In [None]:
# Pairwise Pearson correlation of the numeric columns. The text columns
# (gender, work_type, ...) are excluded explicitly: recent pandas releases
# raise on non-numeric data instead of silently dropping it.
df_corr = df.select_dtypes(include="number").corr()
df_corr

In [None]:
# Hide the upper triangle (diagonal included) so every pair is drawn only once.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(7, 5))
    ax = sns.heatmap(
        df_corr,
        mask=mask,
        vmax=.3,
        vmin=0,
        square=True,
        cmap="coolwarm",
        annot=True,
    )

In [None]:
# One-row heat strip: correlation of every feature with the target.
# The row is looked up by name and the tick positions are derived from the
# number of features, so the plot stays correct if columns are added or reordered.
stroke_corr = df_corr.loc["stroke"].drop("stroke")
n_features = len(stroke_corr)
corr = stroke_corr.to_numpy().reshape(1,-1)
plt.figure(figsize=(10,1))
_=plt.imshow(corr, cmap="coolwarm", aspect="auto", extent=[0.5,n_features+0.5,0,1])
_=plt.xticks(ticks=list(range(1,n_features+1)), labels=list(stroke_corr.index))
_=plt.yticks(ticks=[0.5], labels=["Stroke Correlation"])
_=plt.title("Feature Correlation with Strokes")

- looking at the bottom row we can see what features have the largest positive correlation with having a stroke

### i want to check the counts of the binary data and plot it with stroke and non-stroke

In [None]:
x=["No Hypertension","Yes Hypertension"]
_=plt.title("Stokes compared with Hypertension:")
# value_counts() orders by frequency, not by category, so pin the counts to
# the 0/1 order of the labels in x (a missing category counts as 0).
no_stroke_counts = df[df["stroke"]==0]["hypertension"].value_counts().reindex([0, 1], fill_value=0)
stroke_counts = df[df["stroke"]==1]["hypertension"].value_counts().reindex([0, 1], fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
strokes = df[df["hypertension"]==0]["stroke"].value_counts()
print("The percentage of (hypertension = no) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["hypertension"]==1]["stroke"].value_counts()
print("The percentage of (hypertension = yes) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

In [None]:
df["heart_disease"].value_counts()

In [None]:
x=["No Heart Disease","Yes Heart Disease"]
_=plt.title("Stokes compared with Heart Disease:")
# value_counts() orders by frequency, not by category, so pin the counts to
# the 0/1 order of the labels in x (a missing category counts as 0).
no_stroke_counts = df[df["stroke"]==0]["heart_disease"].value_counts().reindex([0, 1], fill_value=0)
stroke_counts = df[df["stroke"]==1]["heart_disease"].value_counts().reindex([0, 1], fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
strokes = df[df["heart_disease"]==0]["stroke"].value_counts()
print("The percentage of (heart disease = no) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["heart_disease"]==1]["stroke"].value_counts()
print("The percentage of (heart disease = yes) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

## Pair Plots to check Clusters and Groupings

### Age vs BMI:

In [None]:
sns.jointplot(x="age", y="bmi", data=df, hue="stroke", palette="coolwarm", kind="kde")

- **could make a new feature if someone is over 60 and has a BMI between 22 and 38**.

In [None]:
df["stroke"].value_counts()

- 4% have a stroke in the data

In [None]:
df[(df["age"] >= 60) & (df["bmi"] > 22) & (df["bmi"] < 38)]["stroke"].value_counts()

In [None]:
# Stroke rate (%) among people aged 60+ whose BMI lies strictly between 22 and 38.
age_bmi_counts = df[(df["age"] >= 60) & (df["bmi"] > 22) & (df["bmi"] < 38)]["stroke"].value_counts()
100*age_bmi_counts[1]/sum(age_bmi_counts)

- **13.6% have a stroke in this category! with 22% of people falling into this group.**

### Age vs Heart Disease:

In [None]:
sns.jointplot(x="age", y="heart_disease", data=df, hue="stroke", palette="coolwarm", kind="kde")

- **new feature could be to make over 70 and no heart disease?**

In [None]:
df[(df["age"] >= 70) & (df["heart_disease"] ==0)]["stroke"].value_counts()

In [None]:
# Stroke rate (%) among people aged 70+ without heart disease.
age_heart_counts = df[(df["age"] >= 70) & (df["heart_disease"] ==0)]["stroke"].value_counts()
100*age_heart_counts[1]/sum(age_heart_counts)

- **17.5% of people over 70 with no heart disease had a stroke! However, only 12% of people fall into this category**.

## Investigate Text Data:

In [None]:
df.head()

### Gender:

In [None]:
df["gender"].value_counts()

In [None]:
x=["Female","Male"]
_=plt.title("Gender and Strokes")
# Reindexing by the labels in x ties each bar to its own category (value_counts()
# orders by frequency) and leaves the "Other" category out of the chart.
no_stroke_counts = df[df["stroke"]==0]["gender"].value_counts().reindex(x, fill_value=0)
stroke_counts = df[df["stroke"]==1]["gender"].value_counts().reindex(x, fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
male_strokes = df[df["gender"]=="Male"]["stroke"].value_counts()
print("The percentage of males with strokes is {:.1f}%".format(100*male_strokes[1]/sum(male_strokes)))
female_strokes =df[df["gender"]=="Female"]["stroke"].value_counts()
print("The percentage of females with strokes is {:.1f}%".format(100*female_strokes[1]/sum(female_strokes)))

- has far more females than males, and one other.
- **males and females seem to have very little difference in terms of strokes.**

### Ever Married

In [None]:
df["ever_married"].value_counts()

In [None]:
x=["Yes","No"]
_=plt.title("Marrige and Strokes")
# Reindexing by the labels in x ties each bar to its own category instead of
# relying on value_counts() happening to return "Yes" first.
no_stroke_counts = df[df["stroke"]==0]["ever_married"].value_counts().reindex(x, fill_value=0)
stroke_counts = df[df["stroke"]==1]["ever_married"].value_counts().reindex(x, fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
married_strokes = df[df["ever_married"]=="Yes"]["stroke"].value_counts()
print("The percentage of Married with strokes is {:.1f}%".format(100*married_strokes[1]/sum(married_strokes)))
not_married_strokes = df[df["ever_married"]=="No"]["stroke"].value_counts()
print("The percentage of Not Married with strokes is {:.1f}%".format(100*not_married_strokes[1]/sum(not_married_strokes)))

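- marriage seems to be associated with more strokes (this is an association, not evidence that marriage causes strokes; it may simply reflect that married people tend to be older).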

### Work Type

In [None]:
x=["Gov Job", "Private", "Self-employed", "Children"]
# Category values in the same order as the display labels in x.
# "Never_worked" is intentionally not plotted.
work_types = ["Govt_job", "Private", "Self-employed", "children"]
plt.figure(figsize=(10,5))
_=plt.title("Strokes for each Work Type")
# Reindex so both series always have exactly one value per label, even when a
# category has no rows in one of the two groups.
no_stroke_counts = df[df["stroke"]==0]["work_type"].value_counts().reindex(work_types, fill_value=0)
stroke_counts = df[df["stroke"]==1]["work_type"].value_counts().reindex(work_types, fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
private_strokes = df[df["work_type"]=="Private"]["stroke"].value_counts()
print("The percentage of (work type = Private) with strokes is {:.1f}%".format(100*private_strokes[1]/sum(private_strokes)))
chil_strokes = df[df["work_type"]=="children"]["stroke"].value_counts()
print("The percentage of (work type = Children) with strokes is {:.1f}%".format(100*chil_strokes[1]/sum(chil_strokes)))
self_strokes = df[df["work_type"]=="Self-employed"]["stroke"].value_counts()
print("The percentage of (work type = self employed) with strokes is {:.1f}%".format(100*self_strokes[1]/sum(self_strokes)))
gov_strokes = df[df["work_type"]=="Govt_job"]["stroke"].value_counts()
print("The percentage of (work type = Govt_job) with strokes is {:.1f}%".format(100*gov_strokes[1]/sum(gov_strokes)))

**The below looks at low stress jobs and high stress jobs**

In [None]:
# "High pressure" = every work type except children / never worked. The two
# exclusions have to be AND-ed: OR-ing two "!=" tests is true for every row,
# which would make this the stroke rate of the whole dataset.
high_strokes = df[(df["work_type"]!="children") & (df["work_type"]!="Never_worked")]["stroke"].value_counts()
print("The percentage of (work type = high pressure) with strokes is {:.1f}%".format(100*high_strokes[1]/sum(high_strokes)))
# "Low pressure" = children or never worked.
low_strokes = df[(df["work_type"]=="children") | (df["work_type"]=="Never_worked")]["stroke"].value_counts()
print("The percentage of (work type = low pressure) with strokes is {:.1f}%".format(100*low_strokes[1]/sum(low_strokes)))

- **Could create feature of Self-employed/private/Gov_job or Children-work/never worked**
- self-employed shows the highest stroke percentage
- Never worked and children work shows the lowest

### Residence_type

In [None]:
df["Residence_type"].value_counts()

In [None]:
x=["Urban","Rural"]
plt.figure(figsize=(10,5))
_=plt.title("Strokes for Residence Type")
# Reindexing by the labels in x ties each bar to its own category; the two
# groups are close in size, so the frequency order of value_counts() is not
# a safe thing to rely on.
no_stroke_counts = df[df["stroke"]==0]["Residence_type"].value_counts().reindex(x, fill_value=0)
stroke_counts = df[df["stroke"]==1]["Residence_type"].value_counts().reindex(x, fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
# Stroke rate (%) per residence type; each group gets its own variable so the
# rural result is no longer stored under the urban name.
Urban_strokes = df[df["Residence_type"]=="Urban"]["stroke"].value_counts()
print("The percentage of (Residence = Urban) with strokes is {:.1f}%".format(100*Urban_strokes[1]/sum(Urban_strokes)))
Rural_strokes = df[df["Residence_type"]=="Rural"]["stroke"].value_counts()
print("The percentage of (Residence = Rural) with strokes is {:.1f}%".format(100*Rural_strokes[1]/sum(Rural_strokes)))

### smoking_status

In [None]:
df["smoking_status"].value_counts().sort_index()

In [None]:
x=["Unknown", "formerly smoked", "never smoked", "smokes"]
plt.figure(figsize=(10,5))
_=plt.title("Strokes for Smokers/Non-smokers")
# Reindex by the labels in x: unlike sort_index(), this still yields one value
# per label when a category is absent from one of the two groups.
no_stroke_counts = df[df["stroke"]==0]["smoking_status"].value_counts().reindex(x, fill_value=0)
stroke_counts = df[df["stroke"]==1]["smoking_status"].value_counts().reindex(x, fill_value=0)
_=plt.bar(x, height = no_stroke_counts, color="blue")
_=plt.bar(x, height = stroke_counts, color="red")

In [None]:
strokes = df[df["smoking_status"]=="Unknown"]["stroke"].value_counts()
print("The percentage of (smoked = unknown) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["smoking_status"]=="formerly smoked"]["stroke"].value_counts()
print("The percentage of (smoked = formerly) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["smoking_status"]=="never smoked"]["stroke"].value_counts()
print("The percentage of (smoked = never) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["smoking_status"]=="smokes"]["stroke"].value_counts()
print("The percentage of (smoked = smokes) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

In [None]:
strokes = df[df["smoking_status"]!="Unknown"]["stroke"].value_counts()
print("The percentage of (smoked = known) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

strokes = df[df["smoking_status"]=="Unknown"]["stroke"].value_counts()
print("The percentage of (smoked = unknown) with strokes is {:.1f}%".format(100*strokes[1]/sum(strokes)))

- **Perhaps include feature of known and unknown smoking habits**
- Unknown smoking habits have far fewer strokes than known smoking habits

## Create New Features From Findings:

**Features to Create/Edit:**
- age over 60 with a BMI between 22 and 38.
- age over 70 and no heart disease (seems counterintuitive, but the data doesn't lie).
- marriage as yes = 1, no = 0.
- Self-employed/private/Gov_job or Children-work/never worked
- known and unknown smoking habits.

In [None]:
df.head()

### Create Marriage Feature:

In [None]:
# Binary encoding: "Yes" -> 1, anything else -> 0.
df["ever_married"] = (df["ever_married"] == "Yes").astype("int64")

### Create Feature for age over 60 and bmi 22-38

In [None]:
def age_bmi(x):
    """Flag people aged 60 or over whose BMI lies in the closed range 22-38.

    Parameters
    ----------
    x : sequence of two numbers
        ``(age, bmi)``, e.g. one row of ``df[["age", "bmi"]]``. The pair is
        unpacked by position, which works for tuples, lists and pandas rows
        alike and avoids integer look-ups on a label-indexed Series.

    Returns
    -------
    int
        1 when both conditions hold, otherwise 0.
    """
    age, bmi = x
    return 1 if age >= 60 and 22 <= bmi <= 38 else 0

# Row-wise flag built from the (age, bmi) pair of every person.
df["age bmi"] = df[["age", "bmi"]].apply(age_bmi, axis=1)

### Over 70 and no heart disease:

In [None]:
def age_heart(x):
    """Flag people aged 70 or over who do NOT have heart disease.

    Parameters
    ----------
    x : sequence of two numbers
        ``(age, heart_disease)`` where ``heart_disease`` is 0 (no) or 1 (yes),
        e.g. one row of ``df[["age", "heart_disease"]]``.

    Returns
    -------
    int
        1 when age >= 70 and heart_disease == 0, otherwise 0.
    """
    age, heart_disease = x
    # The feature is "over 70, no heart disease", so the flag must test for 0
    # (the same group that was examined in the EDA section).
    return 1 if age >= 70 and heart_disease == 0 else 0

# Row-wise flag built from the (age, heart_disease) pair of every person.
df["over 70 no heart disease"] = df[["age", "heart_disease"]].apply(age_heart, axis=1)

### Employment:

In [None]:
def employment(x):
    """Return 1 for the work types Private, Self-employed and Govt_job, else 0."""
    return int(x in ("Private", "Self-employed", "Govt_job"))

# Collapse work_type into a binary flag ...
df["job type"] = df["work_type"].apply(employment)
# ... and discard the original text column.
df.drop(columns="work_type", inplace = True)

### Known smoking habit/ unknown smoking habit:

In [None]:
# 1 when the smoking status is known (any value other than "Unknown"), else 0.
df["smoking habbit"] = (df["smoking_status"] != "Unknown").astype("int64")
# remove the smoking_status feature now that it has been encoded.
df.drop(columns="smoking_status", inplace = True)

### Residency and Gender:

In [None]:
# Binary encodings: Male -> 1 (every other gender value -> 0), Urban -> 1 (otherwise 0).
df["gender"] = (df["gender"] == "Male").astype("int64")
df["Residence_type"] = (df["Residence_type"] == "Urban").astype("int64")

In [None]:
# 1 always has increase stroke chance except for gender and residence_type
df.head()

In [None]:
df[(df["age"] < 30) & (df["stroke"]==1)]

## Outliers:

#### Age vs Stroke Outliers:

In [None]:
_=sns.boxplot(y="age", x="stroke", data=df)

In [None]:
# how to find the different quartiles for the data.
def qaurtiles(data):
    """Return the box-plot summary of ``data``.

    The result is the tuple ``(lower fence, Q1, median, Q3, upper fence)``,
    where the fences sit 1.5 interquartile ranges below Q1 and above Q3.
    """
    lower_q, median, upper_q = np.percentile(data, [25, 50, 75])
    spread = upper_q - lower_q
    return lower_q - 1.5*spread, lower_q, median, upper_q, upper_q + 1.5*spread

In [None]:
# finds the stroke-group ages that lie below the lower fence.
data = df[df["stroke"]==1]["age"]
# the fence depends only on `data`, so compute it once rather than per element
lower_fence = qaurtiles(data)[0]
[x for x in data if x < lower_fence]

In [None]:
# The code below clamps the low-age outliers of the stroke group to the lower
# fence and plots the result; `df` itself is not modified in this cell.
# `data` holds the stroke-group ages selected in the previous cell.
threshold = qaurtiles(data)[0]

def filter_age(x):
    """Raise the age of a stroke row to the notebook-level `threshold` if it lies below it.

    x is one (age, stroke) row, accessed by position. The row is copied before
    it is changed, so the object handed in by `apply` is never mutated.
    """
    if x.iloc[1] == 1 and x.iloc[0] < threshold:
        x = x.copy()
        x.iloc[0] = threshold
    return x

# apply the threshold to a copy of the two columns
age_thresh = df[["age","stroke"]].apply(filter_age, axis=1)
age_thresh_df = age_thresh.copy()
# restore the original integer labels (the row-wise apply upcasts them)
age_thresh_df["stroke"] = df["stroke"]

# visualize the new distribution
_=plt.title("Age vs Stroke Outliers Removed")
_=sns.boxplot(y="age", x="stroke", data=age_thresh_df)

#### Glucose Levels vs Stroke Outliers:

In [None]:
_=sns.boxplot(y="avg_glucose_level", x="stroke", data=df)

In [None]:
data = df[df["stroke"]==0]["avg_glucose_level"]
# the fence depends only on `data`, so compute it once rather than per element
upper_fence = qaurtiles(data)[-1]
[x for x in data if x > upper_fence][0:5]

In [None]:
# Upper fence of the non-stroke glucose levels (`data` was selected in the previous cell).
threshold = qaurtiles(data)[-1]

def filter_glu(x):
    """Lower the glucose level of a non-stroke row to the notebook-level `threshold` if it exceeds it.

    x is one (avg_glucose_level, stroke) row, accessed by position. The row is
    copied before it is changed, so the object handed in by `apply` is never mutated.
    """
    if x.iloc[1] == 0 and x.iloc[0] > threshold:
        x = x.copy()
        x.iloc[0] = threshold
    return x

# apply the threshold to a copy of the two columns
glu_thresh = df[["avg_glucose_level","stroke"]].apply(filter_glu, axis=1)
glu_thresh_df = glu_thresh.copy()
# restore the original integer labels (the row-wise apply upcasts them)
glu_thresh_df["stroke"] = df["stroke"]

# visualize the new distribution
_=plt.title("Glucose vc Stroke Outliers Removed")
_=sns.boxplot(y="avg_glucose_level", x="stroke", data=glu_thresh_df)

#### BMI vs Stroke Outliers:

In [None]:
_=sns.boxplot(y="bmi", x="stroke", data=df)

In [None]:
# Print the first five BMI values above the upper fence of each class.
# Each fence is computed once per class instead of once per element.
data = df[df["stroke"]==0]["bmi"]
upper_fence = qaurtiles(data)[-1]
print("bmi no-stroke outliers: {}".format([x for x in data if x > upper_fence][0:5]))

data = df[df["stroke"]==1]["bmi"]
upper_fence = qaurtiles(data)[-1]
print("bmi stroke outliers: {}".format([x for x in data if x > upper_fence][0:5]))

In [None]:
# Clamp BMI outliers per class and plot the result; `df` itself is not
# modified in this cell.
data_no_stroke = df[df["stroke"]==0]["bmi"]
data_stroke = df[df["stroke"]==1]["bmi"]

threshold = qaurtiles(data_no_stroke)[-1]     # upper fence, no-stroke group
threshold_up = qaurtiles(data_stroke)[-1]     # upper fence, stroke group
threshold_low = qaurtiles(data_stroke)[0]     # lower fence, stroke group

def filter_bmi_noStroke(x):
    """Lower the BMI of a non-stroke row to the notebook-level `threshold` if it exceeds it.

    x is one (bmi, stroke) row, accessed by position; it is copied before being changed.
    """
    if x.iloc[1] == 0 and x.iloc[0] > threshold:
        x = x.copy()
        x.iloc[0] = threshold
    return x

def filter_bmi_Stroke(x):
    """Clamp the BMI of a stroke row into [`threshold_low`, `threshold_up`].

    x is one (bmi, stroke) row, accessed by position; it is copied before being changed.
    """
    if x.iloc[1] == 1 and x.iloc[0] > threshold_up:
        x = x.copy()
        x.iloc[0] = threshold_up
    elif x.iloc[1] == 1 and x.iloc[0] < threshold_low:
        x = x.copy()
        x.iloc[0] = threshold_low
    return x

# apply both filters, one after the other, to a copy of the two columns
bmi_thresh = df[["bmi","stroke"]].apply(filter_bmi_noStroke, axis=1)
bmi_thresh = bmi_thresh.apply(filter_bmi_Stroke, axis=1)
bmi_thresh_df = bmi_thresh.copy()
# restore the original integer labels (the row-wise apply upcasts them)
bmi_thresh_df["stroke"] = df["stroke"]

# visualize the new distribution
_=plt.title("BMI vs Stroke Outliers Removed")
_=sns.boxplot(y="bmi", x="stroke", data=bmi_thresh_df)

### Apply Outlier Changes to df

In [None]:
# Clamp the class-conditional outliers directly on df.
# Each column is updated with a boolean mask: the row-wise apply used for the
# previews returns a two-column frame, which cannot be assigned to one column.
# NOTE(review): the fences are conditioned on the target column and are applied
# before any train/test split - confirm this is intended.
is_stroke = df["stroke"] == 1
no_stroke = df["stroke"] == 0

# age: raise stroke-group ages below the lower fence
data = df[is_stroke]["age"]
threshold = qaurtiles(data)[0]
df["age"] = df["age"].mask(is_stroke & (df["age"] < threshold), threshold)

# glucose: lower non-stroke values above the upper fence
data = df[no_stroke]["avg_glucose_level"]
threshold = qaurtiles(data)[-1]
df["avg_glucose_level"] = df["avg_glucose_level"].mask(no_stroke & (df["avg_glucose_level"] > threshold), threshold)

# bmi: all three fences are computed from the unclamped column first
data_no_stroke = df[no_stroke]["bmi"]
data_stroke = df[is_stroke]["bmi"]
threshold = qaurtiles(data_no_stroke)[-1]
threshold_up = qaurtiles(data_stroke)[-1]
threshold_low = qaurtiles(data_stroke)[0]
df["bmi"] = df["bmi"].mask(no_stroke & (df["bmi"] > threshold), threshold)
df["bmi"] = df["bmi"].mask(is_stroke & (df["bmi"] > threshold_up), threshold_up)
df["bmi"] = df["bmi"].mask(is_stroke & (df["bmi"] < threshold_low), threshold_low)

### Scale skewed data to further remove outliers:

In [None]:
# Every column whose absolute skew exceeds the limit is replaced by log(1 + x).
# The target column is left untouched. df.skew() looks at all columns, so 0/1
# indicator columns are transformed as well when their skew is above the limit.

skew_limit = 0.75
skew_values = df.skew()
skew_cols = skew_values[skew_values.abs() > skew_limit].sort_values(ascending=False)

for col in skew_cols.index.drop("stroke", errors="ignore"):
    df[col] = np.log1p(df[col])

## Supervised Feature Importance:

### Decision Trees:

In [None]:
X = df.drop(labels="stroke",axis=1)
Y = df["stroke"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# scaling for a decision tree might not have much impact, as it does not calculate any euclidean distance.
# however, this will be useful for some of the unsupervised learning methods like k-mean-clusters.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit one tree per random state and collect its feature importances
# (one column per random state, one row per feature).
n_runs = 10
importances = {}
for i in range(n_runs):
    model = DecisionTreeClassifier(random_state=i).fit(X_train_scaled, y_train)
    importances[i] = pd.Series(data=model.feature_importances_, index=X.columns)
df_decision_tree = pd.DataFrame(importances)

# average the importances over all runs and plot them, largest first
mean_feature = df_decision_tree.mean(axis=1).sort_values(ascending=False)
_=plt.figure(figsize=(10,6))
_=plt.bar(x=mean_feature.index, height = mean_feature.values)
# rotation must be a number; the string '60' is rejected by current Matplotlib
_=plt.xticks(ticks=range(0,len(mean_feature)), labels = mean_feature.index, rotation=60)
_=plt.ylabel("Importance")
# the bars are a mean over all runs, so report the number of runs rather than the last seed
_=plt.title("Decision Tree Feature Importance for Strokes: mean over {} random states".format(n_runs))

**Interestingly, some of the features I thought would divide the data better didn't, and some that I thought would have no impact did**

- gender has a much larger impact than i thought
- people over 60 and bmi between 22 and 38 has a much smaller importance (could be because of the small number of people it applies to)
- average glucose levels had the largest importance, meaning it was probably the root node in the tree.

### Gradient Boosted Decision Trees:

In [None]:
# Fit a gradient boosted classifier with default settings and rank its feature importances.
GXDtree = GradientBoostingClassifier()
GXmodel = GXDtree.fit(X_train_scaled, y_train)
GXimportance = GXmodel.feature_importances_
GXimportance_labels = X.columns
GXimport_series = pd.Series(data=GXimportance, index=GXimportance_labels).sort_values(ascending=False)

_=plt.figure(figsize=(10,6))
_=plt.bar(x=GXimport_series.index, height = GXimport_series.values)
# rotation must be a number; the string '60' is rejected by current Matplotlib
_=plt.xticks(ticks=range(0,len(GXimport_series)), labels = GXimport_series.index, rotation=60)
_=plt.ylabel("Importance")
_=plt.title("Gradient Boosted Decision Tree Feature Importance for Strokes")

**using gradient boosted decision trees we are able to see importance of the feature with more complex behaviour**

- both tree models show that age, glucose levels, and BMI are the most important features.
- the over 60 and BMI between 22-38 feature, as well as the over 70 feature, seem to be more important in the gradient boosted tree.

## Machine Learning Section:

I'm assuming that we want to detect as many people who will get a stroke as possible; only then can we act to try and prevent a stroke from happening. This means we want **recall** to be as high as possible, but whenever we ask about recall we should really ask "recall at what precision?". If we have the maximum recall value then we are likely to have very low precision (for this dataset). Low precision would mean that we would tell many people they are going to have a stroke when they won't. Obviously this would be distressing for the person, so we don't want precision too low either. Luckily we have a great evaluation metric called the **F-beta score**, the weighted harmonic mean of precision and recall (the F1 score is the special case beta = 1): if beta > 1 recall is weighted more heavily, and if beta < 1 precision is weighted more heavily. I will set beta = 4, as stopping people from having strokes is much more important than stressing people. Note that beta only changes how a model is scored; it does not move the model's decision threshold, but it does favour models (and thresholds) that trade precision for recall.

In [None]:
df.stroke.value_counts()

### Train, Test, Validation Split

In [None]:
# Build the feature matrix / target and split them into three parts.
X = df.drop("stroke",axis=1)
Y = df["stroke"]

# train is 60%, test is 20% and validation is 20%
# (first hold out 40%, then cut that hold-out in half; both splits are seeded)
X_train, X_test_validation, y_train, y_test_validation = train_test_split(X, Y, test_size=0.4, random_state=13)
X_test, X_validation, y_test, y_validation = train_test_split(X_test_validation, y_test_validation, test_size=0.5, random_state=13)

# SMOTE to balance the minority class, THIS SHOULD ONLY BE APPLIED TO TRAINING DATA!
# Only X_train / y_train are resampled; the test and validation parts keep
# their original class balance. The resampling runs on the unscaled features.
sm = SMOTE(random_state=1)
X_train, y_train = sm.fit_resample(X_train, y_train)

# scale data: the scaler is fitted on the (resampled) training part only and
# then applied unchanged to the test and validation parts.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_validation = scaler.transform(X_validation)

### Evaluate Helper  Function

In [None]:
def evaluate(model, test=False, train=False, validation=False):
    """Print a confusion matrix and accuracy / precision / recall / F-beta (beta = 4) for `model`.

    The data comes from the notebook-level splits (X_train / y_train,
    X_test / y_test, X_validation / y_validation), looked up when the function
    is called.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``str(model)`` is used as the label of each score line.
    test, train, validation : bool
        Which splits to report on. Any combination may be enabled; the reports
        are printed in the order train, test, validation.

    Returns
    -------
    None
    """
    def _report(banner, features, labels, trailing_blank):
        # Prints the report for one split; shared by all three splits so their
        # layout cannot drift apart.
        pred = model.predict(features)
        confusion = confusion_matrix(labels, pred)
        label = str(model)
        print(banner)
        print("")
        print(confusion)
        print(label+" accuracy score: {:.2f}".format(accuracy_score(labels, pred)))
        print(label+" precision score: {:.2f}".format(precision_score(labels, pred)))
        print(label+" recall score: {:.2f}".format(recall_score(labels, pred)))
        print(label+" f1 (beta = 4) score: {:.2f}".format(fbeta_score(labels, pred, beta = 4)))
        if trailing_blank:
            print("")

    # returns train results
    if train:
        _report("-----------------------TRAINING SCORES-----------------------", X_train, y_train, True)

    # returns test results
    if test:
        _report("-----------------------TESTING SCORES-----------------------", X_test, y_test, True)

    # completely unseen data (this report has no trailing blank line)
    if validation:
        _report("-----------------------VALIDATION SCORES-----------------------", X_validation, y_validation, False)

### Null Metric Baseline

In [None]:
dummy = DummyClassifier(strategy="stratified").fit(X_train, y_train)
evaluate(dummy, test = True, train= True, validation=True)

- This will be used as a comparison for the final model

### Default Models:

In [None]:
lr = LogisticRegression(max_iter=300).fit(X_train,y_train) # 50/50
evaluate(lr, test = True, train= True)

In [None]:
svc = SVC().fit(X_train, y_train)
evaluate(svc, test = True, train= True)

In [None]:
rf = RandomForestClassifier().fit(X_train,y_train)
evaluate(rf, test = True, train= True)

In [None]:
gbdt = GradientBoostingClassifier().fit(X_train, y_train)
evaluate(gbdt, test = True, train= True)

In [None]:
knc = KNeighborsClassifier().fit(X_train, y_train)
evaluate(knc, test = True, train= True)

In [None]:
nbc = GaussianNB().fit(X_train, y_train)
evaluate(nbc, test = True, train= True)

In [None]:
dtree = DecisionTreeClassifier().fit(X_train, y_train)
evaluate(dtree, test = True, train= True)

**I have selected a few models that show promising results**
- I have taken Random forests forwards as they seem to be massively over fitting the data, so perhaps a large improvement could be found using some form of regularization
- Logistic regression scored well so this will be taken forwards
- SVM shows good results and will be carried forwards.
- Gradient boosted decision trees will be carried forwards.
- Gaussian Naive Bayes shows good results; however, this is a very simple model that assumes the features are independent of each other. There is also very limited hyperparameter tuning available for this model.

## How  to tune model with imbalanced Classes?

To accurately see how the model performs with real-world data, the testing and validation sets should be representative of the real-world data (i.e. have imbalanced classes). However, the training data needs to be balanced to allow the model to better learn who will get a stroke and who won't. Because the two need different class balances, plain KFold CV and GridSearchCV can't be applied to a pre-resampled dataset: they would train and test on the same data set, either with balanced classes (not representative of the data) or with imbalanced classes, which would result in poor performance.

What I've done below is create my own grid search and CV to allow the model to be trained on the balanced data and then tested on the imbalanced data.

Note: if I didn't use SMOTE or any other form of sampling then I could have used GridSearchCV directly. (An alternative is to wrap SMOTE and the model in an imblearn `Pipeline`, which resamples only the training folds inside GridSearchCV.)

In [None]:
# Using this for CV

def reshuffle():
    """Draw a fresh random train/test split of the module-level ``X`` and ``Y``.

    Each call:
      1. holds out 40% of ``X`` / ``Y`` (no ``random_state``, so the partition
         changes from call to call);
      2. halves that hold-out with a fixed ``random_state=13`` and keeps the
         first half as the test set (20% of the data) - the other half is not
         used by this function;
      3. balances the training portion with SMOTE (training data only);
      4. fits a ``MinMaxScaler`` on the resampled training data and applies it
         to both the training and the test features.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the training pair is
        SMOTE-resampled and both feature sets are min-max scaled.

    Note:
        The hold-out is drawn from all of ``X`` / ``Y`` on every call, so it is
        not restricted to rows outside any split made elsewhere in the notebook.
    """
    # train is 60%; the first split is deliberately unseeded to allow for shuffling
    X_train, X_holdout, y_train, y_holdout = train_test_split(X, Y, test_size=0.4)
    # test is half of the 40% hold-out; the remaining half is discarded here
    X_test, _, y_test, _ = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=13)

    # SMOTE to balance the minority class, THIS SHOULD ONLY BE APPLIED TO TRAINING DATA!
    sm = SMOTE()
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # scale data using statistics from the (resampled) training set only
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test

#### Create Random C regularization and then Scale:

## Random Forest Hyperparameter Tuning

In [None]:
# Random forest grid search: every hyperparameter combination is scored on several
# fresh reshuffle() splits and the per-split scores are averaged.

# hyperparameters I'm interested in using
n_trees = [10, 20, 50, 100, 300, 600]
criterion = ["gini", "entropy"]
depth = [3, 5, 8, 10]

# one dict of constructor kwargs per combination
# (n_estimators varies slowest, max_depth fastest)
grid = [
    {"n_estimators": n, "criterion": c, "max_depth": d}
    for n in n_trees
    for c in criterion
    for d in depth
]

# metric name -> scoring function; the names become the DataFrame columns.
# NOTE: the "f1(beta=4)" label is kept as-is so existing column lookups keep working,
# but the score it holds is an F-beta computed with beta=3.
metric_fns = {
    "accurary": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1(beta=4)": lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=3),
}

n_resamples = 5  # number of reshuffle() splits averaged per combination

# one list of mean scores per metric, aligned with `grid`
train_cache = {name: [] for name in metric_fns}
test_cache = {name: [] for name in metric_fns}

# loop through the grid to get scores for all hyperparameter combinations
for params in grid:
    split_train = {name: [] for name in metric_fns}
    split_test = {name: [] for name in metric_fns}

    for split_no in range(n_resamples):
        # Keep the resampled split in cell-local names. Rebinding X_train / y_train /
        # X_test / y_test here would silently replace the notebook-level split that the
        # later best-model cell fits on; a reshuffled training set could then contain
        # rows that belong to the held-out validation data.
        X_fit, y_fit, X_eval, y_eval = reshuffle()
        rf_model = RandomForestClassifier(n_jobs=-2, **params).fit(X_fit, y_fit)

        pred_fit = rf_model.predict(X_fit)
        pred_eval = rf_model.predict(X_eval)

        for name, score in metric_fns.items():
            split_train[name].append(score(y_fit, pred_fit))
            split_test[name].append(score(y_eval, pred_eval))

    # store the mean over the resampled splits
    for name in metric_fns:
        train_cache[name].append(np.mean(split_train[name]))
        test_cache[name].append(np.mean(split_test[name]))

train_cache["grid"] = grid
test_cache["grid"] = grid

# create dataframes (indexed by the hyperparameter dict) to easily work with these scores
train_df_rf = pd.DataFrame(train_cache).set_index("grid")
test_df_rf = pd.DataFrame(test_cache).set_index("grid")

# best combinations first, ranked by the mean test F-beta score
comp_rf = test_df_rf.sort_values("f1(beta=4)", ascending=False)
comp_rf.head(5)

In [None]:
# Refit a random forest with the top-ranked hyperparameters (first row of comp_rf,
# which is sorted by test "f1(beta=4)" in descending order) and report all splits.
# NOTE(review): X_train / y_train are notebook-level names - confirm they still refer
# to the intended training split when this cell runs.
best_rf_params = comp_rf.index[0]
RF_best = RandomForestClassifier(
    n_estimators=best_rf_params["n_estimators"],
    criterion=best_rf_params["criterion"],
    max_depth=best_rf_params["max_depth"],
)
RF_best.fit(X_train, y_train)
evaluate(RF_best, train=True, test=True, validation=True)

## Logistic Regression

In [None]:
# Logistic regression grid search: every hyperparameter combination is scored on
# several fresh reshuffle() splits and the per-split scores are averaged.

# hyperparameters I'm interested in using
# (the original list repeated 0.1 between 0.5 and 2; 1 is the value that belongs there)
C = [0.01, 0.1, 0.5, 1, 2, 5, 10, 50, 100]
penalty = ["l2"]

# one dict of constructor kwargs per combination
grid = [{"penalty": p, "C": c} for p in penalty for c in C]

# metric name -> scoring function; the names become the DataFrame columns.
# NOTE: the "f1(beta=4)" label is kept as-is so existing column lookups keep working,
# but the score it holds is an F-beta computed with beta=3.
metric_fns = {
    "accurary": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1(beta=4)": lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=3),
}

n_resamples = 5  # number of reshuffle() splits averaged per combination

# one list of mean scores per metric, aligned with `grid`
train_cache = {name: [] for name in metric_fns}
test_cache = {name: [] for name in metric_fns}

# loop through the grid to get scores for all hyperparameter combinations
for params in grid:
    split_train = {name: [] for name in metric_fns}
    split_test = {name: [] for name in metric_fns}

    for split_no in range(n_resamples):
        # Keep the resampled split in cell-local names. Rebinding X_train / y_train /
        # X_test / y_test here would silently replace the notebook-level split that the
        # later best-model cell fits on; a reshuffled training set could then contain
        # rows that belong to the held-out validation data.
        X_fit, y_fit, X_eval, y_eval = reshuffle()
        lr_model = LogisticRegression(max_iter=1000, **params).fit(X_fit, y_fit)

        pred_fit = lr_model.predict(X_fit)
        pred_eval = lr_model.predict(X_eval)

        for name, score in metric_fns.items():
            split_train[name].append(score(y_fit, pred_fit))
            split_test[name].append(score(y_eval, pred_eval))

    # store the mean over the resampled splits
    for name in metric_fns:
        train_cache[name].append(np.mean(split_train[name]))
        test_cache[name].append(np.mean(split_test[name]))

train_cache["grid"] = grid
test_cache["grid"] = grid

# create dataframes (indexed by the hyperparameter dict) to easily work with these scores
train_df_lr = pd.DataFrame(train_cache).set_index("grid")
test_df_lr = pd.DataFrame(test_cache).set_index("grid")

# best combinations first, ranked by the mean test F-beta score
comp_lr = test_df_lr.sort_values("f1(beta=4)", ascending=False)
comp_lr.head(5)

In [None]:
# Refit logistic regression with the top-ranked hyperparameters (first row of comp_lr).
# The search fitted every candidate with max_iter=1000 and an explicit penalty, so the
# final model uses the same settings; with the default iteration cap it would not be
# the configuration that was actually scored.
# NOTE(review): X_train / y_train are notebook-level names - confirm they still refer
# to the intended training split when this cell runs.
best_lr_params = comp_lr.index[0]
LR_best_model = LogisticRegression(
    penalty=best_lr_params["penalty"],
    C=best_lr_params["C"],
    max_iter=1000,
).fit(X_train, y_train)
evaluate(LR_best_model, test=True, train=True, validation=True)

## Support Vector Machine Hyperparameter Tuning

In [None]:
# SVM grid search: every hyperparameter combination is scored on several fresh
# reshuffle() splits and the per-split scores are averaged.

# Candidate C values are sampled on a log scale: four exponents between
# log10(min(C_range)) and 0, and four between 0 and log10(max(C_range)).
C_range = [0.01, 100]
C_start = np.log10(min(C_range))
C_end = np.log10(max(C_range))
r_large = C_end * np.random.rand(4)
r_small = C_start * np.random.rand(4)
r = np.concatenate((r_small, r_large))

C = [round(x, 2) for x in sorted(np.power(10, r))]
kernel = ["poly", "rbf", "sigmoid"]
gamma = ["scale", "auto"]

# one dict of constructor kwargs per combination (kernel varies slowest, C fastest)
grid = [
    {"kernel": k, "gamma": g, "C": c}
    for k in kernel
    for g in gamma
    for c in C
]

# metric name -> scoring function; the names become the DataFrame columns.
# NOTE: the "f1(beta=4)" label is kept as-is so existing column lookups keep working,
# but the score it holds is an F-beta computed with beta=3.
metric_fns = {
    "accurary": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1(beta=4)": lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=3),
}

n_resamples = 5  # number of reshuffle() splits averaged per combination

# one list of mean scores per metric, aligned with `grid`
train_cache = {name: [] for name in metric_fns}
test_cache = {name: [] for name in metric_fns}

# loop through the grid to get scores for all hyperparameter combinations
for params in grid:
    split_train = {name: [] for name in metric_fns}
    split_test = {name: [] for name in metric_fns}

    for split_no in range(n_resamples):
        # Keep the resampled split in cell-local names. Rebinding X_train / y_train /
        # X_test / y_test here would silently replace the notebook-level split that the
        # later best-model cell fits on; a reshuffled training set could then contain
        # rows that belong to the held-out validation data.
        X_fit, y_fit, X_eval, y_eval = reshuffle()
        svc = SVC(**params).fit(X_fit, y_fit)

        pred_fit = svc.predict(X_fit)
        pred_eval = svc.predict(X_eval)

        for name, score in metric_fns.items():
            split_train[name].append(score(y_fit, pred_fit))
            split_test[name].append(score(y_eval, pred_eval))

    # store the mean over the resampled splits
    for name in metric_fns:
        train_cache[name].append(np.mean(split_train[name]))
        test_cache[name].append(np.mean(split_test[name]))

train_cache["grid"] = grid
test_cache["grid"] = grid

# create dataframes (indexed by the hyperparameter dict) to easily work with these scores
train_df_svc = pd.DataFrame(train_cache).set_index("grid")
test_df_svc = pd.DataFrame(test_cache).set_index("grid")

# best combinations first, ranked by the mean test F-beta score
comp_svc = test_df_svc.sort_values("f1(beta=4)", ascending=False)
comp_svc.head(5)

In [None]:
# Refit an SVM with the top-ranked hyperparameters (first row of comp_svc, which is
# sorted by test "f1(beta=4)" in descending order) and report all splits.
# NOTE(review): X_train / y_train are notebook-level names - confirm they still refer
# to the intended training split when this cell runs.
best_svc_params = comp_svc.index[0]
svc_best_model = SVC(
    kernel=best_svc_params["kernel"],
    C=best_svc_params["C"],
    gamma=best_svc_params["gamma"],
)
svc_best_model.fit(X_train, y_train)
evaluate(svc_best_model, train=True, test=True, validation=True)

## Gradient Boosting Classifier Tuning

In [None]:
# Gradient boosting grid search: every hyperparameter combination is scored on
# several fresh reshuffle() splits and the per-split scores are averaged.

# hyperparameters I'm interested in using
learning_rate = [0.001, 0.01, 0.1, 0.2]
n_trees = [10, 20, 50, 100, 300]
depth = [3, 5, 8]

# one dict of constructor kwargs per combination
# (n_estimators varies slowest, max_depth fastest)
grid = [
    {"n_estimators": n, "learning_rate": l, "max_depth": d}
    for n in n_trees
    for l in learning_rate
    for d in depth
]

# metric name -> scoring function; the names become the DataFrame columns.
# NOTE: the "f1(beta=4)" label is kept as-is so existing column lookups keep working,
# but the score it holds is an F-beta computed with beta=3.
metric_fns = {
    "accurary": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1(beta=4)": lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=3),
}

n_resamples = 5  # number of reshuffle() splits averaged per combination

# one list of mean scores per metric, aligned with `grid`
train_cache = {name: [] for name in metric_fns}
test_cache = {name: [] for name in metric_fns}

# loop through the grid to get scores for all hyperparameter combinations
for params in grid:
    split_train = {name: [] for name in metric_fns}
    split_test = {name: [] for name in metric_fns}

    for split_no in range(n_resamples):
        # Keep the resampled split in cell-local names. Rebinding X_train / y_train /
        # X_test / y_test here would silently replace the notebook-level split that the
        # later best-model cell fits on; a reshuffled training set could then contain
        # rows that belong to the held-out validation data.
        X_fit, y_fit, X_eval, y_eval = reshuffle()
        gb_model = GradientBoostingClassifier(**params).fit(X_fit, y_fit)

        pred_fit = gb_model.predict(X_fit)
        pred_eval = gb_model.predict(X_eval)

        for name, score in metric_fns.items():
            split_train[name].append(score(y_fit, pred_fit))
            split_test[name].append(score(y_eval, pred_eval))

    # store the mean over the resampled splits
    for name in metric_fns:
        train_cache[name].append(np.mean(split_train[name]))
        test_cache[name].append(np.mean(split_test[name]))

train_cache["grid"] = grid
test_cache["grid"] = grid

# create dataframes (indexed by the hyperparameter dict) to easily work with these scores
train_df_gb = pd.DataFrame(train_cache).set_index("grid")
test_df_gb = pd.DataFrame(test_cache).set_index("grid")

# best combinations first, ranked by the mean test F-beta score
comp_gb = test_df_gb.sort_values("f1(beta=4)", ascending=False)
comp_gb.head(5)

In [None]:
# Refit gradient boosting with the top-ranked hyperparameters (first row of comp_gb,
# which is sorted by test "f1(beta=4)" in descending order) and report all splits.
# NOTE(review): X_train / y_train are notebook-level names - confirm they still refer
# to the intended training split when this cell runs.
best_gb_params = comp_gb.index[0]
GBC_best = GradientBoostingClassifier(
    n_estimators=best_gb_params["n_estimators"],
    learning_rate=best_gb_params["learning_rate"],
    max_depth=best_gb_params["max_depth"],
)
GBC_best.fit(X_train, y_train)
evaluate(GBC_best, train=True, test=True, validation=True)

# Results:

The Gradient Boosting Classifier shows the best results on the unseen validation data, with **recall = 87%, f1 = 66%, and accuracy = 73%**. The trade-off is that precision is very low: the model will manage to catch 87% of the patients who are going to have a stroke, but of all the patients told they are going to have a stroke, only 13% actually will.