In [None]:
import os
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,plot_roc_curve,auc,precision_recall_curve,plot_precision_recall_curve,average_precision_score

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the stroke CSV; the first CSV column becomes the DataFrame index.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
dataset = pd.read_csv(DATA_PATH, index_col=0)
# Independent working copy of the freshly loaded frame.
dataset_v2 = dataset.copy()


# Brief description of the data set and a summary of its attributes

According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.

1. id: unique identifier

2. gender: "Male", "Female" or "Other"

3. age: age of the patient

4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

6. ever_married: "No" or "Yes"

7. work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

8. Residence_type: "Rural" or "Urban"

9. avg_glucose_level: average glucose level in blood

10. bmi: body mass index

11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient.

12. stroke: 1 if the patient had a stroke or 0 if not

In [None]:
# Display the frame as loaded (the first CSV column is still the index here).
dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

# Main objective(s) of this analysis.
The main objective is to classify/predict whether a particular person, based on their parameters, is likely to have a stroke. This analysis can be very helpful and interesting for real-world problems, as strokes account for as much as 11% of all deaths in the world. To sum up, I'm going to create my target variable y (stroke: 0 = no, 1 = yes) and try to predict it from the features (basically the rest of the columns) using different classification models.

# Exploratory data analysis (EDA)

In [None]:
# Turn the current index back into a regular column so the rows are
# numbered with a default 0..n-1 index.
dataset = dataset.reset_index()

In [None]:
# The 'id' column is only an identifier, so remove it from the frame.
dataset = dataset.drop(columns='id')

In [None]:
# Display the frame after the 'id' column has been dropped.
dataset

In [None]:
# Correlation of each numeric feature with the target variable (stroke).
# Every column except the last one (the target) is a candidate feature.
features = dataset.columns[:-1]
# Restrict to numeric columns explicitly: the string-valued categorical
# columns cannot be correlated, and current pandas raises on them instead
# of silently skipping them.
numeric_features = dataset[features].select_dtypes(include='number').columns
# Chain the sort instead of sorting in place, so re-running the cell is safe.
correlations = (
    dataset[numeric_features]
    .corrwith(dataset['stroke'])
    .sort_values(ascending=False)
)
correlations

In [None]:
# Print column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics of the frame's columns.
dataset.describe()

In [None]:
# Count missing values per column.
# Author's observation: 201 NULL values in the 'bmi' column.
dataset.isnull().sum()

In [None]:
# One-column heatmap annotated with the number of missing values per column.
missing_counts = dataset.isna().sum().to_frame()

plt.title('Missing Value Status', fontweight='bold')
ax = sn.heatmap(missing_counts, annot=True, fmt='d', cmap='vlag')
ax.set_xlabel('Amount Missing')
plt.show()

In [None]:
# Average BMI over the rows where it is present; used below for imputation.
bmi_mean = dataset['bmi'].mean()
bmi_mean

In [None]:
# Replace missing BMI values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `dataset.bmi`: in-place mutation through an
# intermediate Series is deprecated in pandas and does not update the
# parent frame when copy-on-write is enabled.
dataset['bmi'] = dataset['bmi'].fillna(bmi_mean)

In [None]:
# Keep BMI at two decimal places.
dataset['bmi'] = dataset['bmi'].round(2)

In [None]:
# Re-count missing values per column after the BMI imputation step.
dataset.isnull().sum()

In [None]:
# How many rows are stroke cases (True) versus not (False).
# Author's observation: 249 of 5110 people actually had a stroke.
dataset['stroke'].eq(1).value_counts()

In [None]:
# Class balance of the target, computed from the data instead of hard-coded
# counts so it stays correct if the rows change.
# `b` is the number of rows per stroke case (total / positives). The number
# of non-stroke rows per stroke row is therefore b - 1. Either way the target
# is heavily imbalanced, which has to be handled before modelling.
stroke_counts = dataset['stroke'].value_counts()
n_negative = stroke_counts.get(0, 0)
n_positive = stroke_counts.get(1, 0)
a = n_negative + n_positive
b = a / n_positive
print(b)

In [None]:
# Display the avg_glucose_level column.
dataset.avg_glucose_level

In [None]:
# Take a fresh working copy of the cleaned frame; later feature engineering
# is applied to dataset_v2 rather than to dataset.
dataset_v2 = dataset.copy()

In [None]:
# Engineered binary feature: 1 when the average glucose level is at or above
# the cutoff, else 0. Stored as integers (not the strings '1'/'0') so the
# column is numeric like the other 0/1 indicator columns.
# NOTE(review): the 128 cutoff is the notebook author's choice; a single
# average glucose reading is only a rough proxy for diabetes — confirm.
GLUCOSE_THRESHOLD = 128
dataset_v2['diabetes'] = (dataset_v2['avg_glucose_level'] >= GLUCOSE_THRESHOLD).astype(int)

In [None]:
# Put the engineered 'diabetes' column before the target so that 'stroke'
# is the last column again.
column_order = [
    'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
    'smoking_status', 'diabetes', 'stroke',
]
dataset_v2 = dataset_v2[column_order]

In [None]:
# Display the working frame with the reordered columns.
dataset_v2

In [None]:
# Number of distinct values per column (a missing value counts as a value),
# as a one-column table indexed by variable name.
data_uniques = (
    dataset_v2
    .nunique(dropna=False)
    .rename_axis('Variable')
    .to_frame('Unique Values')
)
data_uniques

In [None]:
# Histogram of patient age using 25 bins.
plt.figure(figsize=(8, 8))
ax = plt.axes()
ax.hist(dataset_v2['age'], bins=25)
ax.set_xlabel('age')
ax.set_ylabel('Frequency');

In [None]:
# Histogram of average glucose level using 25 bins.
plt.figure(figsize=(8, 8))
ax = plt.axes()
ax.hist(dataset_v2['avg_glucose_level'], bins=25)
ax.set_xlabel('avg_glucose_level')
ax.set_ylabel('Frequency');

In [None]:
# Histogram of BMI using 25 bins.
plt.figure(figsize=(8, 8))
ax = plt.axes()
ax.hist(dataset_v2['bmi'], bins=25)
ax.set_xlabel('bmi')
ax.set_ylabel('Frequency');

In [None]:
# Scatter plot of BMI against age, one small red marker per patient.
plt.figure(figsize=(10, 10))
ax = plt.axes()
ax.scatter(dataset_v2['age'], dataset_v2['bmi'], s=10, c='r')
ax.set_xlabel('Age')
ax.set_ylabel('BMI')
ax.set_title('age-bmi distribution');

In [None]:
# Scatter plot of average glucose level against age, one small red marker per patient.
plt.figure(figsize=(10, 10))
ax = plt.axes()
ax.scatter(dataset_v2['age'], dataset_v2['avg_glucose_level'], s=10, c='r')
ax.set_xlabel('Age')
ax.set_ylabel('glucose')
ax.set_title('age-glucose distribution');

In [None]:
# Pie chart of the target distribution.
# `sizes` holds the count of each stroke value; `labels` are those values.
sizes = dataset_v2['stroke'].value_counts(sort=True)
labels = sizes.index

colors = ["lightblue", "red"]
# Offset the first wedge from the centre so it stands out.
explode = (0.3, 0)

plt.figure(figsize=(7, 7))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.title('Number of stroke in the dataset')
plt.show()

In [None]:
# Quick check of whether the living area (Rural/Urban) relates to stroke:
# count every (Residence_type, stroke) combination.
# Author's conclusion from the counts: that assumption does not hold.
residence_stroke = dataset_v2.loc[:, ['Residence_type', 'stroke']]
residence_stroke.value_counts()

In [None]:
# Replace each string-valued categorical column with integer codes.
# The same encoder object is re-fitted for every column, in this order.
le = LabelEncoder()
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_cols:
    dataset_v2[col] = le.fit_transform(dataset_v2[col])

In [None]:
# Preview the first rows after encoding the categorical columns.
dataset_v2.head()

# MODEL 1 

Logistic Regression models: the default configuration plus cross-validated variants with L1 and L2 penalties. I had to use the SMOTE module because my target variable (y) has roughly a 20 to 1 class ratio.


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column (everything except the target) with MinMaxScaler.
# NOTE(review): the features are standardized again after the train/test
# split, so this step looks redundant — confirm before removing.
feature_cols = [
    'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
    'smoking_status', 'diabetes',
]
scaler = MinMaxScaler()
dataset_v2[feature_cols] = scaler.fit_transform(dataset_v2[feature_cols])

In [None]:
# Display the working frame after min-max scaling of the feature columns.
dataset_v2

In [None]:
# Build the feature matrix X and the target vector y.
# Columns are selected by name rather than by a hard-coded position (:11),
# so X stays correct if features are added or removed upstream.
X = dataset_v2.drop(columns='stroke')
y = dataset_v2['stroke']

In [None]:
# Report the dimensions of the feature matrix and the target vector.
print('X Shape', X.shape)
print('Y Shape',y.shape)

In [None]:
# Split into train (75%) and test (25%) sets.
# - random_state is fixed so the split, and every metric reported below, is
#   reproducible across kernel restarts.
# - stratify=y keeps the rare stroke class at the same proportion in both
#   subsets; an unstratified split of such an imbalanced target can leave
#   the test set with very few positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Standardize the features: fit on the training set only, then apply the
# same transform to the test set so no test information leaks in.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print('Number transations x_train df',X_train.shape)
print('Number transations x_test df',X_test.shape)
print('Number transations y_train df',y_train.shape)
print('Number transations y_test df',y_test.shape)

In [None]:
# Training-set size and class counts before SMOTE is applied.
n_pos_before = (y_train == 1).sum()
n_neg_before = (y_train == 0).sum()

print('Before OverSampling, the shape of train_x: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {}'.format(y_train.shape))
print('Before OverSampling, counts of label 1: {}'.format(n_pos_before))
print('Before OverSampling, counts of label 0: {} \n'.format(n_neg_before))

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority (stroke) class in the TRAINING data only; the test
# set is left untouched so evaluation reflects the real class distribution.
sm = SMOTE(random_state=2)
# np.asarray gives the same 1-D label array that Series.ravel() returned,
# without relying on the deprecated Series.ravel().
X_train_res, y_train_res = sm.fit_resample(X_train, np.asarray(y_train))

In [None]:
# Training-set size and class counts after SMOTE.
n_pos_after = (y_train_res == 1).sum()
n_neg_after = (y_train_res == 0).sum()

print('After OverSampling, the shape of train_x: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {}'.format(y_train_res.shape))
print('After OverSampling, counts of label 1: {}'.format(n_pos_after))
print('After OverSampling, counts of label 0: {}'.format(n_neg_after))

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Baseline logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)

# Settings shared by both cross-validated models: 10 candidate C values,
# 4-fold cross-validation, liblinear solver.
cv_settings = dict(Cs=10, cv=4, solver='liblinear')

# L1-penalized model.
lr_l1 = LogisticRegressionCV(penalty='l1', max_iter=2500, **cv_settings)
lr_l1.fit(X_train_res, y_train_res)

# L2-penalized model.
lr_l2 = LogisticRegressionCV(penalty='l2', max_iter=1300, **cv_settings)
lr_l2.fit(X_train_res, y_train_res)

In [None]:
# Collect test-set predictions from the three logistic models,
# one column per model.
coeff_labels = ['lr','lr1','lr2']
coeff_models = [lr,lr_l1,lr_l2 ]

# Predicted class labels.
y_pred = pd.DataFrame({
    lab: mod.predict(X_test)
    for lab, mod in zip(coeff_labels, coeff_models)
})

# For each row, the highest class probability (i.e. the probability the
# model assigns to the class it predicts).
y_prob = pd.DataFrame({
    lab: mod.predict_proba(X_test).max(axis=1)
    for lab, mod in zip(coeff_labels, coeff_models)
})

In [None]:
# Preview the per-model probability columns.
y_prob.head()

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Evaluate each logistic model on the test set and gather the results into
# one table (rows = metrics, columns = models). `cm` keeps the confusion
# matrix of every model, keyed by model label.
metric_rows = []
cm = {}

for lab, mod in zip(coeff_labels, coeff_models):

    # Precision, recall and F-score, weighted by class support.
    precision, recall, fscore, _ = score(y_test, y_pred[lab], average='weighted')

    # Plain accuracy.
    accuracy = accuracy_score(y_test, y_pred[lab])

    # ROC-AUC needs a continuous score, so use the predicted probability of
    # the positive class (column 1 for classes [0, 1]). Feeding hard 0/1
    # predictions collapses the ROC curve to a single operating point.
    # The result is stored in `auc_score` so the `auc` function imported
    # from sklearn.metrics is not overwritten.
    positive_proba = mod.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, positive_proba)

    # Confusion matrix (rows = true class, columns = predicted class).
    cm[lab] = confusion_matrix(y_test, y_pred[lab])

    metric_rows.append(pd.Series({'precision':precision, 'recall':recall,
                                  'fscore':fscore, 'accuracy':accuracy,
                                  'auc':auc_score},
                                 name=lab))

# The result keeps the name `metrics` because the next cell displays it.
metrics = pd.concat(metric_rows, axis=1)

In [None]:
# Display the metrics table (one column per logistic model).
metrics

# MODEL 2 
K-Nearest Neighbors. 


The KNN classifier below is fit on the same SMOTE-resampled training data as Model 1 and evaluated on the same held-out test split.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit a 3-nearest-neighbours classifier on the resampled training data and
# score it on the untouched test set.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_res, y_train_res)
y_pred = knn.predict(X_test)

knn_accuracy = round(accuracy_score(y_test, y_pred), 2)
knn_f1 = round(f1_score(y_test, y_pred), 2)

print(classification_report(y_test, y_pred))
print('Accuracy score: ', knn_accuracy)
print('F1 Score: ', knn_f1)

In [None]:
# Plot the confusion matrix for the predictions currently held in y_pred.
sn.set_palette(sn.color_palette(colors))
_, ax = plt.subplots(figsize=(12,12))
# labels=[0, 1] pins the class order, so row/column 0 is always "no stroke"
# and row/column 1 is always "stroke".
ax = sn.heatmap(confusion_matrix(y_test, y_pred, labels=[0, 1]), annot=True, fmt='d', cmap=colors, annot_kws={"size": 40, "weight": "bold"})
# Tick labels follow the class order [0, 1].
labels = ['No stroke', 'Stroke']
ax.set_xticklabels(labels, fontsize=25);
ax.set_yticklabels(labels, fontsize=25);
# confusion_matrix(y_true, y_pred) puts the true classes on the rows
# (y axis) and the predicted classes on the columns (x axis).
ax.set_ylabel('Ground Truth', fontsize=30);
ax.set_xlabel('Prediction', fontsize=30);

# MODEL 3
XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees on the resampled training data.
# eval_metric is 'logloss' (binary log loss) to match the binary:logistic
# objective; 'mlogloss' is the multiclass variant.
xgb = XGBClassifier(learning_rate=0.1,objective='binary:logistic',random_state=0,eval_metric='logloss',use_label_encoder=False)
xgb = xgb.fit(X_train_res, y_train_res)

# Evaluate on the untouched test set.
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy score: ', round(accuracy_score(y_test, y_pred), 2))
print('F1 Score: ', round(f1_score(y_test, y_pred), 2))

# Recommendation for final model.
I trained 3 different models on the same training and test splits. I would definitely recommend XGBoost because it got the best result and it was very fast.

# Summary Key Findings and Insights.
At the beginning I thought there would be a strong correlation between the features and the target; for example, that a patient who smokes cigarettes would have a higher chance of having a stroke. Each column did have a positive correlation with the target variable, but none was higher than 0.5.

Also Residence_type has almost nothing to do with higher or lower probability of having a stroke which for me was quite surprising.


# Suggestions for next steps
In my opinion, the most efficient way to improve the prediction score would be to add specific features that have a high correlation with our target variable (stroke). I would focus on gathering more specific information about each patient and their health.