In [1]:
# Data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Imputing missing values
from sklearn.impute import KNNImputer

from scipy.stats import chi2_contingency

# Feature engineering
from sklearn.preprocessing import StandardScaler

# Model processing and testing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, plot_roc_curve, precision_score, recall_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
df = pd.read_csv("stroke_data.csv")

In [4]:
df.head()


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
s0 = round(df[df['stroke'] == 0].describe(), 2)
s1 = round(df[df['stroke'] == 1].describe(), 2)

pd.concat([s0, s1], axis = 1, keys = ['No Stroke', 'Stroke'])

Unnamed: 0_level_0,No Stroke,No Stroke,No Stroke,No Stroke,No Stroke,No Stroke,No Stroke,Stroke,Stroke,Stroke,Stroke,Stroke,Stroke,Stroke
Unnamed: 0_level_1,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4861.0,4861.0,4861.0,4861.0,4861.0,4700.0,4861.0,249.0,249.0,249.0,249.0,249.0,209.0,249.0
mean,36487.24,41.97,0.09,0.05,104.8,28.82,0.0,37115.07,67.73,0.27,0.19,132.54,30.47,1.0
std,21120.13,22.29,0.28,0.21,43.85,7.91,0.0,21993.34,12.73,0.44,0.39,61.92,6.33,0.0
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0,210.0,1.32,0.0,0.0,56.11,16.9,1.0
25%,17762.0,24.0,0.0,0.0,77.12,23.4,0.0,17013.0,59.0,0.0,0.0,79.79,26.4,1.0
50%,36958.0,43.0,0.0,0.0,91.47,28.0,0.0,36706.0,71.0,0.0,0.0,105.22,29.7,1.0
75%,54497.0,59.0,0.0,0.0,112.83,33.1,0.0,56669.0,78.0,1.0,0.0,196.71,33.7,1.0
max,72940.0,82.0,1.0,1.0,267.76,97.6,0.0,72918.0,82.0,1.0,1.0,271.74,56.6,1.0


In [6]:
df.isnull().sum()


id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
def count_negatives(data):
    neg_count = 0
    for n in data:
        if type(data) == 'int':
            if n < 0:
               neg_count += 1
    return neg_count

count_negatives(df)

0

In [8]:
df_knn = df.copy()
df_knn.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
impute = KNNImputer(n_neighbors = 5, weights = 'uniform')
df_knn['bmi'] = impute.fit_transform(df_knn[['bmi']])

In [10]:
df_knn.isnull().sum()


id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
df = pd.get_dummies(df, columns = ['gender', 'work_type', 'Residence_type', 'smoking_status'], prefix = ['sex', 'work', 'residence', 'smoke'])
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,sex_Female,sex_Male,...,work_Never_worked,work_Private,work_Self-employed,work_children,residence_Rural,residence_Urban,smoke_Unknown,smoke_formerly smoked,smoke_never smoked,smoke_smokes
0,9046,67.0,0,1,Yes,228.69,36.6,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,Yes,202.21,,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,Yes,105.92,32.5,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,Yes,171.23,34.4,1,1,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,Yes,174.12,24.0,1,1,0,...,0,0,1,0,1,0,0,0,1,0


In [12]:
df['ever_married'] = df['ever_married'].apply(lambda x: 1 if x == 'Yes' else 0)
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,sex_Female,sex_Male,...,work_Never_worked,work_Private,work_Self-employed,work_children,residence_Rural,residence_Urban,smoke_Unknown,smoke_formerly smoked,smoke_never smoked,smoke_smokes
0,9046,67.0,0,1,1,228.69,36.6,1,0,1,...,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,1,202.21,,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,1,105.92,32.5,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,1,171.23,34.4,1,1,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,1,174.12,24.0,1,1,0,...,0,0,1,0,1,0,0,0,1,0


In [13]:
num_cols = ['age', 'avg_glucose_level', 'bmi']

scaler = StandardScaler()

df[num_cols] = scaler.fit_transform(df[num_cols])

In [14]:
df = df.drop('id', axis = 1)
df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,sex_Female,sex_Male,sex_Other,...,work_Never_worked,work_Private,work_Self-employed,work_children,residence_Rural,residence_Urban,smoke_Unknown,smoke_formerly smoked,smoke_never smoked,smoke_smokes
0,1.051434,0,1,1,2.706375,0.981345,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,0.78607,0,0,1,2.121559,,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
2,1.62639,0,1,1,-0.005028,0.459269,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,0.255342,0,0,1,1.437358,0.701207,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1.582163,1,0,1,1.501184,-0.623083,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


In [16]:
gender = le.fit_transform(df['gender'])
smoking_status = le.fit_transform(df['smoking_status'])
work_type = le.fit_transform(df['work_type'])
Residence_type = le.fit_transform(df['Residence_type'])
ever_married = le.fit_transform(df['ever_married'])

In [17]:
df1['ever_married'] = ever_married
df1['Residence_type'] = Residence_type
df1['smoking_status'] = smoking_status
df1['gender'] = gender
df1['work_type'] = work_type