In [2]:
#author: Shen Chan Huang
#data: stroke_data.csv
#Problem: Binary Classification
#prediction for stroke
#assuming 0 for no stroke and 1 for having stroke

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, auc, RocCurveDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.utils import resample

In [5]:
#loading data into a pandas dataframe
filename = 'stroke_data.csv'
df = pd.read_csv(filename, index_col='id')

In [6]:
#data cleaning. There's a negligible amount of other gender.
df = df[df['gender'] != 'Other']

In [7]:
# Split the data into features and target
target = 'stroke'
X = df.drop(target, axis=1)
y = df[target]

In [8]:
# Define the categorical and numerical columns in your dataframe
s = X.dtypes == 'object'
categorical_cols = list(s[s].index)
numerical_cols = [cname for cname in X if X[cname].dtype in ['int64', 'float64']]

#grab cat and num columns
X_cat = X[categorical_cols]
X_num = X[numerical_cols]

In [9]:
#impute numerical cols
#num_imp = IterativeImputer(random_state=0)
num_imp = SimpleImputer(strategy='mean')
num_imp.fit(X_num)
num_data = pd.DataFrame(num_imp.transform(X_num))

# Initialize the scaler and scale num data
std_scaler = StandardScaler()
X_num_std_scaled = pd.DataFrame(std_scaler.fit_transform(num_data))

#if we want to use min_max instead of standardization
minmax_scaler = MinMaxScaler()
X_num_minmax_scaled = pd.DataFrame(minmax_scaler.fit_transform(num_data))

In [10]:
# Initialize the imputer for categorical cols
cat_imp = SimpleImputer(strategy='most_frequent')
cat_imp.fit(X_cat)
cat_data = pd.DataFrame(cat_imp.transform(X_cat))

# Initialize the one-hot encoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_data_ohe = pd.DataFrame(ohe.fit_transform(cat_data))

#put numerical and categorical columns together
full_col_names = list(X_num.columns) + list(ohe.get_feature_names_out())
X_ohe_std = pd.concat([X_num_std_scaled, cat_data_ohe], axis=1, ignore_index = True)
X_ohe_minmax = pd.concat([X_num_minmax_scaled, cat_data_ohe], axis=1, ignore_index = True)

In [33]:
#define the model: logistic regression is a specific GLM that gives intepretation 
model = LogisticRegression(solver='newton-cholesky', max_iter=100)
#model = SVC(kernel='rbf', C=1)
#model = KNeighborsClassifier(n_neighbors=50)
#model = RandomForestClassifier(n_estimators=100, max_depth=5, verbose=1)
#model = XGBClassifier(max_depth=3, learning_rate=0.01, n_estimators=100)

#define sampling
oversampling = SMOTE(random_state=33)
undersampling = RandomUnderSampler(random_state=33)
steps= [ ('o', oversampling), ('u', undersampling)]
pipline = Pipeline(steps=steps)

In [29]:
# Define the number of folds for cross-validation
n_splits = 10

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=None)

In [34]:
# Loop over the folds
#ohe STD
save_coefs = list()
for fold, (train_index, test_index) in enumerate(skf.split(X_ohe_std, y)):
    # Split the data into training and testing sets
    X_train_resampled, y_train_resampled = pipline.fit_resample(X_ohe_std.iloc[train_index], y.iloc[train_index])
    X_test, y_test = X_ohe_std.iloc[test_index], y.iloc[test_index]
    
    # Train and evaluate your model using X_train_resampled and y_train_resampled, and X_test and y_test
    model.fit(X_train_resampled, y_train_resampled)

    predictions = model.predict(X_test)
    save_coefs.append(model.coef_[0])
    print(f'For fold {fold}:')
    print('')
    print(f'Confusion matrix: {confusion_matrix(y_test, predictions)}')
    fpr, tpr, thresholds = roc_curve(y_test, predictions)
    roc_auc = auc(fpr, tpr)
    print(f'Area under the ROC Curve: {roc_auc}')
    print(f'Classification report:')
    print(classification_report(y_test, predictions))
    print('------------------------------------------------------------------------')

For fold 0:

Confusion matrix: [[3201 1060]
 [  19   59]]
Area under the ROC Curve: 0.7538211807749475
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.75      0.86      4261
           1       0.05      0.76      0.10        78

    accuracy                           0.75      4339
   macro avg       0.52      0.75      0.48      4339
weighted avg       0.98      0.75      0.84      4339

------------------------------------------------------------------------
For fold 1:

Confusion matrix: [[3212 1049]
 [  13   65]]
Area under the ROC Curve: 0.7935734960494407
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86      4261
           1       0.06      0.83      0.11        78

    accuracy                           0.76      4339
   macro avg       0.53      0.79      0.48      4339
weighted avg       0.98      0.76      0.84      4339

---------------------

In [35]:
pd.DataFrame({'Weights':save_coefs[0]}, index=full_col_names)

Unnamed: 0,Weights
age,1.954504
hypertension,0.089063
heart_disease,0.121911
metric_1,0.093186
metric_2,-0.058219
metric_3,-0.034172
metric_4,-0.035707
metric_5,0.093186
x0_Female,-0.057564
x0_Male,0.057564
