In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from collections import Counter


# Read the stroke-prediction CSV into a DataFrame, then show its
# (rows, columns) shape together with the column index as the cell output.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
(df.shape, df.columns)

In [None]:
# Preview the first 12 records, transposed so each record is a column and
# every field stays visible without horizontal scrolling.
df.head(n=12).transpose()

In [None]:
# Bar chart of how many rows fall into each value of the 'stroke' target.
# The plot shows a highly imbalanced dataset.
sns.countplot(data=df, x='stroke')
plt.show()

In [None]:
# Bar chart of how many rows fall into each 'work_type' category.
sns.countplot(data=df, x='work_type')
plt.show()

In [None]:
# Remove, in place, every row that contains at least one missing value,
# then display the remaining (rows, columns) shape.
# NOTE(review): incomplete rows are discarded rather than imputed — confirm
# that losing those records is acceptable for this analysis.
df.dropna(axis=0, how='any', inplace=True)
df.shape

In [None]:
# Replace each categorical text column with integer codes.
from sklearn.preprocessing import LabelEncoder

# Columns are processed in this fixed order; each one gets a freshly fitted
# encoder, so after the loop `le` holds the encoder fitted on the last
# column ('smoking_status').
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
for column_name in categorical_columns:
    le = LabelEncoder()
    df[column_name] = le.fit_transform(df[column_name])

In [None]:
# Convert every column to float in a single call instead of looping
# column by column.
df = df.astype(float)

In [None]:
# Pairwise correlation between all columns, rendered as a heatmap.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix)
plt.show()

In [None]:
# Reshuffle the dataset. The seed is fixed so that a fresh
# Restart & Run All produces the same row order on every run; without it
# each run shuffles differently and downstream results are not repeatable.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the target from the features. 'stroke' is the label being
# predicted; 'id' is dropped as well (presumably a row identifier with no
# predictive meaning — verify against the dataset description).
Y = df['stroke']
df = df.drop(columns=['id', 'stroke'])

In [None]:
# transform the dataset
oversample = SMOTE()
df, Y = oversample.fit_resample(df, Y)
# summarize the new class distribution
counter = Counter(Y)
print(counter)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score

In [None]:
rfc = RandomForestClassifier(n_jobs=-1)
kf = KFold(n_splits=10, shuffle=True)

In [None]:
for train_index, test_index in kf.split(df, Y):
    x, val_x = df.iloc[train_index], df.iloc[test_index]
    y, val_y = Y.iloc[train_index], Y.iloc[test_index]
    
    rfc.fit(x, y)
    p = rfc.predict(val_x)
    print(f"F1 Score for fold is -> {f1_score(val_y, p)} and AUC-ROC score -> {roc_auc_score(val_y, p)}")
    print(confusion_matrix(val_y, p))