# Stroke Prediction

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from yellowbrick.target import ClassBalance
from imblearn.over_sampling import SMOTE

## Data Preprocessing

In [None]:
# Read the stroke dataset CSV into a DataFrame and preview its first rows
DATA_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# View summary statistics (count, mean, std, min, quartiles, max) of the
# numerical columns; the notebook renders the returned frame as the cell output.
df.describe()

In [None]:
# One-hot encode the non-numerical columns, then discard every row that
# still contains a missing value
df = pd.get_dummies(df).dropna()

# Features are everything except the record id and the target; the target
# is the "stroke" column
y = df["stroke"]
X = df.drop(columns=["id", "stroke"])

# Standardize each feature to zero mean and unit variance
scaler = StandardScaler().fit(X)
X_norm = scaler.transform(X)

In [None]:
# Plot how many samples fall into each target class to expose the imbalance
visualizer = ClassBalance().fit(y)
visualizer.show()

In [None]:
# Split the data into train and test sets with an 80/20 ratio BEFORE any
# resampling. Oversampling first would let synthetic points interpolated from
# test rows leak into the training set and would turn the test set into a
# balanced, partly synthetic sample, so the reported metrics would be inflated.
# `stratify=y` keeps the original class ratio in both partitions, which matters
# because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y, test_size=0.2, random_state=0, stratify=y
)

# Apply SMOTE to the training portion only to mitigate class imbalance;
# the test set keeps the real-world class distribution.
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Downstream cells train on X_train / y_train, so point them at the
# balanced training data.
X_train, y_train = X_res, y_res

## Train Classifier

In [None]:
# Train a random forest (fixed seed for reproducibility) on the training set
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict class labels for the test set
y_pred = clf.predict(X_test)

## Calculate Metrics

In [None]:
# Build the per-class precision / recall / F1 / support report and print it
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Horizontal bar chart of the forest's feature importances, ordered from
# least to most important (the largest bar ends up at the top)
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
plt.barh(X.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()