In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the stroke dataset from the Kaggle-style input directory into a
# DataFrame, then preview the first five rows as the cell's output.
data = pd.read_csv(
    filepath_or_buffer="../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
data.head(5)

In [None]:
# Separate the target column ("stroke") from the feature columns.
X = data.drop(columns=["stroke"])
y = data["stroke"]

# Hold out a test split (train_test_split's default proportions). The seed is
# fixed so that Restart & Run All reproduces the same partition -- and
# therefore the same metrics -- on every run; it matches the model's seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Identify the string-valued ("object" dtype) feature columns; they must be
# turned into numeric codes before imputation and model fitting.
type_sort = (X_train.dtypes == "object")
object_cols = list(type_sort[type_sort].index)

# Work on copies so the raw train/test splits stay untouched.
label_X_train = X_train.copy()
label_X_test = X_test.copy()

# Fit the encoder on the training split only. A category that occurs only in
# the test split is encoded as -1 instead of making transform() raise
# ValueError (the behaviour of the default handle_unknown="error").
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
if object_cols:  # nothing to encode when every feature is already numeric
    label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
    label_X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

# Fill the remaining missing values using statistics learned from the training
# split only (SimpleImputer's default strategy is the column mean).
my_imputer = SimpleImputer()

# The imputer returns bare arrays; restore both the column names and the
# original row index so the feature frames stay aligned with y_train / y_test.
final_X_train = pd.DataFrame(
    my_imputer.fit_transform(label_X_train),
    columns=X_train.columns,
    index=X_train.index,
)
final_X_test = pd.DataFrame(
    my_imputer.transform(label_X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Train a random forest on the preprocessed training features. fit() returns
# the estimator itself, so construction and fitting are chained; the fixed
# random_state seeds the forest's internal randomness.
# NOTE(review): the target column is named "stroke" -- if it is a 0/1 label,
# a classifier may be the more appropriate estimator; confirm.
model = RandomForestRegressor(random_state=1).fit(X=final_X_train, y=y_train)

# Predict on the held-out rows.
y_pred = model.predict(X=final_X_test)

In [None]:
# mean_absolute_error is declared as (y_true, y_pred): pass the ground truth
# first. The MAE value itself is symmetric, but keeping the documented order
# avoids silent errors if this metric is later swapped for an asymmetric one.
mae = mean_absolute_error(y_test, y_pred)
print(mae)

In [None]:
# Compute one set of bin edges over both series so the bars of the two
# histograms line up and can be compared directly (10 bins, matplotlib's
# default count).
shared_bins = np.histogram_bin_edges(
    np.concatenate([np.asarray(y_test), np.asarray(y_pred)]), bins=10
)

# Draw both distributions on the same axes; the bars are translucent so the
# second histogram cannot completely hide the first.
fig, ax = plt.subplots()
ax.hist(y_test, bins=shared_bins, alpha=0.6, label="actual data")
ax.hist(y_pred, bins=shared_bins, alpha=0.6, label="predictions")
ax.legend()
ax.set_ylabel("number of people")
ax.set_xlabel("number of strokes")
ax.set_title("Predicted & Actual Number of People who will have a Stroke")
plt.show()
