# Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Loading dataset

In [None]:
# Location of the stroke-prediction CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Parse the CSV into a DataFrame; the bare name on the last line makes the
# notebook render a preview of the table.
df = pd.read_csv(csv_path)
df

# Data Cleaning

In [None]:
# Text-valued columns that have to be converted to integer codes before
# they can be fed to the model. Declared once so the encode step and the
# drop step below cannot drift apart.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# The id column is only metadata, so it carries no signal for the model.
df = df.drop(columns=['id'])

# Remove rows that have a missing value in any numeric feature or in the label.
df = df.dropna(subset=['bmi', 'avg_glucose_level', 'stroke', 'age'])

# Fit one LabelEncoder per text column and keep each fitted encoder, so the
# category -> integer mapping (encoder.classes_) stays available for
# inspection and for encoding new data the same way later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
df_text_cleared = pd.DataFrame(
    {col: label_encoders[col].fit_transform(df[col]) for col in categorical_cols},
    index=df.index,  # keep the original row labels so the concat below aligns
)

# Swap the raw text columns for their encoded counterparts.
df = df.drop(columns=categorical_cols)

df_result = pd.concat([df, df_text_cleared], axis=1)
df_result

# Defining Features and Label

In [None]:
# Features: every column of the cleaned table except the target.
# drop() already returns a new DataFrame, so no prior column selection is needed.
X = df_result.drop(columns=['stroke'])

# Label: the stroke column that the model learns to predict.
y = df_result['stroke']

# Splitting dataset for training and evaluation

In [None]:
# Hold out 10% of the rows for evaluation.
# - random_state pins the shuffle so the split (and every number derived
#   from it) is the same on each run of the notebook.
# - stratify=y keeps the class proportions of the label equal in both parts;
#   this matters when one class is rare (presumably the case for stroke --
#   confirm with y.value_counts()).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Fitting the model

In [None]:
# Random forest with trees limited to depth 15 and entropy as the split
# criterion. random_state fixes the bootstrap sampling and feature
# selection so that repeated runs train the same forest.
model = RandomForestClassifier(max_depth=15, criterion='entropy', random_state=42)

# Train on the training split; as the last expression of the cell, the
# fitted estimator is also rendered by the notebook.
model.fit(X_train, y_train)

# Evaluation

In [None]:
# Mean accuracy of the fitted model on the held-out test split.
result = model.score(X_test, y_test)

# Accuracy alone can look good while the model never predicts the rarer
# class, so also show how actual and predicted labels line up.
# NOTE(review): stroke is presumably the rare class here -- confirm with
# y.value_counts().
predicted = pd.Series(model.predict(X_test), index=y_test.index, name='predicted')
confusion = pd.crosstab(y_test.rename('actual'), predicted)
print(confusion)

result