In [None]:
import pandas as pd
import numpy as np

# Importing the data

In [None]:
# Location of the Kaggle stroke-prediction CSV, relative to the notebook.
filepath = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Read the raw table and promote the "id" column to the row index.
data_full = pd.read_csv(filepath).set_index('id')

# Preview the first rows as the cell output.
data_full.head()

# Selecting important data

I am not using the "ever_married", "work_type" and "Residence_type" columns, since they have nothing to do with suffering from a stroke.

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs; "stroke" is the prediction target.
features = ['gender', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'smoking_status']

# Work on an explicit copy so the cleaning below never touches (or depends on) data_full.
X = data_full[features].copy()

# Treat the 'Unknown' smoking status as a missing value so the imputer handles it.
# Assign the result back instead of using `inplace=True` on a selected column:
# the chained in-place form warns in recent pandas and has no effect under copy-on-write.
X['smoking_status'] = X['smoking_status'].replace('Unknown', np.nan)

y = data_full['stroke']

# 80/20 train/validation split.
# random_state makes the split (and every score derived from it) reproducible;
# stratify=y keeps the proportion of each target class the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0, stratify=y
)

# Preprocessing

First I divide the columns into numerical and categorical ones. After that I create a preprocessor for each group: a SimpleImputer for the numerical columns, and a Pipeline containing a SimpleImputer and a OneHotEncoder for the categorical columns.

In [None]:
# Look up every column's dtype once, then sort the column names into two groups.
column_dtypes = X_train.dtypes

# Integer/float columns go to the numerical preprocessor ...
numerical_cols = [name for name, kind in column_dtypes.items() if kind in ('int64', 'float64')]
# ... and object (string) columns go to the categorical preprocessor.
categorical_cols = [name for name, kind in column_dtypes.items() if kind == 'object']

# Report how many values are missing in each training column.
for name in X_train.columns:
    missing_count = X_train[name].isnull().sum()
    print(f"Number of missing values in {name.title()} = {missing_count}")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Numerical columns: fill missing values with the column median.
# (strategy='constant' with the default fill_value would replace them with 0,
# which is not a plausible measurement and distorts the feature.)
numerical_preprocessor = SimpleImputer(strategy='median')

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4; try the new name first and fall back
# to the old one so the cell runs on both old and new versions.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros.
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', one_hot_encoder),
])

# Route each group of columns to its own preprocessor.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_preprocessor, numerical_cols),
    ('categorical', categorical_preprocessor, categorical_cols),
])

# Creating models

**MAE function**

In [None]:
from sklearn.metrics import mean_absolute_error

def get_m_a_e(model_used, X_t=None, X_v=None, y_t=None, y_v=None):
    """Fit a model and return its mean absolute error on validation data.

    Parameters
    ----------
    model_used : estimator or pipeline exposing ``fit`` and ``predict``
        It is fitted in place on the training data.
    X_t, y_t : training features / target. Default to the notebook's current
        ``X_train`` / ``y_train``.
    X_v, y_v : validation features / target. Default to the notebook's current
        ``X_valid`` / ``y_valid``.

    Returns
    -------
    The mean absolute error between ``y_v`` and the model's predictions on ``X_v``.
    """
    # Resolve the defaults at call time rather than in the signature: defaults
    # in the signature are captured when this cell runs, so re-running the
    # split cell afterwards would silently keep scoring against the old split.
    X_t = X_train if X_t is None else X_t
    X_v = X_valid if X_v is None else X_v
    y_t = y_train if y_t is None else y_t
    y_v = y_valid if y_v is None else y_v

    model_used.fit(X_t, y_t)
    predictions = model_used.predict(X_v)
    # Documented argument order is (y_true, y_pred).
    return mean_absolute_error(y_v, predictions)

**Models**

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier

# The target is a class label, so use the classifier variant of XGBoost.
# A regressor predicts continuous scores, whose MAE is not comparable with
# the MAE of the hard class labels predicted by the random forests below.
from xgboost import XGBClassifier

# Gradient-boosted candidates: same learning rate, increasing number of trees.
model_1 = XGBClassifier(n_estimators=50, learning_rate=0.1, random_state=0)
model_2 = XGBClassifier(n_estimators=75, learning_rate=0.1, random_state=0)
model_3 = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=0)
model_4 = XGBClassifier(n_estimators=500, learning_rate=0.1, random_state=0)

# Random-forest candidates: increasing number of trees, fixed seed.
model_5 = RandomForestClassifier(n_estimators=100, random_state=0)
model_6 = RandomForestClassifier(n_estimators=200, random_state=0)
model_7 = RandomForestClassifier(n_estimators=300, random_state=0)
model_8 = RandomForestClassifier(n_estimators=400, random_state=0)

# Order matters: the comparison loop numbers the models by position in this list.
models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8]

**Testing models**

In [None]:
# Score every candidate with the same preprocessing and the same split.
# enumerate(..., start=1) numbers the models without a hand-maintained counter.
for i, model in enumerate(models, start=1):
    # Wrap preprocessing and model together so the preprocessing is fitted
    # on the training data only, inside get_m_a_e.
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    result = get_m_a_e(my_pipeline)
    print(f"Mean absolute error for model {i} = {result}")

# Output

In [None]:
from sklearn.metrics import classification_report

# Final model: a random forest with 300 trees and a fixed seed.
model = RandomForestClassifier(random_state=0, n_estimators=300)

# Preprocessing and model bundled into a single estimator.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split, then predict the validation split
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Per-class precision / recall / F1 on the validation split.
report = classification_report(y_valid, preds)
print(report)

# Save true labels next to predictions; the row index is not written to the file.
output = pd.DataFrame({'Stroke_data': y_valid, 'Stroke_preds': preds})
output.to_csv('output.csv', index=False)