In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
def transform_target_log3(data, target_name):
    """Return the base-3 logarithm of a target column, mapping zeros to 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the target column.
    target_name : str
        Name of the column to transform.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row: 0.0 where the target equals 0,
        otherwise ``log(v) / log(3)``.
    """
    val = np.asarray(data[target_name].values, dtype=float)
    # Start from zeros so entries equal to 0 keep the value 0 and the result
    # is always a float array, even when every target value is 0.
    transformed = np.zeros_like(val)
    nonzero = val != 0
    # Take the log only on the non-zero entries, so log(0) is never evaluated.
    transformed[nonzero] = np.log(val[nonzero]) / np.log(3)
    return transformed


def read_x_y(path, target_name):
    """Load a CSV file and split it into features and log3-transformed target.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.
    target_name : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the target column and
        ``y`` is the target column after ``transform_target_log3``.
    """
    frame = pd.read_csv(path)
    # Overwrite the raw target with its transformed values before splitting.
    frame[target_name] = transform_target_log3(frame, target_name)
    target = frame[target_name]
    features = frame.drop(columns = target_name)
    return features, target


def get_categorical_features(data, max_unique=10,
                             treat_as_continuous=("POVCAT15", "RTHLTH31", "MNHLTH31")):
    """List the columns of ``data`` that should be handled as categorical.

    A column is considered categorical when it has at most ``max_unique``
    distinct values (as counted by ``DataFrame.nunique``) and is not listed in
    ``treat_as_continuous``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame to inspect.
    max_unique : int, default 10
        Largest number of distinct values a categorical column may have.
    treat_as_continuous : iterable of str
        Low-cardinality columns that can be treated as continuous and are
        therefore left out of the result. Names that are absent from ``data``
        or that exceed ``max_unique`` are ignored rather than raising.

    Returns
    -------
    list of str
        Categorical column names, in the column order of ``data``.
    """
    num_unique = data.nunique()
    low_cardinality = num_unique[num_unique <= max_unique].index.tolist()
    # Filter instead of calling list.remove(), which raises ValueError when a
    # listed column is not among the low-cardinality candidates.
    excluded = set(treat_as_continuous)
    return [col for col in low_cardinality if col not in excluded]

---

In [3]:
# Training data, located relative to the notebook's working directory.
# Forward slashes avoid the invalid "\." / "\d" / "\M" escape sequences of the
# backslash form and resolve on both Windows and POSIX systems.
path = "../../data/MEPS_data_preprocessed_train.csv"
X, y = read_x_y(path, "HEALTHEXP")

In [4]:
# Split the columns into categorical ones and the rest (numerical),
# keeping the column order of X within each group.
categorical_features = get_categorical_features(X)
numerical_features = X.columns[~X.columns.isin(categorical_features)].tolist()

# Cast the categorical columns to strings before they are one-hot encoded.
X = X.astype({name: "str" for name in categorical_features})

# Categorical columns: one-hot encoding; categories unseen during fit are ignored.
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Numerical columns: standard scaling.
numerical_transformer = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numerical_transformer, numerical_features),
    ]
)

regressor = GradientBoostingRegressor(
    n_estimators=76,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=5,
    random_state=123,
)

# Note: despite the variable name, the estimator is scikit-learn's
# GradientBoostingRegressor.
reg_xgb = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", regressor)])

In [5]:
# Fit the preprocessing + regression pipeline on the training features and target.
reg_xgb.fit(X=X, y=y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='ignore',
                                                                                 sparse=True))],
                                                

In [6]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed, even if pickling raises.
with open("MEPS_xgb_model_final_v2_cat_vars_str.pickle", "wb") as model_file:
    pickle.dump(reg_xgb, model_file)