In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the California housing training CSV into a DataFrame. The path is
# relative, so it resolves against the notebook's working directory.
data = pd.read_csv('../data/external/california_housing_train.csv')

In [None]:
# Separate the feature matrix from the regression target.
# (`columns=` already names the axis, so no extra `axis` argument is needed.)
X = data.drop(columns=['median_house_value'])
Y = data['median_house_value']

# The `np.float` alias was removed in NumPy 1.24 and raises AttributeError on
# current releases. It was only an alias for the builtin `float`, i.e. float64,
# so comparing against `np.float64` selects exactly the same columns.
numeric_features = [col for col in X.columns if X[col].dtype == np.float64]

# Preview the raw frame. Only the last expression of a cell is rendered, so
# the preview has to come last to actually be shown.
data.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition instead of
# drawing a new one on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=0
)

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from gplearn.genetic import SymbolicTransformer

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

# Names of the primitive operations handed to the symbolic transformer.
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Genetic-programming feature generator. random_state is fixed at 0 and the
# search is spread over three parallel jobs.
gp = SymbolicTransformer(
    generations=20,
    population_size=2000,
    hall_of_fame=100,
    n_components=5,
    function_set=function_set,
    parsimony_coefficient=1e-7,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=3,
)

In [None]:
# Branch 1: impute the numeric columns and pass every other column through
# unchanged.
col_passthrough = ColumnTransformer(
    transformers=[('Imputer', SimpleImputer(), numeric_features)],
    remainder='passthrough',
)

# Branch 2 is the symbolic transformer; FeatureUnion places its output next to
# the columns produced by branch 1.
gp_feat = FeatureUnion(
    transformer_list=[
        ('col_pass', col_passthrough),
        ('gp_featurizer', gp),
    ]
)

# Full preprocessing chain: combined features first, then standard scaling.
# The step names are runtime keys of `named_steps` and are kept as they are.
preprocessor = Pipeline(
    steps=[
        ('FeatureUnion', gp_feat),
        ('StandrdScaler', StandardScaler()),
    ]
)

In [None]:
# Display the pipeline's steps by name as a quick check of how it was assembled.
preprocessor.named_steps

In [None]:
# Fit the preprocessing pipeline on the training split only; the target is
# passed in together with the features.
preprocessor.fit(X_train, y_train)

In [None]:
from catboost import CatBoostRegressor, Pool

# Carve a validation slice out of the training split for early stopping.
# Passing the test split as eval_set would let the test data choose the
# stopping iteration, which makes any score later computed on X_test
# optimistically biased.
# NOTE(review): the preprocessor is fitted on all of X_train, so the validation
# rows are not completely unseen by the feature pipeline; the test split is.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=True, random_state=0
)

catboost_regressor = CatBoostRegressor(iterations=8000, early_stopping_rounds=50)

# The target is modelled on the log1p scale, so predictions have to be mapped
# back with np.expm1 before they are compared with raw house values.
eval_set = Pool(preprocessor.transform(X_val), np.log1p(y_val))
catboost_regressor.fit(
    preprocessor.transform(X_fit),
    np.log1p(y_fit),
    eval_set=eval_set,
    plot=True,
)

In [None]:
from sklearn.metrics import r2_score
# Score on the held-out split, step by step: preprocess, predict, undo the
# log1p scale the regressor is fitted on, then compare with the raw targets.
test_features = preprocessor.transform(X_test)
log_scale_predictions = catboost_regressor.predict(test_features)
score = r2_score(y_test, np.expm1(log_scale_predictions))

In [None]:
# Display the r2 score computed against y_test.
score

In [None]:
import joblib

# Serialise the preprocessor object to disk with joblib.
joblib.dump(preprocessor, '../models/preprocessor')

In [None]:
# Write the CatBoost model into the same ../models directory as the preprocessor.
catboost_regressor.save_model('../models/catboost_regressor')

In [None]:
# Baseline for comparison: with plain CatBoost (no preprocessor) the r2 score is 0.8350022987968354