In [54]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor, XGBRFRegressor

# Column-wise preprocessing: one-hot encode the categorical 'ocean_proximity'
# column and forward every other column untouched (remainder='passthrough').
#
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit. With the default ('error'), a category that is
# missing from one cross-validation training fold but present in the matching
# validation fold (or in the held-out test set) raises at transform time.
# Categories that were seen during fit are encoded exactly as before.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), ['ocean_proximity'])
    ],
    remainder='passthrough'
)

# Skeleton pipeline: only the column preprocessor is concrete. The remaining
# stages are 'passthrough' placeholders so that a parameter grid can swap in
# actual estimators for the 'imputer', 'scaler' and 'regressor' slots.
placeholder_steps = [
    (step_name, 'passthrough')
    for step_name in ('imputer', 'scaler', 'regressor')
]

pipeline = Pipeline(steps=[('preprocessor', preprocessor), *placeholder_steps])

# Search space for GridSearchCV, expressed as one sub-grid per regressor.
#
# The learning-rate sweep only makes sense for the boosted model. XGBoost's
# random-forest documentation requires learning_rate (eta) to stay at 1 for
# random-forest regression, and XGBRFRegressor already defaults to that, so the
# forest is searched with its default learning rate instead of being shrunk by
# 0.1 / 0.01 / 0.001. This also removes candidates that could never win and
# only added cross-validation fits.
#
# param_grid accepts a list of dicts; each dict is expanded independently.
params = [
    {
        'imputer': [SimpleImputer(), 'passthrough'],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'regressor': [XGBRegressor(objective='reg:squarederror')],
        'regressor__learning_rate': [0.1, 0.01, 0.001],
    },
    {
        'imputer': [SimpleImputer(), 'passthrough'],
        'scaler': [StandardScaler(), MinMaxScaler()],
        'regressor': [XGBRFRegressor(objective='reg:squarederror')],
    },
]

# Exhaustive search over `params` on top of `pipeline`, scored with 5-fold
# cross-validation.
cv_search_options = {
    # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    'scoring': 'neg_root_mean_squared_error',
    'cv': 5,
}

model = GridSearchCV(pipeline, params, **cv_search_options)

In [55]:
import pandas as pd

# Read the housing table, then separate the regression target (y) from the
# predictor columns (X). `data` itself is left unmodified.
target_variable = 'median_house_value'

data = pd.read_csv('housing.csv')

y = data[target_variable]
X = data.drop(target_variable, axis='columns')

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for final evaluation; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [57]:
# Run the cross-validated grid search on the training split only.
# Left as the cell's final expression so the fitted search object is displayed.
model.fit(X=X_train, y=y_train)

In [59]:
# Display the parameter combination selected by the grid search.
model.best_params_

{'imputer': SimpleImputer(),
 'regressor': XGBRegressor(objective='reg:squarederror'),
 'regressor__learning_rate': 0.1,
 'scaler': StandardScaler()}

In [62]:
# Keep a dedicated handle on the winning pipeline found by the grid search so
# it stays reachable even if `model` is rebound later in the notebook.
best_model = model.best_estimator_
# Bare name as the last expression: renders the pipeline in the cell output.
best_model

In [16]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error

In [63]:
# Score the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

# Each metric takes (y_true, y_pred); evaluate them in a fixed order so the
# unpacking below lines up with MSE, RMSE and MAE respectively.
mse, rmse, mae = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, root_mean_squared_error, mean_absolute_error)
)

for label, value in (('MSE', mse), ('RMSE', rmse), ('MAE', mae)):
    print(f'{label}: {value}')

MSE: 3321852925.8724313
RMSE: 57635.51791970322
MAE: 39196.122573734254


In [65]:
import tensorflow as tf

In [69]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

input_size = X_train.shape[1]

model = Sequential(name='CaliforniaHousing')
model.add(layers.Input(shape=(input_size,), name='input'))
model.add(layers.Dense(20, activation='relu', name='hidden1'))
model.add(layers.Dense(10, activation='relu', name='hidden2'))
model.add(layers.Dense(1, activation='linear', name='output'))

model.summary()

In [70]:
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

model.compile(
    optimizer=optimizers.Adam(),
    loss=losses.MeanSquaredError(),
    metrics=[
        metrics.MeanSquaredError(),
        metrics.RootMeanSquaredError()
    ]
)

In [71]:
from tensorflow.keras import callbacks

cb = [
    callbacks.EarlyStopping(
        monitor='val_mean_squared_error',
        mode='min',
        restore_best_weights=True,
        patience=10
    )
]

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=64,
    verbose=2,
    callbacks=cb,
    validation_data=(X_test, X_test)
)

ValueError: could not convert string to float: '<1H OCEAN'