In [1]:
import pickle
import pandas as pd

import numpy as np

In [2]:
# NOTE: despite the '.npy' extension, this file is read with pickle rather than
# np.load. Unpickling can execute arbitrary code, so only load a file you trust.
with open('X.npy', mode='rb') as dataset_file:
    X, y, column_names = pickle.load(dataset_file)

In [3]:
# Inspect the three unpickled objects: feature array, target array, column names.
X, y, column_names

(array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   2.0804    ,   42.        ,    4.29411765, ...,    2.02689076,
           37.84      , -122.26      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 array([ 1.50983855,  1.27675847,  0.81845737, ..., -0.08012604,
        -0.16605458, -0.1120495 ]),
 ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'])

In [4]:
# Wrap the raw feature array in a DataFrame so columns can be selected by name
# later on; note that this rebinds `X` from an ndarray to a DataFrame.
X = pd.DataFrame(data=X, columns=column_names)

In [5]:
# Preview the feature DataFrame (pandas truncates the display of long frames).
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
3,2.1250,50.0,4.242424,1.071970,697.0,2.640152,37.85,-122.26
4,1.9911,50.0,5.343675,1.085919,990.0,2.362768,37.84,-122.26
...,...,...,...,...,...,...,...,...
18218,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
18219,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
18220,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
18221,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [6]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of the rows as the final test set. Despite its name,
# `X_train` is the full development set (it is split again right below and is
# also what the grid search is fitted on).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second split: carve a validation set out of the development set. Naming
# caveat: `X_train_val` / `y_train_val` are the rows used for *training* in the
# quick validation experiment, not "train + validation".
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Relembrando KMeans

In [7]:
from sklearn.cluster import KMeans

In [8]:
# Warm-up: cluster the training rows into 10 groups using every feature column
# as-is. The `clusterer` name is rebound to a different KMeans further down.
clusterer = KMeans(random_state=42, n_clusters=10)

clusterer.fit(X=X_train_val)

## Construindo um regressor

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, PolynomialFeatures,
                                   StandardScaler)

In [34]:
# Create a pipeline to process the latitudes and longitudes.
def exp_neg(x):
    """Return ``exp(-x)`` computed with NumPy.

    Args:
        x: A number or array supporting unary minus (e.g. an ndarray).

    Returns:
        ``np.exp(-x)``; for array input the exponential is applied
        element-wise, so 0 maps to 1 and large positive values approach 0.
    """
    negated = -x
    return np.exp(negated)

# Lat/long branch: standardise the coordinates, run KMeans on them, then apply
# exp(-x) to whatever the clustering step emits. As an intermediate pipeline
# step KMeans contributes its transform() output (distances to the cluster
# centres, per the scikit-learn docs), so a nearby centre yields a value near 1.
scaler = StandardScaler()
clusterer = KMeans(random_state=42, n_clusters=50)
inverter = FunctionTransformer(func=exp_neg)

pipeline_latlong = Pipeline(steps=[
    ('scaler', scaler),
    ('clusterer', clusterer),
    ('inverter', inverter),
])

In [35]:
# Branch for the non-coordinate columns: standardise, then expand into
# polynomial terms (degree 2, no constant column). Note that `scaler` is
# rebound here to a fresh StandardScaler, separate from the lat/long one.
scaler = StandardScaler()
poly = PolynomialFeatures(include_bias=False, degree=2)

pipeline_remaining = Pipeline(steps=[('scaler', scaler), ('poly', poly)])

In [36]:
# Route each group of columns through its own sub-pipeline: the two coordinate
# columns go to `pipeline_latlong`, the six other features to
# `pipeline_remaining`. Together the two lists cover all eight columns of X.
col_transform = ColumnTransformer(transformers=[
    ('latlong', pipeline_latlong, ['Latitude', 'Longitude']),
    (
        'remaining',
        pipeline_remaining,
        ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup'],
    ),
])

In [37]:
# End-to-end estimator: the column-wise feature engineering feeds a Ridge
# regressor configured with alpha=0.1.
pipeline = Pipeline(steps=[
    ('col_transform', col_transform),
    ('model', Ridge(alpha=0.1)),
])

In [38]:
# Display the assembled pipeline (it has not been fitted yet at this point).
pipeline

In [39]:
from sklearn.metrics import root_mean_squared_error

# Train on the inner training subset only.
pipeline.fit(X_train_val, y_train_val)

# Score on the held-out validation rows; the RMSE is in the units of `y`.
y_pred = pipeline.predict(X_val)
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse


np.float64(0.2287871621552116)

In [40]:
# Express the validation RMSE as a percentage via exp(rmse) - 1.
# NOTE(review): this reading assumes `y` is on a natural-log scale — confirm.
(np.exp(rmse) - 1) * 100

np.float64(25.707445751683867)

In [41]:
# Display the pipeline again, now after it was fitted on the training subset.
pipeline

In [42]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; each key is a `step__param` path into the nested pipeline.
# 4 alphas x 3 cluster counts x 4 degrees = 48 candidates.
# NOTE(review): the recorded best parameters sit on the edge of this grid
# (largest n_clusters, smallest alpha), so a wider range may be worth exploring.
param_grid = {
    'model__alpha': [10**exponent for exponent in range(-4, 0)],
    'col_transform__latlong__clusterer__n_clusters': [500, 1000, 2000],
    'col_transform__remaining__poly__degree': [1, 2, 3, 4],
}


In [43]:
# 5-fold cross-validated search over `param_grid` with n_jobs=-1 (all cores).
# Scores are negated RMSE, so values closer to zero are better; training-fold
# scores are recorded as well.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

In [44]:
# Run the search on the whole development set (`X_train`), which also contains
# the rows that served as `X_val` in the earlier single-split experiment.
grid.fit(X_train, y_train)

In [45]:
# Hyper-parameter combination selected by the search.
grid.best_params_

{'col_transform__latlong__clusterer__n_clusters': 2000,
 'col_transform__remaining__poly__degree': 3,
 'model__alpha': 0.0001}

In [46]:
# Cross-validated score of the selected candidate; it is a negated RMSE because
# the search was configured with scoring='neg_root_mean_squared_error'.
grid.best_score_

np.float64(-0.19032118831905254)

In [47]:
# Flip the sign back to a positive RMSE and express it as a percentage.
# NOTE(review): this reading assumes `y` is on a natural-log scale — confirm.
(np.exp(-grid.best_score_) - 1) * 100

np.float64(20.963805688378145)

In [48]:
# Keep a handle on the winning pipeline. This is the same object that `grid`
# holds (not a copy), so refitting `best_model` later also changes
# `grid.best_estimator_`.
best_model = grid.best_estimator_

In [49]:
# Display the selected pipeline.
best_model

In [50]:
# NOTE(review): likely redundant. `grid` was built without a `refit` argument,
# and GridSearchCV's documented default (refit=True) already refits the best
# estimator on the full data given to grid.fit, i.e. this same X_train/y_train.
# Confirm and consider dropping this (expensive) second fit.
best_model.fit(X_train, y_train)

In [51]:
# Predict on the held-out test set; this overwrites the validation-set `y_pred`.
y_pred = best_model.predict(X_test)

In [52]:
from sklearn.metrics import root_mean_squared_error

# Final hold-out evaluation. This overwrites the validation `rmse` computed
# earlier in the notebook.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse

np.float64(0.1846409759231252)

In [53]:
# Express the test RMSE as a percentage via exp(rmse) - 1.
# NOTE(review): this reading assumes `y` is on a natural-log scale — confirm.
(np.exp(rmse) - 1) * 100

np.float64(20.278653327506866)

In [54]:
# Refit in place on every row, test set included, before persisting the model.
# The test RMSE reported above was measured on the X_train-only fit, so it is
# an estimate for this final model rather than a measurement of it.
best_model.fit(X, y)

In [55]:
import joblib

# Persist the fitted pipeline to disk.
# NOTE: the pipeline embeds `exp_neg` through a FunctionTransformer, and
# pickle-based serialisers store functions by reference, so whoever loads
# 'model.pkl' must have an `exp_neg` function importable from the same module
# path it had here.
joblib.dump(value=best_model, filename='model.pkl')

['model.pkl']