# Carregar os dados

In [1]:
import pickle
import pandas as pd

import numpy as np

In [2]:
# NOTE(review): despite the '.npy' name, this file is read with pickle, and
# unpickling can execute arbitrary code -- only load it from a trusted source.
with open('X.npy', 'rb') as f:
    dataset = pickle.load(f)

# The pickled payload unpacks into exactly three objects.
X, y, column_names = dataset

In [3]:
# Echo the three loaded objects as one tuple so the cell renders them.
X, y, column_names

(array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   2.0804    ,   42.        ,    4.29411765, ...,    2.02689076,
           37.84      , -122.26      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 array([ 1.50983855,  1.27675847,  0.81845737, ..., -0.08012604,
        -0.16605458, -0.1120495 ]),
 ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'])

In [4]:
# Wrap the raw feature array in a DataFrame so columns can be selected by name.
X = pd.DataFrame(data=X, columns=column_names)

In [5]:
# Render the labelled feature table.
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
3,2.1250,50.0,4.242424,1.071970,697.0,2.640152,37.85,-122.26
4,1.9911,50.0,5.343675,1.085919,990.0,2.362768,37.84,-122.26
...,...,...,...,...,...,...,...,...
18218,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
18219,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
18220,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
18221,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


# Listar os modelos a serem testados

In [6]:
# Registry of candidate models: display name -> estimator (filled in below).
modelos = dict()

## Modelo: Uma regressão linear simples

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline candidate: a plain linear regression on the features as they are.
modelos['LinearRegression'] = model = LinearRegression()
model

## Modelos: Ridge com alphas 0.1, 1, 10, 100

In [8]:
from IPython.display import display

In [9]:
from sklearn.linear_model import Ridge

# One Ridge candidate per regularization strength, registered as 'Ridge_<alpha>'.
ridge_alphas = (0.1, 1, 10, 100)

for alpha in ridge_alphas:
    modelos[f'Ridge_{alpha}'] = model = Ridge(alpha=alpha)
    display(model)

## Modelo: Regressão linear simples com features polinomiais de grau 2

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Standardize, expand to degree-2 polynomial terms (bias column omitted),
# then fit a plain linear regression on the expanded features.
poly_steps = [
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=poly_steps)

modelos['PolynomialFeatures'] = pipeline
display(pipeline)

## Modelo: Regressão linear simples com log na população

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# log1p is applied to 'Population' only; every other column passes through.
log_transformer = FunctionTransformer(func=np.log1p)

population_log = ('log', log_transformer, ['Population'])
col_transform = ColumnTransformer(
    transformers=[population_log],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Column preprocessing followed by a plain linear regression.
pipeline = Pipeline(steps=[
    ('col_transform', col_transform),
    ('model', LinearRegression()),
])

modelos['log_Population'] = pipeline
display(pipeline)

## Modelo: Regressão linear simples com clustering nas features Latitude e Longitude.

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

In [30]:
# Geographic sub-pipeline: standardize the coordinates, then run k-means on them.
# NOTE(review): used as a transformer, KMeans should emit one
# distance-to-centroid column per cluster (see KMeans.transform) -- confirm.
N_GEO_CLUSTERS = 1000

scaler = StandardScaler()
clusterer = KMeans(n_clusters=N_GEO_CLUSTERS, random_state=42)

latlong_steps = [('scaler', scaler), ('clusterer', clusterer)]
pipeline_latlong = Pipeline(steps=latlong_steps)

In [31]:
# Column-wise preprocessing: the coordinates go through `pipeline_latlong`,
# 'Population' goes through log1p, and the remaining columns pass through.
log_transformer = FunctionTransformer(func=np.log1p)

column_rules = [
    ('latlong', pipeline_latlong, ['Latitude', 'Longitude']),
    ('log', log_transformer, ['Population']),
]
col_transform = ColumnTransformer(
    transformers=column_rules,
    remainder='passthrough',
    force_int_remainder_cols=False,
)

In [32]:
# Full model: column preprocessing followed by a plain linear regression.
final_steps = [
    ('col_transform', col_transform),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=final_steps)

In [33]:
# Add the clustering pipeline to the candidate registry.
modelos.update({'clustering': pipeline})

In [34]:
# Render the pipeline currently bound to `pipeline`.
display(pipeline)

In [35]:
# Echo the full registry of candidate models.
modelos

{'LinearRegression': LinearRegression(),
 'Ridge_0.1': Ridge(alpha=0.1),
 'Ridge_1': Ridge(alpha=1),
 'Ridge_10': Ridge(alpha=10),
 'Ridge_100': Ridge(alpha=100),
 'PolynomialFeatures': Pipeline(steps=[('scaler', StandardScaler()),
                 ('poly', PolynomialFeatures(include_bias=False)),
                 ('model', LinearRegression())]),
 'log_Population': Pipeline(steps=[('col_transform',
                  ColumnTransformer(force_int_remainder_cols=False,
                                    remainder='passthrough',
                                    transformers=[('log',
                                                   FunctionTransformer(func=<ufunc 'log1p'>),
                                                   ['Population'])])),
                 ('model', LinearRegression())]),
 'clustering': Pipeline(steps=[('col_transform',
                  ColumnTransformer(force_int_remainder_cols=False,
                                    remainder='passthrough',
                  

# Nível 2: Escolha de modelo

In [36]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Both splits hold out 20% of their input and share the same seed.
split_options = dict(test_size=0.2, random_state=42)

# Outer split: X_test / y_test are set aside from the full data.
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Inner split of the training portion: the *_test_val part is what the
# candidate models are scored on during model selection.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_train, y_train, **split_options
)


In [37]:
from sklearn.metrics import root_mean_squared_error

# Reset the whole selection state up front, so that re-running this cell can
# never leave `best_model` / `best_model_name` pointing at a stale winner from
# an earlier kernel state.
best_rmse = float('inf')
best_model = None
best_model_name = None

# Validation RMSE of every candidate, kept for later inspection.
rmse_por_modelo = {}

for name, model in modelos.items():
    # Each candidate is fitted in place on the inner training split and
    # scored on the inner validation split.
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test_val)
    rmse = root_mean_squared_error(y_test_val, y_pred)
    rmse_por_modelo[name] = rmse
    print(name, rmse)
    # Strict '<' keeps the first candidate on ties; a NaN score never wins.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_model_name = name

# Fail loudly here instead of with a NameError (or a stale model) downstream
# when the registry is empty or no candidate produced a comparable score.
if best_model is None:
    raise RuntimeError(
        'No model was selected: `modelos` is empty or every RMSE was NaN.'
    )

LinearRegression 0.29660470865850036
Ridge_0.1 0.29660512584056853
Ridge_1 0.2966090815857463
Ridge_10 0.29666487689941407
Ridge_100 0.29754335790273756
PolynomialFeatures 0.27034523706110963
log_Population 0.29636036481046696
clustering 0.19247839720576654


In [38]:
# Echo the winning estimator together with its registry name.
best_model, best_model_name

(Pipeline(steps=[('col_transform',
                  ColumnTransformer(force_int_remainder_cols=False,
                                    remainder='passthrough',
                                    transformers=[('latlong',
                                                   Pipeline(steps=[('scaler',
                                                                    StandardScaler()),
                                                                   ('clusterer',
                                                                    KMeans(n_clusters=1000,
                                                                           random_state=42))]),
                                                   ['Latitude', 'Longitude']),
                                                  ('log',
                                                   FunctionTransformer(func=<ufunc 'log1p'>),
                                                   ['Population'])])),
                 ('model', LinearRegre

# Nível 1: Certificação

In [39]:
# Refit the selected model on the whole training split, then score it on the
# held-out test split.
y_pred = best_model.fit(X_train, y_train).predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(rmse)

0.1983495483577765


In [40]:
# Express the RMSE as a percentage via exp(rmse) - 1.
# NOTE(review): this reads as a relative error only if y is on a natural-log
# scale -- confirm against how the target was built.
100*(np.exp(rmse) - 1)

np.float64(21.938855460215567)

# Nível 0: treino final e deploy

In [41]:
# Final training run: refit the selected model on all of X and y.
best_model.fit(X, y)

In [42]:
import joblib

# Persist the fitted model to disk; the call returns the list of written files.
joblib.dump(value=best_model, filename='model.pkl')

['model.pkl']

Desafios:

- Um modelo que usa o KMeans para clusterizar latitude e longitude DENTRO DE UMA PIPELINE.
- Usar o `GridSearchCV` para achar o melhor modelo.