<a href="https://colab.research.google.com/github/sspitz3/ml-practice/blob/main/homl/housing_prices/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Housing Prices Example

## Load Data

In [3]:
import requests
import tarfile
import pandas as pd


def load_data(url):
  r = requests.get(url)
  with open("rawdata.tgz", "wb") as f:
    f.write(r.content)
  with tarfile.open("rawdata.tgz", "r") as f:
    f.extractall("datasets")

  return pd.read_csv("datasets/housing/housing.csv")


data = load_data("https://github.com/ageron/data/raw/main/housing.tgz")

## Splitting

In [28]:
from sklearn.model_selection import train_test_split

X = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Preprocessing

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import numpy as np


log_pipeline = make_pipeline(SimpleImputer(strategy='median'), FunctionTransformer(np.log), StandardScaler())
ratio_pipeline = make_pipeline(SimpleImputer(strategy="median"), FunctionTransformer(lambda x: x[:, [0]] / x[:, [1]]), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder())

preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline, ["total_bedrooms", "households"]),
    ("rooms_per_house", ratio_pipeline, ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline, ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
    ("cat", cat_pipeline, ["ocean_proximity"])
])

In [29]:
preprocessing.fit_transform(X_train)

array([[-0.10655541,  0.24193354, -0.06077974, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.09215342,  0.17140969,  0.06973988, ...,  0.        ,
         0.        ,  0.        ],
       [-0.19714243,  0.2417236 , -0.07156244, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.74957692,  0.5869686 , -0.03089537, ...,  0.        ,
         0.        ,  0.        ],
       [-0.01214683,  0.04270974, -0.08670236, ...,  0.        ,
         1.        ,  0.        ],
       [-0.3087774 , -0.31326269,  0.00959688, ...,  0.        ,
         0.        ,  0.        ]])

## Fit Models

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_reg = make_pipeline(preprocessing, LinearRegression())

lin_reg.fit(X_train, y_train)

mean_squared_error(lin_reg.predict(X_train), y_train, squared=False)

73489.28506854447

In [48]:
lin_reg.predict(X_train)

array([273239.7607379 , 107601.65794956, 359428.42364554, ...,
       150771.61134137, 327999.21247288, 255267.20455991])

In [58]:
from sklearn.model_selection import cross_val_score

cross_val_score(lin_reg, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3)

array([-74662.50779645, -78961.91799587, -72331.65274316])

In [59]:
lin_reg.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('bedrooms',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('functiontransformer',
                                                     FunctionTransformer(func=<function <lambda> at 0x7d28bb714e50>)),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['total_bedrooms', 'households']),
                                   ('rooms_per_house',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median'...
                                                     SimpleImputer(strategy='median')),
                                