In [None]:
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
set_config(transform_output='polars')

def tweak_housing(df):
    return (df
            .with_columns(zipcode=pl.col('zipcode').cast(pl.String).cast(pl.Categorical),
                          date=pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day')),
                          yr_renovated=pl.col('yr_renovated').replace(0, None),
                          )
            .select(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 
                     'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 
                     'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 
                     'sqft_lot15', 'date',  #'date_year', 'date_month', 'date_day', 
                     ])
    )

# make the pipeline
numeric_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 
                    'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 
                    'lat', 'long', 'sqft_living15', 'sqft_lot15', 'zip_mean']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['zipcode']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore',
                              sparse_output=False), categorical_features)])

def to_pandas(df):
    return df.to_pandas()
pandas_transformer = FunctionTransformer(to_pandas)

tweak_transformer = FunctionTransformer(tweak_housing)

class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        # assume X is a polars dataframe
        self.zip_avg_price = (X
                              .group_by('zipcode')
                              .agg(zip_mean=pl.col('price').mean())
        )
        return self
    
    def transform(self, X, y=None):
        with pl.StringCache():
            return X.join(self.zip_avg_price, on='zipcode')


# King County House Sales dataset from OpenML (includes Seattle)
# this is an ARFF file, which is a text file with a specific format
url = 'https://www.openml.org/data/download/22044765/dataset'
cols = ['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 
        'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
        'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']

raw = pl.read_csv(url, new_columns=cols, skip_rows=31, has_header=False)


    

  return X.join(self.zip_avg_price, on='zipcode')


0.806993671862066

## Dummy Regressor

In [3]:
from sklearn.dummy import DummyRegressor

dummy = DummyRegressor(strategy='mean')
y = raw.select('price')
X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)
dummy_pipe = Pipeline(steps=[('tweak', tweak_transformer),
                      ('zip_avg_price', ZipAvgPriceAdder()),
                      ('preprocessor', preprocessor),
                      ('dummy', dummy),
                      ])

dummy_pipe.fit(X_train, y_train)
dummy_pipe.score(X_test, y_test)

  return X.join(self.zip_avg_price, on='zipcode')


-0.000889991449168015

## LinearRegression

In [None]:
lr =  LinearRegression()
y = raw.select('price')
X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)
lr_pipe = Pipeline(steps=[('tweak', tweak_transformer),
                      ('zip_avg_price', ZipAvgPriceAdder()),
                      ('preprocessor', preprocessor),
                      ('lr', lr),
                      ])

lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)

## RandomForestRegressor

In [2]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=9)
y = raw.select('price')
X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)
rf_pipe = Pipeline(steps=[('tweak', tweak_transformer),
                          ('zip_avg_price', ZipAvgPriceAdder()),
                          ('preprocessor', preprocessor),
                          ('rf', rf),])

rf_pipe.set_params(rf__n_estimators=10)
rf_pipe.fit(X_train, y_train)
rf_pipe.score(X_test, y_test)

  return fit_method(estimator, *args, **kwargs)
  return X.join(self.zip_avg_price, on='zipcode')


0.8506932617272662

## DecisionTreeRegressor

In [4]:
from sklearn.tree import DecisionTreeRegressor


dt = DecisionTreeRegressor()
y = raw.select('price')
X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)
dt_pipe = Pipeline(steps=[('tweak', tweak_transformer),
                      ('zip_avg_price', ZipAvgPriceAdder()),
                      ('preprocessor', preprocessor),
                      ('dt', dt),
                      ])

dt_pipe.set_params(dt__max_depth=9)
dt_pipe.fit(X_train, y_train)
dt_pipe.score(X_test, y_test)

  return X.join(self.zip_avg_price, on='zipcode')


0.7624328684956283

## CatBoostRegressor

In [10]:
from catboost import CatBoostRegressor


cat = CatBoostRegressor()
# has issues with Polars input going to use a pandas_transformer
def to_pandas(df):
    return df.to_pandas()
pandas_transformer = FunctionTransformer(to_pandas)

y = raw.select('price')

X_train, X_test, y_train, y_test = train_test_split(raw, y, test_size=0.2, random_state=42)
cat_pipe = Pipeline(steps=[('tweak', tweak_transformer),
                      ('zip_avg_price', ZipAvgPriceAdder()),
                      ('preprocessor', preprocessor),
                      ('to_pandas', pandas_transformer),
                      ('cat', cat), 
                      ])

cat_pipe.fit(X_train, y_train.to_numpy()[:,0])
cat_pipe.score(X_test, y_test.to_numpy()[:,0])



Learning rate set to 0.064232
0:	learn: 346982.2594620	total: 52.5ms	remaining: 52.4s
1:	learn: 332378.5598059	total: 59.4ms	remaining: 29.6s
2:	learn: 319502.6617458	total: 68.9ms	remaining: 22.9s
3:	learn: 307490.7902475	total: 72.5ms	remaining: 18s
4:	learn: 294940.3225791	total: 76ms	remaining: 15.1s
5:	learn: 283907.9796375	total: 79.8ms	remaining: 13.2s
6:	learn: 273592.5548762	total: 83.5ms	remaining: 11.8s
7:	learn: 263472.9134855	total: 87.4ms	remaining: 10.8s
8:	learn: 254884.3331936	total: 92.5ms	remaining: 10.2s
9:	learn: 246261.9209853	total: 96ms	remaining: 9.51s
10:	learn: 238183.5765242	total: 99.5ms	remaining: 8.94s
11:	learn: 230600.1564374	total: 103ms	remaining: 8.49s
12:	learn: 223779.4367347	total: 107ms	remaining: 8.11s
13:	learn: 217021.8170437	total: 113ms	remaining: 7.96s
14:	learn: 210875.7711670	total: 119ms	remaining: 7.79s
15:	learn: 205122.1910805	total: 122ms	remaining: 7.51s
16:	learn: 199643.2642325	total: 126ms	remaining: 7.29s
17:	learn: 194468.66237

  return X.join(self.zip_avg_price, on='zipcode')


0.9046473600451286

# Evaluation

## R2 Score

In [None]:
from sklearn.metrics import r2_score

# R2 score of Linear Regression
r2_score(y_test, lr_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


0.806993671862066

In [None]:
# R2 score of Randon Forest Regressor
r2_score(y_test, rf_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


0.8506932617272662

In [None]:
# R2 score of Decision Tree Regressor
r2_score(y_test, dt_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


0.7624328684956283

In [None]:
# R2 score of CatBoost Regressor
r2_score(y_test, cat_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


0.9046473600451286

## Mean Squared/Absolute Error

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [13]:
mean_squared_error(y_test, lr_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


29178062132.660885

In [14]:
mean_absolute_error(y_test, lr_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


98981.6365949572

In [15]:
mean_squared_error(y_test, rf_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


22571701809.86688

In [16]:
mean_absolute_error(y_test, rf_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


81087.44508357158

In [17]:
mean_squared_error(y_test, dt_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


35914617881.1232

In [18]:
mean_absolute_error(y_test, dt_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


94279.4279221626

In [19]:
mean_squared_error(y_test, cat_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


14415098613.389244

In [20]:
mean_absolute_error(y_test, cat_pipe.predict(X_test))

  return X.join(self.zip_avg_price, on='zipcode')


64449.15608948771