In [2]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

In [3]:
import sys
import os
from pathlib import Path
# Project root that contains the `config` package. Set the ZILLOW_ROOT environment
# variable to point at your checkout instead of editing this cell; when it is unset
# the original default location is used.
path = Path(os.environ.get('ZILLOW_ROOT', 'S:/Projects/Zillow'))
# Register the root only once so re-running this cell does not pile up duplicate
# sys.path entries.
root_dir = os.path.abspath(path)
if root_dir not in sys.path:
    sys.path.append(root_dir)
from config.config import Config

In [17]:
# Load the processed dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    Config.PROCESSED_DATA_FILE,
    index_col=0,
)

In [18]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,region,state,metro,county,year,month,timestamp,median_price
0,New York,NY,New York-Newark-Jersey City,Queens County,2010,1,2010-01-01,
1,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,2010,1,2010-01-01,
2,Houston,TX,Houston-The Woodlands-Sugar Land,Harris County,2010,1,2010-01-01,
3,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2010,1,2010-01-01,
4,San Antonio,TX,San Antonio-New Braunfels,Bexar County,2010,1,2010-01-01,


In [19]:
# Keep only rows that have no missing value in any column. The result is rebound
# rather than using inplace=True: the in-place form returns None, cannot be chained,
# and offers no performance benefit.
df = df.dropna()

In [20]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(109800, 8)

In [21]:
# Features are the first six columns of the frame, taken as a plain array (column
# names are dropped, so later steps address them by position); the target is
# `median_price`.
feature_matrix = df.iloc[:, :6].values
target_vector = df['median_price'].values

# Hold out 20% of the rows for evaluation with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.2,
    random_state=40,
)

In [22]:
# Column positions index into the positional feature array handed to the pipeline,
# so they must stay in sync with the order in which the feature columns are selected.
transformer = ColumnTransformer(transformers=[
    # Positions 4 and 5 are rescaled to the [0, 1] range.
    ('num', MinMaxScaler(), [4, 5]),
    # Positions 0-3 are one-hot encoded. handle_unknown='ignore' encodes a category
    # that was not seen during fit as all zeros instead of raising at predict time
    # (e.g. a rare value that only landed in the held-out split).
    ('cat', OneHotEncoder(handle_unknown='ignore'), [0, 1, 2, 3])
])

In [23]:
# Chain preprocessing and the regressor into one estimator so the identical
# transformations are applied at fit time and at predict time.
pipeline = Pipeline([
    ('transform', transformer),
    # random_state pins the forest's bootstrap sampling and feature selection so a
    # Restart & Run All reproduces the same model and metrics; 40 matches the seed
    # used for the train/test split.
    ('model', RandomForestRegressor(n_jobs=6, random_state=40))
])

In [24]:
# Fit the preprocessing steps and the regressor on the training split; the fitted
# pipeline is the cell's displayed value.
pipeline.fit(X=x_train, y=y_train)

In [25]:
# Predict the target for the held-out rows.
y_pred = pipeline.predict(X=x_test)

In [29]:
# Score the held-out predictions; displayed as (R^2, MSE, MAPE).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
(r2, mse, mape)

(0.9943992770909198, 12022.638719640508, 0.026630787660043467)

In [None]:
# NOTE(review): unfinished cell. `joblib` is not imported in any visible cell, so this
# line raises NameError on a fresh kernel; even with the import it only references the
# function without calling it. Presumably meant to persist the fitted `pipeline` —
# TODO confirm the target path and complete the call, e.g. joblib.dump(pipeline, <path>).
joblib.dump