In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing snapshot from the notebook's input directory.
data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')

# 'Price' is the prediction target; every other column is a candidate feature.
y = data['Price']
X = data.drop(columns='Price')

In [2]:
# Keep 75% of the rows for training and hold out 25% for evaluation.
# random_state=0 is passed so the same split is produced on every run.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=0,
)

In [3]:
# Low-cardinality text columns (fewer than 10 distinct values) that will be
# one-hot encoded.
# The dtype test accepts both NumPy object columns and pandas' dedicated string
# dtypes, so text columns are still found when pandas does not store strings as
# 'object'. A bare `dtype == 'object'` comparison would silently yield an empty
# list in that case.
# The dtype test runs first so nunique() is only computed for text columns.
categorical_cols = [
    cname
    for cname in train_X.columns
    if (
        pd.api.types.is_object_dtype(train_X[cname].dtype)
        or pd.api.types.is_string_dtype(train_X[cname].dtype)
    )
    and train_X[cname].nunique() < 10
]

In [4]:
# Columns stored as 64-bit integers or floats, in their original column order.
numerical_cols = [
    col
    for col in train_X.columns
    if train_X[col].dtype in ('int64', 'float64')
]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Numerical columns: fill missing values with a constant.
# NOTE(review): no fill_value is given, so scikit-learn's default constant is
# used (documented as 0 for numeric data) -- confirm this is intended.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is set so categories that were
# not seen during fit do not raise at transform time.
# NOTE: the variable name is misspelled ("catergorical"); the spelling is kept
# because the name is referenced elsewhere in the notebook.
catergorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [6]:
# Combine the numerical and categorical preprocessing into one transformer.
# Each entry is (label, transformer, columns it is applied to). Columns that
# appear in neither list are not routed to any transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', catergorical_transformer, categorical_cols),
    ]
)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 estimators; random_state=0 is fixed so
# repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [8]:
# Create and Evaluate Pipeline

from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so both are fitted and applied as one unit.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split only, then predict on the held-out split.
my_pipeline.fit(train_X, train_y)
preds = my_pipeline.predict(test_X)

# Report the mean absolute error of the held-out predictions.
score = mean_absolute_error(test_y, preds)
print('MAE : ', score)

MAE :  163987.3804899362
