In [51]:

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score

In [52]:
url = r"..\data\raw\clean_house.csv"
house = pd.read_csv(url, sep=",")

In [53]:
x = house.drop(["price", "locality_name", "latitude", "longitude", "furnished", "property_id", "number_of_rooms"], axis=1)
y = house[["price"]]

In [54]:
x.head()

Unnamed: 0,postal_code,property_type,property_subtype,type_of_sale,living_area,kitchen_type,fully_equipped_kitchen,open_fire,terrace,terrace_area,garden,garden_area,surface_of_good,number_of_facades,swimming_pool,state_of_building,main_city,province
0,9600,HOUSE,HOUSE,BUY_REGULAR,146.0,INSTALLED,1.0,0,1.0,13.0,1.0,161.0,250.0,2.0,0.0,GOOD,ronse,oost-vlaanderen
1,9820,HOUSE,HOUSE,BUY_REGULAR,126.0,INSTALLED,1.0,0,1.0,23.0,,,77.0,2.0,,GOOD,merelbeke,oost-vlaanderen
2,4870,HOUSE,HOUSE,BUY_REGULAR,159.0,INSTALLED,1.0,0,1.0,16.0,1.0,420.0,572.0,3.0,,AS_NEW,trooz,luik
3,1340,HOUSE,HOUSE,BUY_REGULAR,141.0,USA_HYPER_EQUIPPED,1.0,0,1.0,37.0,1.0,76.0,165.0,2.0,,GOOD,ottignies-louvain-la-neuve,waals-brabant
4,2170,HOUSE,MIXED_USE_BUILDING,BUY_REGULAR,210.0,INSTALLED,1.0,0,,,,,65.0,2.0,0.0,GOOD,antwerpen,antwerpen


In [55]:
nfeatuers = ["living_area", "terrace_area", "garden_area", "surface_of_good"]
cfeatures = ["postal_code", "property_type", "property_subtype", "type_of_sale", "kitchen_type", "fully_equipped_kitchen", "open_fire", "terrace", "garden", "number_of_facades", "swimming_pool", "state_of_building", "main_city", "province"]

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=41)

In [57]:
num_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan), StandardScaler(), PolynomialFeatures(degree=3))
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent", missing_values=np.nan),OneHotEncoder(handle_unknown='ignore'))

In [58]:
preprocessor = make_column_transformer((num_pipeline, nfeatuers), (cat_pipeline, cfeatures))
b = BaggingRegressor(n_estimators=150) 

In [59]:
model = make_pipeline(preprocessor, b)
model.fit(x_train, y_train)

  return column_or_1d(y, warn=True)


In [60]:
y_pred = model.predict(x_test)
print('Mean squared error : ' + str(mean_squared_error(y_test,y_pred)))
print('Mean absolute error : ' + str(mean_absolute_error(y_test,y_pred)))
print('Explained vaiance score : ' + str(explained_variance_score(y_test,y_pred)))
print('r2 score : ' + str(r2_score(y_test,y_pred)))

Mean squared error : 39109979372.53452
Mean absolute error : 98626.31636641553
Explained vaiance score : 0.753624041251576
r2 score : 0.7532298743318143


In [62]:
x_train.head()
house_data = {'postal_code' :[1060],'property_type' :["HOUSE"],'property_subtype' :["HOUSE"],'type_of_sale':["BUY_REGULAR"],'living_area':["220.0"],'kitchen_type':["SEMI_EQUIPPED"],'fully_equipped_kitchen':[1.0],'open_fire':[0],'terrace':[0],'terrace_area':[50.0],'garden':[1.0],'garden_area':[100.0],'surface_of_good':[218.0],'number_of_facades':[4],'swimming_pool':[1.0],'state_of_building':['GOOD'],'main_city':['wevelgem'], "province":["west-vlaanderen"]}
test_df = pd.DataFrame(house_data)

In [63]:
y_pred = model.predict(test_df)
print(y_pred)

[376719.4]
