# First regression tests


In [47]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


In [48]:
# Load the homes spreadsheet (path is relative to the notebook's directory)
# and preview the first rows to check that it parsed as expected.
dataset = pd.read_excel("../datasets/homes.xlsx")
dataset.head()


Unnamed: 0,propertyCode,thumbnail,externalReference,numPhotos,floor,price,propertyType,operation,size,exterior,...,hasPlan,has3DTour,has360,hasStaging,labels,topNewDevelopment,superTopHighlight,neighborhood,highlight,newDevelopmentFinished
0,97905810,https://img3.idealista.com/blur/WEB_LISTING/0/...,17-PV089,18,1.0,2400,flat,rent,135,1.0,...,False,False,False,False,"[{'name': 'seaViewsType', 'text': 'Vistas al m...",False,False,,,
1,85623229,https://img3.idealista.com/blur/WEB_LISTING/0/...,XC112-XCD,19,1.0,1400,flat,rent,75,,...,False,False,False,False,,False,False,Sant Pere - Santa Caterina i la Ribera,,
2,101135188,https://img3.idealista.com/blur/WEB_LISTING/0/...,123361,21,1.0,790,flat,rent,61,,...,False,False,False,False,,False,False,,,
3,101216078,https://img3.idealista.com/blur/WEB_LISTING/0/...,123360,22,3.0,950,flat,rent,103,,...,False,False,False,False,,False,False,,,
4,101976668,https://img3.idealista.com/blur/WEB_LISTING/0/...,W-02R4MF,52,,8000,chalet,rent,231,,...,False,False,False,False,"[{'name': 'luxuryType', 'text': 'Lujo'}, {'nam...",False,False,,,


In [49]:
# Inspect the dtype of every column to see which ones are already numeric.
dataset.dtypes


propertyCode                int64
thumbnail                  object
externalReference          object
numPhotos                   int64
floor                      object
price                       int64
propertyType               object
operation                  object
size                        int64
exterior                  float64
rooms                       int64
bathrooms                   int64
address                    object
province                   object
municipality               object
district                   object
country                    object
latitude                  float64
longitude                 float64
showAddress                  bool
url                        object
distance                    int64
description                object
hasVideo                     bool
status                     object
newDevelopment               bool
hasLift                   float64
parkingSpace               object
priceByArea                 int64
detailedType  

In [50]:
# Columns fed to the model as inputs.
FEATURES = ["size", "rooms", "bathrooms", "numPhotos"]

# Narrow the frame to the input columns plus the "price" target.
dataset = dataset.loc[:, [*FEATURES, "price"]]
dataset


Unnamed: 0,size,rooms,bathrooms,numPhotos,price
0,135,3,2,18,2400
1,75,1,1,19,1400
2,61,2,1,21,790
3,103,3,2,22,950
4,231,4,2,52,8000
...,...,...,...,...,...
246,220,4,3,58,4800
247,85,2,2,22,1000
248,95,4,1,30,1300
249,209,4,4,47,4800


In [51]:
# Feature matrix (one column per FEATURES entry) and the price target.
X = dataset[FEATURES]
y = dataset["price"]


In [52]:
# XGBoost regressor with n_estimators=1000; every other hyperparameter is
# left at the library default.
# NOTE(review): no early stopping or validation set is configured here; with
# a small training set this many boosting rounds may overfit -- confirm.
model = XGBRegressor(n_estimators=1000)
model


In [53]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the split (and every metric derived from it) is
# reproducible across kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [54]:
# Sanity-check how many rows ended up in each partition.
print(f"Number of training instances: {len(X_train)}")
print(f"Number of testing instances: {len(X_test)}")


Number of training instances: 200
Number of testing instances: 51


In [56]:
# Fit on the training partition only; the test partition stays unseen.
model.fit(X_train, y_train)


In [32]:
# Predict on the held-out rows; the bare name displays the resulting array.
predictions = model.predict(X_test)
predictions


array([1220.6918 ,  761.0292 ,  994.327  , 1478.3866 , 4108.5967 ,
        636.0886 , 1283.2186 ,  900.26184, 2473.5334 , 1191.301  ,
       1217.269  ,  409.4774 , 1160.527  , 1312.1942 ,  581.25183,
        318.1572 , 1303.3783 , 5615.2197 , 1429.3212 , 1280.6962 ,
       1249.9872 ,  802.7616 , 1468.6549 ,  872.81   , 3575.641  ,
       1821.904  ,  489.59543,  733.5957 ,  510.41504, 1377.7427 ,
       1749.0034 , 1037.2559 , 5509.6426 , 6340.3843 , 1942.8004 ,
       1507.98   ,  889.84534, 1649.0702 ,  806.0301 ,  775.8562 ,
       4952.175  , 2600.187  , 2059.1106 , 2776.257  , 2198.1838 ,
       1558.8536 , 2243.4067 , 1207.1384 , 1174.4623 , 2871.158  ,
       1462.666  ], dtype=float32)

In [34]:
# Show the true target values for the held-out rows as a plain array, for
# side-by-side comparison with the predicted values.
np.array(y_test)


array([ 850,  750, 2500,  625, 2000, 2200,  600, 1550,  500,  750, 2100,
        525,  650, 1233,  800, 1100, 1510,  750,  700,  750, 1250,  710,
       3000,  800, 1100, 1300,  975,  750, 1100, 1750,  950,  700, 7500,
       1150,  680, 1000,  550, 1200,  900,  650, 4000, 3000, 1200, 2000,
        950,  650,  700,  800,  695, 1800, 2400])

In [59]:
def evaluate(y_test, y_pred):
    """Print and return the regression metrics for a set of predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned element-wise with ``y_test``.

    Returns:
        A ``(mse, rmse, r2)`` tuple: the mean squared error, its square
        root, and the R2 score. Each value is also printed as it is computed.
    """
    mean_sq_error = mean_squared_error(y_test, y_pred)
    print("MSE: ", mean_sq_error)

    # RMSE is derived from the MSE so it is in the same units as the target.
    root_mean_sq_error = np.sqrt(mean_sq_error)
    print("RMSE: ", root_mean_sq_error)

    r_squared = r2_score(y_test, y_pred)
    print("R2 Score: ", r_squared)

    return mean_sq_error, root_mean_sq_error, r_squared

# Score predictions computed right here from the current model and split,
# instead of relying on a `predictions` variable left over from an earlier
# cell run, which can silently be out of sync with `y_test` after the data
# has been re-split or the model re-fitted.
evaluate(np.array(y_test), model.predict(X_test))


MSE:  8133029.145653032
RMSE:  2851.8466202888667
R2 Score:  -0.2653255192489712


(8133029.145653032, 2851.8466202888667, -0.2653255192489712)