# Modeling our price estimator and Similarity finder

## Price estimator

Read the scrapped data from the scraper dir

In [4]:
import pandas as pd
data = pd.read_csv("test_data.csv",header=0,index_col=False)

### Trasform the data and add one-hot encoding

In [34]:
data.rename({'price':'label'}, axis=1, inplace=True)
data['region'] = data['region'].apply(lambda x:  
                                      x.replace(" ", "") \
                                      .replace("/", "-"))
data

Unnamed: 0,region,label,price_per_size,size,rooms,rent
0,Östermalm-Gärdet,2775000,119099,23.3,1.0,1071
1,VästraKungsholmen,2025000,117733,17.2,1.0,817
2,Kärrtorp,1910000,68214,28.0,1.0,1689
3,Gärdet-Östermalm,2650000,101923,26.0,1.0,1324
4,Östermalm,2570000,135263,19.0,1.0,1077
...,...,...,...,...,...,...
19642,Östermalm,6150000,111818,55.0,2.0,1667
19643,NackaStrand,5050000,60843,83.0,2.5,4777
19644,NorraDjurgårdsstaden,5525000,85000,65.0,3.0,3179
19645,Vasastan-Atlas,5550000,88095,63.0,2.0,2955


In [35]:
## One-hot encoding
regions = pd.DataFrame(data, columns=['region'])
dum_df = pd.get_dummies(regions, columns=["region"])
data_one_hot = data.join(dum_df)
data_one_hot = data_one_hot.drop(columns=['region'])
data_one_hot

Unnamed: 0,label,price_per_size,size,rooms,rent,region_Abrahamsberg,region_Abrahamsberg-Åkeslund,region_Akalla,region_Akallahöjden,region_Alby,...,region_Östermalm-vidÖstermalmstorg,region_ÖstermalmGärdet,region_ÖstermalmHjorthagen,region_ÖstermalmKarlaplan,region_ÖstermalmNedreGärdet,region_ÖstermalmVasastan,region_ÖstermalmnäraStrandvägen-Kaptensgatan14,region_Östertälje,region_ÖstraOrminge,region_ÖvreGärdet
0,2775000,119099,23.3,1.0,1071,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2025000,117733,17.2,1.0,817,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1910000,68214,28.0,1.0,1689,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2650000,101923,26.0,1.0,1324,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2570000,135263,19.0,1.0,1077,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19642,6150000,111818,55.0,2.0,1667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19643,5050000,60843,83.0,2.5,4777,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19644,5525000,85000,65.0,3.0,3179,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19645,5550000,88095,63.0,2.0,2955,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Making the model

In [69]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

house_data = data_one_hot.drop(columns=['label'])
lables = pd.DataFrame(data, columns=['label'])

X_train, X_test, Y_train, Y_test = train_test_split(house_data, lables, train_size=0.90,test_size=0.10, random_state=101)

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train, Y_train,
          eval_set=[(X_train, Y_train), (X_test, Y_test)],
          eval_metric='rmse',
          verbose=True)


xg_reg.predict(X_test)

[0]	validation_0-rmse:3725577.50000	validation_1-rmse:3646605.75000
[1]	validation_0-rmse:3358557.25000	validation_1-rmse:3287321.50000
[2]	validation_0-rmse:3059405.00000	validation_1-rmse:2992304.00000
[3]	validation_0-rmse:2816692.25000	validation_1-rmse:2750439.50000
[4]	validation_0-rmse:2567295.25000	validation_1-rmse:2506318.25000
[5]	validation_0-rmse:2368632.75000	validation_1-rmse:2311677.25000
[6]	validation_0-rmse:2169136.25000	validation_1-rmse:2117087.75000
[7]	validation_0-rmse:1996761.37500	validation_1-rmse:1948790.37500
[8]	validation_0-rmse:1843889.87500	validation_1-rmse:1799019.50000
[9]	validation_0-rmse:1680865.62500	validation_1-rmse:1640588.62500


array([2029361.5, 2275210.5, 2301470.2, ..., 3018998. , 1823439.6,
       2084178. ], dtype=float32)

### Model evaluation

In [53]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(xg_reg, X_train, Y_train, cv=kfold)

y_test_pred = xg_reg.predict(X_test)
mse = mean_squared_error(y_test_pred, Y_test)

mse

  'random_state to its default (None), or set shuffle=True.',


2784868762223.56

In [62]:
xg_reg.predict(X_test.head(3))

array([2029361.5, 2275210.5, 2301470.2], dtype=float32)

In [64]:
Y_test

Unnamed: 0,label
1395,2500000
13306,3850000
5789,3700000
8274,2585000
15067,7200000
...,...
5526,3040000
12927,3400000
16790,6400000
8525,2660000
