In [36]:
import sklearn as skl
import matplotlib.pyplot as plt
import pandas as pd
import metrics
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model, kernel_ridge
from sklearn.metrics import make_scorer, pairwise
from matplotlib.widgets import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
import numpy as np
import json
from functools import reduce
from custom import StratifiedRegressionSplit


# Real estate prices

In [37]:
# Load the covariate catalogue (one record per variable) and split the records
# into prediction targets and model input features.
variables = pd.read_excel('../data/covariates.xlsx')
variables = variables.to_dict(orient='records')
targets = []
features = []
for feature in variables:
    # Swap the textual type label for the matching Python type; any label
    # other than 'int' or 'float' is treated as a string column.
    feature['type'] = {'int': int, 'float': float}.get(feature['type'], str)
    # Records in the 'target' group are targets; everything else is a feature.
    (targets if feature['group'] == 'target' else features).append(feature)

## Dataset

In [38]:

# Load the raw listings and keep only the columns described in the covariate
# catalogue (model inputs first, then targets).
data = pd.read_excel('../data/hp_ljubljana_new_with_rooms.xlsx')
feature_names = [x['name'] for x in features]
target_names = [x['name'] for x in targets]
data = data[feature_names + target_names]


# Names of features to leave out of the design matrix.
exclude_features_names = []
# `features` holds dicts, so the exclusion test has to compare the feature's
# name with the list of names; comparing the dict itself never matches.
categorical_features = [x for x in features if (x['type'] == str) and (x['name'] not in exclude_features_names)]
categorical_features_names = [x['name'] for x in categorical_features]
numerical_features = [x for x in features if (x['type'] in (int, float)) and (x['name'] not in exclude_features_names)]
numerical_features_names = [x['name'] for x in numerical_features]

# target = {'name': 'price_m2', 'type': float}

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the new spelling first and fall back for older installations.
try:
    onehot = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(sparse=False)
onehot.fit(data[categorical_features_names])
# One descriptor per generated dummy column, named "<feature>_<category>", in
# the same order as the columns produced by `onehot.transform`.
encoded_features = [
    {"name": f"{feature['name']}_{cat}", "type": int, "group": feature['group']}
    for cats, feature in zip(onehot.categories_, categorical_features)
    for cat in cats
]
encoded_features_names = [x['name'] for x in encoded_features]

X_encoded = onehot.transform(data[categorical_features_names])
X_numerical = data[numerical_features_names]
# Assemble the design matrix in one step (numerical columns, then dummy
# columns), aligned on the row index of `data`.
X = pd.concat(
    [X_numerical, pd.DataFrame(X_encoded, columns=encoded_features_names, index=data.index)],
    axis=1,
)
X = X.astype({x['name']: x['type'] for x in numerical_features + encoded_features})

# A single target is kept as a Series, several targets as a DataFrame.
if len(target_names) == 1:
    y = data[target_names[0]].astype(targets[0]['type'])
else:
    y = data[target_names].astype({x['name']: x['type'] for x in targets})



  data = pd.read_excel('../data/hp_ljubljana_new_with_rooms.xlsx')


In [93]:
# Display the categorical feature records (dicts with name, description, group and type).
categorical_features

[{'name': 'constr_type',
  'description': 'Construction type (brick, concrete, wood)',
  'group': 'structural',
  'type': str},
 {'name': 'duplex',
  'description': 'Duplex - apartment in two floors (Yes/No)',
  'group': 'structural',
  'type': str},
 {'name': 'elevator',
  'description': 'Elevator (Yes/No)',
  'group': 'structural',
  'type': str},
 {'name': 'house_type',
  'description': 'Housing type (single, double, raw)',
  'group': 'structural',
  'type': str},
 {'name': 'postion_type',
  'description': 'Position in building (basement, ground, middle, penthouse)',
  'group': 'structural',
  'type': str}]

In [91]:
# Display the assembled design matrix: numerical columns followed by the one-hot encoded columns.
X

Unnamed: 0,dist_airport,dist_highway_entr,dist_main_roads,dist_public_transport,dist_recreation,dist_regional_roads,dist_river,dist_schools,construct_age,facade_age,...,elevator_yes,house_type_double,house_type_other,house_type_raw,house_type_single,postion_type_attics,postion_type_basement,postion_type_floor,postion_type_ground_floor,postion_type_other
0,16676.929688,100.000000,0.000000,200.000000,1360.147095,2884.440918,1216.552490,300.0,15016.0,15016.0,...,1,0,0,1,0,0,0,1,0,0
1,20342.320312,1392.838867,0.000000,0.000000,1389.244385,1600.000000,223.606796,500.0,10380.0,10380.0,...,0,0,0,0,1,0,0,1,0,0
2,17736.966797,1252.996460,100.000000,100.000000,500.000000,3310.589111,1403.566895,200.0,17129.0,17129.0,...,1,0,1,0,0,0,0,1,0,0
3,20586.646484,1923.538452,400.000000,223.606796,854.400391,2039.607788,282.842712,200.0,11920.0,11920.0,...,0,0,1,0,0,0,0,0,1,0
4,19356.910156,3026.549072,100.000000,141.421356,900.000000,3138.470947,509.901947,300.0,29525.0,29525.0,...,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5682,15764.833984,761.577332,400.000000,100.000000,894.427185,1664.331665,921.954468,300.0,12548.0,12548.0,...,1,0,0,0,1,0,0,1,0,0
5683,19808.080078,2051.828369,141.421356,100.000000,806.225769,2051.828369,200.000000,100.0,18406.0,18406.0,...,0,0,0,0,1,0,0,1,0,0
5684,19808.080078,2051.828369,141.421356,100.000000,806.225769,2051.828369,200.000000,100.0,18672.0,18672.0,...,0,0,0,0,1,0,0,1,0,0
5685,20849.220703,2154.065918,100.000000,141.421356,223.606796,2816.025635,141.421356,100.0,18560.0,18560.0,...,0,0,0,0,1,0,0,1,0,0


## Kernel Ridge Regression

In [41]:
# Direction of each metric: True when a larger value means a better model.
scoring_greater_is_better = dict(mpe=False, mae=False, mape=False, cod=False, rmse=False, r2=True)

# Wrap the functions from the `metrics` module as scikit-learn scorers.
# make_scorer sign-flips the score when greater_is_better=False, so the values
# stored in cv_results_ for those metrics are negated.
scoring = {
    name: make_scorer(getattr(metrics, name), greater_is_better=scoring_greater_is_better[name])
    for name in ('mpe', 'mae', 'mape', 'cod', 'rmse')
}
# R^2 uses scikit-learn's built-in scorer.
scoring['r2'] = 'r2'

# Grid search over the regularisation strength and polynomial degree of a
# standardised kernel ridge model; the final model is refit using R^2.
gs = GridSearchCV(
    estimator=make_pipeline(StandardScaler(), kernel_ridge.KernelRidge(kernel='polynomial')),
    param_grid={
        'kernelridge__alpha': list(np.arange(1, 5, 0.2)),
        'kernelridge__degree': list(np.arange(2, 3, 1)),
    },
    scoring=scoring,
    refit='r2',
    return_train_score=True,
    cv=StratifiedRegressionSplit(n_splits=10, n_bins=10, test_size=0.3, random_state=0),
)
gs_all_metric_results = gs.fit(X, y)


In [82]:
# Persist the full cross-validation table and the dual coefficients of the
# refitted best kernel ridge model.
gs_crossvalidation_results = pd.DataFrame(gs_all_metric_results.cv_results_)
# make_pipeline names each step after its lowercased class name, so the
# regressor (second step) is reachable as 'kernelridge'.
best_kernel_ridge = gs_all_metric_results.best_estimator_.named_steps['kernelridge']
dual_reg_coefs = pd.Series(dict(enumerate(best_kernel_ridge.dual_coef_)))
dual_reg_coefs.to_excel('../results/kernel_ridge_reg_dual_coefs.xlsx', index=False)
gs_crossvalidation_results.to_excel('../results/kernel_ridge_cv_results.xlsx', index=False)

In [90]:
# Summarise the best grid-search candidate: its hyper-parameters plus the
# mean/std of every metric on the train and test folds.
# Work on a copy so the fitted search object's `best_params_` is not mutated.
best_params = dict(gs_all_metric_results.best_params_)
# Named `best_metrics` (not `metrics`) so the imported `metrics` module used
# to build the scorers stays reachable when cells are re-run.
best_metrics = {}
for metric in scoring:
    for dataset in ['train', 'test']:
        for aggr in ['mean', 'std']:
            key = f'{aggr}_{dataset}_{metric}'
            value = gs_crossvalidation_results.loc[gs_all_metric_results.best_index_, key]
            # Scorers built with greater_is_better=False report negated scores;
            # flip the mean back to the metric's natural sign. Standard
            # deviations are unaffected by the sign flip.
            if aggr == 'mean' and not scoring_greater_is_better[metric]:
                value = -value
            best_metrics[key] = value
best_params.update(best_metrics)

# default=int converts values json cannot serialise natively (such as the
# NumPy integer degree) to plain ints.
print(json.dumps(best_params, indent=4, default=int))
with open('../results/kernel_ridge_metrics.json', 'w') as f:
    json.dump(best_params, f, default=int)

{
    "kernelridge__alpha": 2.9999999999999996,
    "kernelridge__degree": 2,
    "mean_train_mpe": -0.0014444988753098734,
    "mean_test_mpe": 0.0014284321909228618,
    "std_train_mpe": 5.636883640267211e-05,
    "std_test_mpe": 0.004929047655413846,
    "mean_train_mae": 1.7348617917874964,
    "mean_test_mae": 0.01725156268746364,
    "std_train_mae": 0.00578127538688875,
    "std_test_mae": 8.314484625208598,
    "mean_train_mape": 0.12687865286994188,
    "mean_test_mape": 0.13845947008440268,
    "std_train_mape": 0.0008637684800909835,
    "std_test_mape": 0.004076256969533876,
    "mean_train_cod": 0.12678601615787366,
    "mean_test_cod": 0.1385106340718943,
    "std_train_cod": 0.0008883700524865648,
    "std_test_cod": 0.0040870640557178015,
    "mean_train_rmse": 419.0747698442584,
    "mean_test_rmse": 455.13801303829484,
    "std_train_rmse": 3.193198465894363,
    "std_test_rmse": 8.66794702917536,
    "mean_train_r2": 0.49755159815448247,
    "mean_test_r2": 0.4053912