In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, ensemble
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import json
from splitter import StratifiedRegressionSplit
from preprocess import OneHotEncoderOnlyCategorical, PolynomialFeaturesDF, TypeConverter
from scoring import scoring



# Real estate prices

In [2]:
# Input workbooks, given relative to the notebook's working directory.
covariates_path = '../data/covariates.xlsx'
observations_path = '../data/hp_ljubljana_new_with_rooms.xlsx'

# `variables`: covariate descriptions (the following cell reads the
# 'name', 'type' and 'group' fields of each row).
variables = pd.read_excel(covariates_path)
# `data`: the table from which the feature columns and the target column are selected.
data = pd.read_excel(observations_path)

  data = pd.read_excel('../data/hp_ljubljana_new_with_rooms.xlsx')


In [3]:

# Type names used in the covariates sheet -> Python types.
# Any other value falls back to `str`.
_TYPE_BY_NAME = {'int': int, 'float': float}

# Convert the covariates sheet to a list of record dicts only once, so that
# re-running this cell does not fail calling `.to_dict` on an already-converted list.
if isinstance(variables, pd.DataFrame):
    variables = variables.to_dict(orient='records')

target = None
features = []
for feature in variables:
    declared_type = feature['type']
    # A record whose type was resolved by a previous run of this cell is left
    # alone; re-mapping it would turn every already-converted type into `str`.
    if not isinstance(declared_type, type):
        feature['type'] = _TYPE_BY_NAME.get(declared_type, str)
    # The record in group 'target' describes the response; every other record
    # is a model input. If several records are marked 'target', the last one wins.
    if feature['group'] == 'target':
        target = feature
    else:
        features.append(feature)

if target is None:
    # Fail with a clear message instead of an opaque indexing error below.
    raise ValueError("covariates sheet defines no variable with group == 'target'")

# Design matrix (columns in the order the covariates are listed) and response.
X = data[[feature['name'] for feature in features]]
y = data[target['name']]

## Model definitions and Ridge Regression fit

In [4]:

def _cv_splitter():
    """Return a new StratifiedRegressionSplit with the settings shared by every grid search."""
    return StratifiedRegressionSplit(n_splits=10, n_bins=10, test_size=0.3, random_state=0)


def _grid_search(regressor, param_grid, polynomial=False):
    """Wrap ``regressor`` in the shared preprocessing pipeline and an unfitted GridSearchCV.

    Parameters
    ----------
    regressor : estimator
        Final pipeline step. ``make_pipeline`` names each step after its
        lowercased class name (e.g. ``ridge``), which is the prefix used in
        the keys of ``param_grid``.
    param_grid : dict
        Hyper-parameter grid passed to ``GridSearchCV``; an empty dict
        evaluates the pipeline with its default parameters only.
    polynomial : bool, default False
        When True, a ``PolynomialFeaturesDF(degree=2, interaction_only=False)``
        step is inserted between one-hot encoding and scaling.

    Returns
    -------
    GridSearchCV
        Unfitted search scored with ``scoring`` and refit on ``'r2'``.
    """
    steps = [
        TypeConverter(variables=features),
        OneHotEncoderOnlyCategorical(sparse=False),
    ]
    if polynomial:
        steps.append(PolynomialFeaturesDF(degree=2, interaction_only=False))
    steps.extend([StandardScaler(), regressor])
    return GridSearchCV(
        estimator=make_pipeline(*steps),
        param_grid=param_grid,
        scoring=scoring,
        refit='r2',
        return_train_score=True,
        cv=_cv_splitter(),
        n_jobs=-1,
    )


def _alpha_grid(step_name):
    """Return the regularisation-strength grid (1 to <20, step 0.2) for the step ``step_name``."""
    return {f'{step_name}__alpha': list(np.arange(1, 20, 0.2))}


def _elasticnet_grid():
    """Return the ElasticNet grid: alpha and l1_ratio each from 0.1 to <1 in steps of 0.1."""
    return {
        'elasticnet__alpha': list(np.arange(0.1, 1, 0.1)),
        'elasticnet__l1_ratio': list(np.arange(0.1, 1, 0.1)),
    }


# Candidate models keyed by display name. Every entry except the random forest
# is a GridSearchCV over the shared preprocessing pipeline; the random forest
# is a bare estimator without preprocessing steps or a parameter search.
models = {
    'Linear Regression': _grid_search(linear_model.LinearRegression(), {}),
    'Ridge Regression': _grid_search(linear_model.Ridge(max_iter=10000), _alpha_grid('ridge')),
    'Lasso Regression': _grid_search(linear_model.Lasso(max_iter=10000), _alpha_grid('lasso')),
    'ElasticNet Regression': _grid_search(linear_model.ElasticNet(max_iter=10000), _elasticnet_grid()),
    'Random Forest Regression': ensemble.RandomForestRegressor(
        bootstrap=True,
        # One third of the covariates per split; never below 1, because an
        # integer max_features of 0 is rejected by scikit-learn.
        max_features=max(1, len(features) // 3),
        min_samples_split=4,
        min_samples_leaf=2,
        oob_score=True,
        n_jobs=-1,
        random_state=0,
        n_estimators=500,
        max_samples=0.7,
    ),
    'Polynomial Regression': _grid_search(linear_model.LinearRegression(), {}, polynomial=True),
    'Ridge Polynomial Regression': _grid_search(
        linear_model.Ridge(max_iter=10000), _alpha_grid('ridge'), polynomial=True
    ),
    'Lasso Polynomial Regression': _grid_search(
        linear_model.Lasso(max_iter=10000), _alpha_grid('lasso'), polynomial=True
    ),
    'ElasticNet Polynomial Regression': _grid_search(
        linear_model.ElasticNet(max_iter=10000), _elasticnet_grid(), polynomial=True
    ),
}

In [5]:
# Run the cross-validated grid search for the ridge pipeline on the full data set.
# `fit` returns the fitted search object itself, so `gs_res` is that same object.
ridge_search = models['Ridge Regression']
gs_res = ridge_search.fit(X, y)

Feature names seen at fit time, yet now missing:
- constr_type_metal_construction

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 71, in _cached_call
    return cache[method]
KeyError: 'predict'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 258, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/opt/conda/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    result = getattr(estimator, method)(*args, **kwargs)
  Fil

In [18]:
import pandas as pd
import pickle
from scoring import scoring, sense

model_name = 'Lasso_Regression'
# NOTE(review): absolute, machine-specific location; consider moving it to a
# configurable project directory.
project_dir = '/root/workspace/clanci/realestate_prices'

with open(f'{project_dir}/models/{model_name}.pickle', 'rb') as f:
    # SECURITY: unpickling executes code embedded in the file; only load
    # pickles produced by this project.
    gs_res = pickle.load(f)

# One row per evaluated parameter combination.
gs_crossvalidation_results = pd.DataFrame(gs_res.cv_results_)

# The fitted regressor is always the final pipeline step; addressing it by
# position from the front breaks as soon as preprocessing steps are added or removed.
final_estimator = gs_res.best_estimator_.steps[-1][1]
coef_names = ['intercept'] + list(gs_res.feature_names_in_)
coef_values = [final_estimator.intercept_] + list(final_estimator.coef_)
# NOTE(review): `feature_names_in_` are the column names the search saw at fit
# time. If the pipeline adds columns (e.g. one-hot encoding), the names and
# coefficients differ in length and `zip` silently truncates — confirm.
# One-row table with one column per coefficient, so the names survive the
# `index=False` export (a Series of lists would lose them).
reg_coefs = pd.DataFrame({name: [value] for name, value in zip(coef_names, coef_values)})

reg_coefs.to_excel(f'{project_dir}/results/{model_name}_coefs.xlsx', index=False)
gs_crossvalidation_results.to_excel(f'{project_dir}/results/{model_name}_results.xlsx', index=False)

In [20]:
# Collect mean/std of every metric on the train and test folds for the best
# parameter combination found by the search.
metrics = {}
for metric in scoring:
    for dataset in ['train', 'test']:
        for aggr in ['mean', 'std']:
            key = f'{aggr}_{dataset}_{metric}'
            value = gs_crossvalidation_results.loc[gs_res.best_index_, key]
            # Means of metrics whose `sense` flag is falsy are sign-flipped;
            # standard deviations are never flipped.
            # NOTE(review): presumably this undoes scikit-learn's negation of
            # loss-type scorers — confirm against scoring.py.
            if aggr == 'mean' and not sense[metric]:
                value = -value
            metrics[key] = value

# Build a new dict (parameters first, then metrics) instead of updating
# `gs_res.best_params_` in place, which would leave metric entries inside the
# fitted search object's parameters.
best_params = {**gs_res.best_params_, **metrics}

# `default=int` is applied only to values json cannot encode natively.
print(json.dumps(best_params, indent=4, default=int))
with open(f'/root/workspace/clanci/realestate_prices/results/{model_name}_metrics.json', 'w') as f:
    json.dump(best_params, f, default=int)

{
    "lasso__alpha": 1.7000000000000006,
    "mean_train_mpe": 0.0010707682096338389,
    "std_train_mpe": 0.0003028259019442757,
    "mean_test_mpe": 0.002927173294707609,
    "std_test_mpe": 0.0030312915231110148,
    "mean_train_mae": -1.9241068816402152e-14,
    "std_train_mae": 1.6592677912003468e-13,
    "mean_test_mae": 1.1668351882236943,
    "std_test_mae": 5.803533128748521,
    "mean_train_mape": 0.1482892286765803,
    "std_train_mape": 0.0008952689032138652,
    "mean_test_mape": 0.15009002539422747,
    "std_test_mape": 0.002663217993525132,
    "mean_train_cod": 0.14918467120958123,
    "std_train_cod": 0.000985923943453198,
    "mean_test_cod": 0.15108837069325154,
    "std_test_cod": 0.002733285467324261,
    "mean_train_rmse": 480.0457363417372,
    "std_train_rmse": 3.5765136321090547,
    "mean_test_rmse": 483.95059477679206,
    "std_test_rmse": 8.518244462043336,
    "mean_train_r2": 0.34071598078435494,
    "std_train_r2": 0.00879486450982753,
    "mean_test_r2"