<a href="https://colab.research.google.com/github/rboghe/cened/blob/master/cened_1_2_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import urllib.request
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from lightgbm import LGBMRegressor

# Silence panda's setting with copy warning
pd.options.mode.chained_assignment = None

# Load data

In [None]:
# Cened 1.2
url = 'https://www.dati.lombardia.it/api/views/rsg3-xhvk/rows.csv?accessType=DOWNLOAD'


if os.path.isfile('/tmp/cened12.csv'):
  pass
else:
  urllib.request.urlretrieve(url, '/tmp/cened12.csv')

In [None]:
# DDH
url = 'https://raw.githubusercontent.com/rboghe/cened/master/degreedays.txt'

urllib.request.urlretrieve(url, '/tmp/ddh.csv')

ddh = pd.read_csv('/tmp/ddh.csv', usecols = ['comune','dd'])

In [None]:
cols = ['COMUNE','FOGLIO', 'PARTICELLA','SUPERFICIE_DISPERDENTE',
          'TRASMITTANZA_MEDIA_COPERTURA','TRASMITTANZA_MEDIA_INVOLUCRO',
          'TRASMITTANZA_MEDIA_BASAMENTO','TRASMITTANZA_MEDIA_SERRAMENTO',
          'SUPERFICIE_LORDA', 'VOLUME_LORDO', 'DESTINAZIONE_DI_USO',
          'ANNO_COSTRUZIONE','SUPERFICIE_VETRATA_OPACA', 'VOLUME_NETTO',
          'SUPERFICIE_NETTA', 'EPH']
  
cened_old = pd.read_csv('/tmp/cened12.csv', usecols = cols)

# Preprocessing

In [None]:
# Drop buildings with NaN values
cened_old.dropna(inplace = True)

# We'll use residential buildings only
cened_old = cened_old[cened_old['DESTINAZIONE_DI_USO'] == 'E.1(1)']

# Add ddh
cened_old['COMUNE'] = cened_old['COMUNE'].str.lower()
ddh['comune'] = ddh['comune'].str.lower()
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"o`",  "o'")
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"baranzate",  "bollate")
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"bovisio masciago",  "bovisio-masciago")
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"cornate d`adda",  "cornate d'adda")
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"cortenuova",  "cortenova")
cened_old['COMUNE'] = cened_old['COMUNE'].str.replace(r"sant'omobono terme",  "sant'omobono imagna")
ddh['comune'] = ddh['comune'].str.replace(r"è",  "e'")
ddh['comune'] = ddh['comune'].str.replace(r"è",  "e'")
ddh['comune'] = ddh['comune'].str.replace(r"é",  "e'")
ddh['comune'] = ddh['comune'].str.replace(r"ò",  "o'")
ddh['comune'] = ddh['comune'].str.replace(r"ù",  "u'")

cened_old = cened_old.merge(ddh, left_on='COMUNE', right_on='comune', how='left')



# Correct floats
for col in ['SUPERFICIE_LORDA', 'SUPERFICIE_NETTA', 'VOLUME_LORDO', 'VOLUME_NETTO',
       'SUPERFICIE_DISPERDENTE', 'SUPERFICIE_VETRATA_OPACA',
       'TRASMITTANZA_MEDIA_INVOLUCRO', 'TRASMITTANZA_MEDIA_COPERTURA',
       'TRASMITTANZA_MEDIA_BASAMENTO', 'TRASMITTANZA_MEDIA_SERRAMENTO', 'EPH', 'dd']:
    cened_old[col] = cened_old[col].astype(str)
    cened_old[col] = cened_old[col].str.replace(r',', '')
    cened_old[col] = cened_old[col].astype("float")

# Filtering

In [None]:
# Drop buildings with wrong thermal conductivity
for col in ['TRASMITTANZA_MEDIA_INVOLUCRO', 'TRASMITTANZA_MEDIA_COPERTURA',
       'TRASMITTANZA_MEDIA_BASAMENTO']:
    cened_old = cened_old[cened_old[col] > 0]
    cened_old = cened_old[cened_old[col] < 4]

cened_old = cened_old[cened_old['TRASMITTANZA_MEDIA_SERRAMENTO'] > 0]
cened_old = cened_old[cened_old['TRASMITTANZA_MEDIA_SERRAMENTO'] < 6]

# Drop buildings with wrong EPH
cened_old = cened_old[cened_old['EPH'] > 0]
cened_old = cened_old[cened_old['EPH'] < 1000]

# Drop buildings with wrong An
cened_old = cened_old[cened_old['SUPERFICIE_NETTA'] > 50]

# Drop buildings with wrong Vn
cened_old = cened_old[cened_old['VOLUME_NETTO'] > 150]

# Drop buildings with wrong average height
cened_old['ALTEZZA_MEDIA'] = cened_old['VOLUME_NETTO']/cened_old['SUPERFICIE_NETTA']
cened_old = cened_old[cened_old['ALTEZZA_MEDIA'] > 2.4]


# Drop buildings with wrong mean thermal conductivity
cened_old['TRASMITTANZA_MEDIA'] = (cened_old['TRASMITTANZA_MEDIA_SERRAMENTO'] + cened_old['TRASMITTANZA_MEDIA_COPERTURA'] +
         cened_old['TRASMITTANZA_MEDIA_BASAMENTO'] + cened_old['TRASMITTANZA_MEDIA_INVOLUCRO'])/4
cened_old = cened_old[cened_old['TRASMITTANZA_MEDIA'] > 0.15]
cened_old = cened_old[cened_old['TRASMITTANZA_MEDIA'] < 4]

# Feature engineering

In [None]:
# Opaque surface
cened_old['SUP_OPACA'] = cened_old['SUPERFICIE_DISPERDENTE']/(1+cened_old['SUPERFICIE_DISPERDENTE'])

# Opaque surface
cened_old['SUP_FINESTRATA'] = cened_old['SUPERFICIE_DISPERDENTE'] - cened_old['SUP_OPACA']

# Take care of construction year
cened_old.ANNO_COSTRUZIONE = cened_old.ANNO_COSTRUZIONE.astype(str)
cened_old.ANNO_COSTRUZIONE = cened_old.ANNO_COSTRUZIONE.map(lambda x:x[-4:])
cened_old.ANNO_COSTRUZIONE = cened_old.ANNO_COSTRUZIONE.astype(int)

# Shuffle

In [None]:
cened = cened_old.sample(frac=1).reset_index(drop=True)

# Reserve a test set

In [None]:
msk = np.random.rand(len(cened)) < 0.8
train = cened[msk]
test = cened[~msk]

In [None]:
print(len(train))

210715


In [None]:
print(len(test))

52835


# Define MAPE

In [None]:
def neg_mape(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  return -np.abs((y_true - y_pred)/y_true).mean()

In [None]:
def modified_neg_mape(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  errors = np.abs((y_true - y_pred)/y_true)
  index = np.argwhere(errors > 1)
  mod_errors = np.delete(errors, index)
  return -mod_errors.mean()

In [None]:
neg_mape_scorer = make_scorer(neg_mape, greater_is_better=True)

In [None]:
modified_neg_mape_scorer = make_scorer(modified_neg_mape, greater_is_better=True)

# Random search

In [None]:
mlcol = ['ANNO_COSTRUZIONE','SUPERFICIE_LORDA', 'SUPERFICIE_NETTA', 'VOLUME_LORDO', 
         'VOLUME_NETTO', 'SUPERFICIE_DISPERDENTE',
         'TRASMITTANZA_MEDIA_INVOLUCRO', 'TRASMITTANZA_MEDIA_COPERTURA',
         'TRASMITTANZA_MEDIA_BASAMENTO', 'TRASMITTANZA_MEDIA_SERRAMENTO',
         'dd','SUP_FINESTRATA','SUP_OPACA']

In [None]:
param_dist = {'feature_fraction': np.linspace(0.4, 1, num=7),
              # 'n_estimators' : [100, 300, 600, 1000, 3000],
              'num_leaf' : list(range(20,40)),
             'max_depth' : [-1],
             'max_bin' : [100, 200, 300, 500, 750, 1000, 2000],
             'bagging_fraction' : np.linspace(0.4, 1, num=14),
             'bagging_freq' : list(range(1,10)),
             'lambda_l1' : stats.uniform(0, 0.6),
             'lambda_l2' : stats.uniform(0, 0.6)}

lgbm = LGBMRegressor(n_estimators = 100, silent = True, verbose = 0, is_training_metric = True, n_jobs = 1, 
                     eval_metric  = 'mape')

n_iter_search = 10

random_search = RandomizedSearchCV(lgbm, param_distributions=param_dist, n_iter=n_iter_search, 
                                   scoring=modified_neg_mape_scorer, cv = 5, n_jobs = 6, verbose = 2)

random_search.fit(cened[mlcol], cened['EPH'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done  50 out of  50 | elapsed:  2.4min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           eval_metric='mape',
                                           importance_type='split',
                                           is_training_metric=True,
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=1, num_leaves=31,
                                           objective=None, random_sta...
                                        'lambda_l1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fd005fbb978>,
            

# Print results

In [None]:
print(random_search.best_params_)

{'bagging_fraction': 1.0, 'bagging_freq': 4, 'feature_fraction': 0.8999999999999999, 'lambda_l1': 0.46393035771685154, 'lambda_l2': 0.03413279439519168, 'max_bin': 750, 'max_depth': -1, 'num_leaf': 20}


In [None]:
print(random_search.best_score_)

-0.17473533899234073


# Train the final model

In [None]:
light = LGBMRegressor(n_estimators = 1000, silent = False, verbose = 2, is_training_metric = True, n_jobs = 6)
light.set_params(**random_search.best_params_)

LGBMRegressor(bagging_fraction=1.0, bagging_freq=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.8999999999999999, importance_type='split',
              is_training_metric=True, lambda_l1=0.46393035771685154,
              lambda_l2=0.03413279439519168, learning_rate=0.1, max_bin=750,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=10000, n_jobs=6, num_leaf=20,
              num_leaves=31, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=False, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, verbose=2)

In [None]:
light.fit(train[mlcol], train['EPH'])

LGBMRegressor(bagging_fraction=1.0, bagging_freq=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.8999999999999999, importance_type='split',
              is_training_metric=True, lambda_l1=0.46393035771685154,
              lambda_l2=0.03413279439519168, learning_rate=0.1, max_bin=750,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=10000, n_jobs=6, num_leaf=20,
              num_leaves=31, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=False, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, verbose=2)

# Evaluate the model

In [102]:
y_pred = light.predict(test[mlcol])

In [103]:
results_df = pd.DataFrame({'predicted' : y_pred, 'true' : test['EPH']})
results_df['error'] = np.abs((results_df['true'] - results_df['predicted'])/results_df['true'])*100

In [104]:
# MAPE
results_df['error'].mean()

40.39698301862012

In [105]:
# MAPE without >100%
results_df[results_df.error <= 100].error.mean()

16.032692256295263