In [1]:
import os
import sys

sys.path.insert(0, '../')

import numpy as np
import pandas as pd

In [3]:
from src.models.model_common import select_features

In [4]:
train_data = pd.read_hdf('../data/processed/train_3600000000000.hd5', "data", parse_dates=[1])

In [6]:
train_data['SiteId'].value_counts()

2      34704
281    32776
268    32776
234    26992
250    26992
134    26992
212    26992
277    25064
273    23136
151    23136
259    23136
129    23136
286    23136
16     23136
93     21208
27     21208
131    21208
26     21208
135    21208
260    21208
154    21208
292    21208
294    21208
198    19280
199    19280
203    19280
209    19280
300    19280
39     17352
40     17352
       ...  
285    11568
231    11568
247    11568
53     11568
96     11568
77     11568
223    11568
216    11568
127    11568
254     9640
181     9640
171     9640
159      964
177      964
288      964
224      964
251      964
263      964
236      964
239      964
264      964
45       964
51       964
266      964
101      964
192      964
278      964
297      964
85       964
196      964
Name: SiteId, Length: 89, dtype: int64

In [7]:
site_data = train_data.loc[train_data['SiteId'] == 39, :]
site_data.head()

Unnamed: 0,ForecastId,Timestamp,obs_id,SiteId,Value,Frequency,DayOfMonth,DayOfWeek,DayOfYear,DaysInMonth,...,ConsumptionPerSurfaceArea,ConsumptionPerTemperatureDiff,ConsumptionDailyMeanPerSurfaceArea,ConsumptionDailyMeanPerTemperatureDiff,ConsumptionWeeklyMeanPerSurfaceArea,ConsumptionWeeklyMeanPerTemperatureDiff,ConsumptionBiWeeklyMeanPerSurfaceArea,ConsumptionBiWeeklyMeanPerTemperatureDiff,ConsumptionMonthlyMeanPerSurfaceArea,ConsumptionMonthlyMeanPerTemperatureDiff
113752,984,2015-05-12 14:00:00,5426833,39,5455.874114,3600000000000.0,12,1,132,31,...,0.654473,261.770445,0.414355,165.730144,0.34766,139.054066,0.350959,140.373567,0.354502,141.790744
113753,984,2015-05-12 15:00:00,7305160,39,4263.73481,3600000000000.0,12,1,132,31,...,0.511467,247.020308,0.414355,200.118718,0.34766,167.907424,0.350959,169.500717,0.354502,171.211955
113754,984,2015-05-12 16:00:00,2636789,39,5452.319027,3600000000000.0,12,1,132,31,...,0.654047,288.985271,0.414355,183.079487,0.34766,153.610843,0.350959,155.068475,0.354502,156.634008
113755,984,2015-05-12 17:00:00,5982039,39,12545.903399,3600000000000.0,12,1,132,31,...,1.504975,591.753876,0.414355,162.923754,0.34766,136.699396,0.350959,137.996553,0.354502,139.389732
113756,984,2015-05-12 18:00:00,5507851,39,4039.764304,3600000000000.0,12,1,132,31,...,0.4846,271.843479,0.414355,232.438535,0.34766,195.025013,0.350959,196.875629,0.354502,198.863237


In [9]:
x, y, groups = select_features(site_data, 'h')

In [16]:
x.head(20)['PotentialMeanHeating']

obs_id
5426833    22.565327
7305160    22.154596
2636789    22.343630
5982039    22.604478
5507851    21.854948
6186639    20.854948
3977636    18.918512
1426578    16.896110
3601966    15.831723
6748513    14.373472
1341303    13.765504
3414207    12.826892
6536712    12.148002
5523543    11.883968
3847877    11.790809
7464559    11.426775
4149302    11.094464
457326     11.811683
1933894    14.929725
3280100    18.088681
Name: PotentialMeanHeating, dtype: float64

In [30]:
def make_lagged(x, y, groups, lags=24):
    x_a = x.join(groups).join(y)
    for column in x_a.drop(columns=['ForecastId']).keys():
        for lag in range(1, lags+1):
            x_a['%s_(t-%d)' % (column, lag)] = x_a.groupby('ForecastId')[column].shift(lag).values
    
    x_a = x_a.dropna()
    return x_a.drop(columns=['ForecastId', 'Consumption']), x_a['Consumption'], x_a['ForecastId']

x_l, y_l, groups_l = make_lagged(x, y, groups)

In [69]:
def generate_train_forecast_ts(x, y, input_window_size, output_window_size, pad=True):
    assert x.shape[0] == y.shape[0]

    min_size = input_window_size + output_window_size
    input_size = x.shape[0]

    if input_size < min_size and pad:
        num_pads = min_size - input_size

        x_pad = np.zeros((num_pads, x.shape[1]))
        y_pad = np.zeros((num_pads, y.shape[1]))
        x = np.vstack((x_pad, x))
        y = np.vstack((y_pad, y))

    assert x.shape[0] >= min_size

    num_samples = x.shape[0] - input_window_size - output_window_size + 1

    x_res = []
    y_res = []

    for i in range(num_samples):
        x_t = np.hstack((x[i:i + input_window_size], y[i:i + input_window_size].reshape(-1, 1)))

        x_t_plus_1 = x[i + input_window_size:i + input_window_size + output_window_size]
        y_t_plus_1 = y[i + input_window_size:i + input_window_size + output_window_size]

        x_res.append(np.concatenate((x_t.ravel(), x_t_plus_1.ravel())))
        y_res.append(y_t_plus_1)

    return np.array(x_res), np.array(y_res)


def generate_train_ts(x, y, forecast_ids, input_window_size, output_window_size):
    assert x.shape[0] == y.shape[0]
    assert y.shape[0] == forecast_ids.shape[0]

    ids = np.unique(forecast_ids)

    agg_x = []
    agg_y = []
    for fid in ids:
        fx = x[forecast_ids == fid, :]
        fy = y[forecast_ids == fid, :]

        fx_res, fy_res = generate_train_forecast_ts(fx, fy, input_window_size, output_window_size)

        agg_x.append(fx_res)
        agg_y.append(fy_res)

    agg_x = np.concatenate(agg_x)
    agg_y = np.concatenate(agg_y)
#     agg_x = agg_x.reshape((agg_x.shape[0], 1, agg_x.shape[1]))
#     agg_y = agg_y.reshape((agg_y.shape[0], 1, agg_y.shape[1]))

    return agg_x, agg_y.ravel()

agx, agy = generate_train_ts(x.values, y.values.reshape(-1, 1), groups.values, 1*24, 1)
agx.shape, agy.shape

((16920, 1249), (16920,))

In [41]:
def print_cval_score(scores, metrics=('score',)):
    metric_vals = {}
    for metric in metrics:
        test_metric = scores['test_' + metric]
        train_metric = scores['train_' + metric]

        test_metric_mean = np.mean(test_metric)
        test_metric_variance = np.std(test_metric)

        train_metric_mean = np.mean(train_metric)
        train_metric_variance = np.std(train_metric)

        print(metric.capitalize())
        print("Test\t Mean: %f\t Variance: %f" % (test_metric_mean, test_metric_variance))
        print("Train\t Mean: %f\t Variance: %f" % (train_metric_mean, train_metric_variance))

        metric_vals[metric] = {
            'test': {
                'mean': test_metric_mean, 'variance': test_metric_variance
            }, 'train': {
                'mean': train_metric_mean, 'variance': train_metric_variance
            }}

    return metric_vals

In [42]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate

regressor = XGBRegressor()
print_cval_score(cross_validate(regressor, x, y, groups=None, scoring='neg_mean_squared_error', cv=8, n_jobs=8))

Score
Test	 Mean: -4456459.834287	 Variance: 5240439.585547
Train	 Mean: -813175.664892	 Variance: 87115.655067




{'score': {'test': {'mean': -4456459.83428715, 'variance': 5240439.585547046},
  'train': {'mean': -813175.6648917147, 'variance': 87115.65506659918}}}

In [55]:
x_l[x_l.select_dtypes(exclude=['bool', 'float', 'int']).keys()] = x_l.select_dtypes(exclude=['bool', 'float', 'int']).astype(bool)

In [68]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate

regressor = XGBRegressor()
print_cval_score(cross_validate(regressor, agx, agy, groups=None, scoring='neg_mean_squared_error', cv=8, n_jobs=8))

Score
Test	 Mean: -500671.291528	 Variance: 319566.438097
Train	 Mean: -281746.127425	 Variance: 32306.805622




{'score': {'test': {'mean': -500671.29152841057,
   'variance': 319566.4380968852},
  'train': {'mean': -281746.12742526707, 'variance': 32306.805621905867}}}