In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
training_data = pd.read_csv(f'../dataset/power-laws-forecasting-energy-consumption-training-data.csv', delimiter=';')
training_data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105
...,...,...,...,...,...
6559825,1127574,300,2017-09-22T18:45:00+00:00,6719,7740.955427
6559826,4695712,300,2017-09-23T17:45:00+00:00,6719,7133.180234
6559827,978979,300,2017-09-24T22:45:00+00:00,6719,7339.789365
6559828,6317358,300,2017-09-25T08:45:00+00:00,6719,18873.744081


In [3]:
weather_data = pd.read_csv(f'../dataset/power-laws-forecasting-energy-consumption-weather.csv', delimiter=';')
weather_data

Unnamed: 0,Timestamp,Temperature,Distance,SiteId
0,2017-03-03T19:00:00+00:00,10.6,27.489346,51
1,2017-03-03T19:20:00+00:00,11.0,28.663082,51
2,2017-03-03T20:00:00+00:00,6.3,28.307039,51
3,2017-03-03T21:55:00+00:00,10.0,29.797449,51
4,2017-03-03T23:00:00+00:00,5.4,28.307039,51
...,...,...,...,...
3957030,2016-09-11T11:00:00+00:00,25.9,28.307039,51
3957031,2016-09-11T11:20:00+00:00,27.0,27.489346,51
3957032,2016-09-11T12:00:00+00:00,27.1,28.307039,51
3957033,2016-09-11T15:50:00+00:00,28.0,27.489346,51


In [4]:
forecast_data = pd.read_csv(f'../dataset/power-laws-forecasting-energy-consumption-submission-forecast-period.csv', delimiter=';')
forecast_data

Unnamed: 0,ForecastId,ForecastPeriodNS
0,123,900000000000
1,264,900000000000
2,596,900000000000
3,914,900000000000
4,1053,900000000000
...,...,...
6969,6387,900000000000
6970,6487,3600000000000
6971,6569,900000000000
6972,6777,900000000000


In [5]:
metadata = pd.read_csv(f'../dataset/power-laws-forecasting-energy-consumption-metadata.csv', delimiter=';')
metadata

Unnamed: 0,SiteId,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff
0,207,7964.873347,30.0,18.0,False,False,False,False,False,True,True
1,7,15168.125971,30.0,18.0,False,False,False,False,False,True,True
2,74,424.340663,15.0,18.0,False,False,False,False,False,True,True
3,239,1164.822636,15.0,18.0,False,False,False,False,False,True,True
4,274,1468.246690,5.0,18.0,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...
262,192,11188.881545,15.0,18.0,False,False,False,False,False,True,True
263,58,1149.050606,15.0,18.0,False,False,False,False,False,True,True
264,123,5470.205018,15.0,18.0,False,False,False,False,False,True,True
265,122,6843.612340,15.0,18.0,False,False,False,False,False,True,True


In [6]:
holiday_data = pd.read_csv(f'../dataset/power-laws-forecasting-energy-consumption-holidays.csv', delimiter=';')
holiday_data

Unnamed: 0,Date,Holiday,SiteId
0,2016-02-15,Washington's Birthday,1
1,2017-05-29,Memorial Day,1
2,2017-11-23,Thanksgiving Day,1
3,2017-12-29,New Years Eve Shift,1
4,2017-12-31,New Years Eve,1
...,...,...,...
8382,2015-12-26,Boxing Day,303
8383,2016-05-01,International Workers' Day,304
8384,2015-04-25,Liberation Day,304
8385,2016-03-28,Easter Monday,305


In [7]:
data = pd.merge(training_data, metadata, on='SiteId')
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623,1032.735063,15.0,18.0,False,False,False,False,False,True,True
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641,1032.735063,15.0,18.0,False,False,False,False,False,True,True
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666,1032.735063,15.0,18.0,False,False,False,False,False,True,True
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087,1032.735063,15.0,18.0,False,False,False,False,False,True,True
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105,1032.735063,15.0,18.0,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7800124,34,2015-02-16T04:50:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True
6559826,2014359,34,2015-02-16T07:20:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True
6559827,2568008,34,2015-02-16T08:35:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True
6559828,1915525,34,2015-02-16T09:20:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True


In [8]:
data = pd.merge(data, forecast_data, on='ForecastId')
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff,ForecastPeriodNS
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7800124,34,2015-02-16T04:50:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000
6559826,2014359,34,2015-02-16T07:20:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000
6559827,2568008,34,2015-02-16T08:35:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000
6559828,1915525,34,2015-02-16T09:20:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000


In [9]:
data['Date'] = pd.to_datetime(data['Timestamp']).dt.date
data


Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff,ForecastPeriodNS,Date
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7800124,34,2015-02-16T04:50:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16
6559826,2014359,34,2015-02-16T07:20:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16
6559827,2568008,34,2015-02-16T08:35:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16
6559828,1915525,34,2015-02-16T09:20:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16


In [10]:
data = pd.merge(data, holiday_data, on=['SiteId', 'Date'], how='left')
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff,ForecastPeriodNS,Date,Holiday
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20,
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7800124,34,2015-02-16T04:50:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,
6559826,2014359,34,2015-02-16T07:20:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,
6559827,2568008,34,2015-02-16T08:35:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,
6559828,1915525,34,2015-02-16T09:20:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,


In [11]:
data_merged = pd.merge(data, weather_data, on=['Timestamp', 'SiteId'], how='left')
data_merged

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff,ForecastPeriodNS,Date,Holiday,Temperature,Distance
0,4852050,42,2016-10-18T02:45:00+00:00,1087,26397.049623,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,,,
1,1638923,42,2016-10-18T11:45:00+00:00,1087,42958.364641,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,,,
2,5748910,42,2016-10-18T20:45:00+00:00,1087,27096.919666,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-18,,,
3,38199,42,2016-10-20T10:45:00+00:00,1087,50211.408087,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20,,,
4,1338204,42,2016-10-20T18:45:00+00:00,1087,50503.305105,1032.735063,15.0,18.0,False,False,False,False,False,True,True,3600000000000,2016-10-20,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6646744,7800124,34,2015-02-16T04:50:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,,,
6646745,2014359,34,2015-02-16T07:20:00+00:00,981,112.112558,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,,,
6646746,2568008,34,2015-02-16T08:35:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,,,
6646747,1915525,34,2015-02-16T09:20:00+00:00,981,411.079380,1868.542638,10.0,18.0,False,False,False,False,False,True,True,900000000000,2015-02-16,,,


In [12]:
data_merged['Index'] = data_merged.groupby(['SiteId', 'Timestamp']).cumcount() + 1

left_cols = [col for col in data.columns]
data_pivot = data_merged.pivot(index=left_cols, columns='Index', values=['Temperature', 'Distance']).reset_index()

data_pivot.columns = [f'{col[0]}_{col[1]}' if col[0] not in left_cols else col[0] for col in data_pivot.columns]

data_pivot

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Date,Holiday,Temperature_1,Temperature_2,Temperature_3,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4
0,0,143,2017-08-12T20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,2017-08-12,,,,,,,,,
1,1,300,2017-07-28T14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,2017-07-28,,,,,,,,,
2,5,201,2014-07-24T12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,2014-07-24,,,,,,,,,
3,6,271,2016-01-24T18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,2016-01-24,,,,,,,,,
4,7,276,2015-12-11T19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,2015-12-11,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01T16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,2017-01-01,,,,,,,,,
6559826,7869001,294,2016-08-11T17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,2016-08-11,,,,,,,,,
6559827,7869002,84,2016-09-08T19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,2016-09-08,,,,,,,,,
6559828,7869004,302,2017-08-03T14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,2017-08-03,,,,,,,,,


In [13]:
def dataoveriew(data, message):
    print(f'{message}:\n')
    print('Number of rows: ', data.shape[0])
    print("\nNumber of features:", data.shape[1])
    print("\nData Features:")
    print(data.columns.tolist())
    print("\nMissing values:", data.isnull().sum().values.sum())
    print("\nUnique values:")
    print(data.nunique())

dataoveriew(data_pivot, 'Overview of the dataset')

Overview of the dataset:

Number of rows:  6559830

Number of features: 26

Data Features:
['obs_id', 'SiteId', 'Timestamp', 'ForecastId', 'Value', 'Surface', 'Sampling', 'BaseTemperature', 'MondayIsDayOff', 'TuesdayIsDayOff', 'WednesdayIsDayOff', 'ThursdayIsDayOff', 'FridayIsDayOff', 'SaturdayIsDayOff', 'SundayIsDayOff', 'ForecastPeriodNS', 'Date', 'Holiday', 'Temperature_1', 'Temperature_2', 'Temperature_3', 'Temperature_4', 'Distance_1', 'Distance_2', 'Distance_3', 'Distance_4']

Missing values: 58288521

Unique values:
obs_id               6559830
SiteId                   267
Timestamp             598747
ForecastId              6974
Value                1933852
Surface                  267
Sampling                   4
BaseTemperature            3
MondayIsDayOff             1
TuesdayIsDayOff            1
WednesdayIsDayOff          1
ThursdayIsDayOff           1
FridayIsDayOff             2
SaturdayIsDayOff           2
SundayIsDayOff             2
ForecastPeriodNS           3
Date   

In [29]:
fill_values = {
    'Temperature_1': data_pivot['Temperature_1'].mean(),  
    'Temperature_2': data_pivot['Temperature_2'].mean(),  
    'Temperature_3': data_pivot['Temperature_3'].mean(),  
    'Temperature_4': data_pivot['Temperature_4'].mean(),  
    'Distance_1': 0,  
    'Distance_2': 0,  
    'Distance_3': 0,  
    'Distance_4': 0,  
    'Holiday': ''
}
data = data_pivot.fillna(value=fill_values)
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Date,Holiday,Temperature_1,Temperature_2,Temperature_3,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4
0,0,143,2017-08-12T20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,2017-08-12,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
1,1,300,2017-07-28T14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,2017-07-28,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
2,5,201,2014-07-24T12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,2014-07-24,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
3,6,271,2016-01-24T18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,2016-01-24,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
4,7,276,2015-12-11T19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,2015-12-11,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01T16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,2017-01-01,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
6559826,7869001,294,2016-08-11T17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,2016-08-11,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
6559827,7869002,84,2016-09-08T19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,2016-09-08,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0
6559828,7869004,302,2017-08-03T14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,2017-08-03,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0


In [30]:
data['Date'] = pd.to_datetime(data['Date'])
data['DateInWeek'] = data['Date'].dt.day_name()
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Holiday,Temperature_1,Temperature_2,Temperature_3,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4,DateInWeek
0,0,143,2017-08-12T20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Saturday
1,1,300,2017-07-28T14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday
2,5,201,2014-07-24T12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday
3,6,271,2016-01-24T18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday
4,7,276,2015-12-11T19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01T16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday
6559826,7869001,294,2016-08-11T17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday
6559827,7869002,84,2016-09-08T19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday
6559828,7869004,302,2017-08-03T14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday


In [31]:
bins = list(range(0, 25))
labels = ['{}-{}'.format(i, i + 1) for i in bins[:-1]]

In [32]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'])
data['Time'] = data['Timestamp'].dt.time
#data = data.drop(columns=['Timestamp'])
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Temperature_1,Temperature_2,Temperature_3,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4,DateInWeek,Time
0,0,143,2017-08-12 20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Saturday,20:30:00
1,1,300,2017-07-28 14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday,14:45:00
2,5,201,2014-07-24 12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,12:30:00
3,6,271,2016-01-24 18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday,18:40:00
4,7,276,2015-12-11 19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday,19:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01 16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday,16:00:00
6559826,7869001,294,2016-08-11 17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,17:00:00
6559827,7869002,84,2016-09-08 19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,19:55:00
6559828,7869004,302,2017-08-03 14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,13.495891,13.126044,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,14:00:00


In [35]:
data['Hour'] = data['Time'].apply(lambda x: x.hour)
data['TimeInterval'] = pd.cut(data['Hour'], bins=bins, labels=labels, right=False)
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Temperature_3,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4,DateInWeek,Time,Hour,TimeInterval
0,0,143,2017-08-12 20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Saturday,20:30:00,20,20-21
1,1,300,2017-07-28 14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday,14:45:00,14,14-15
2,5,201,2014-07-24 12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,12:30:00,12,12-13
3,6,271,2016-01-24 18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday,18:40:00,18,18-19
4,7,276,2015-12-11 19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Friday,19:05:00,19,19-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01 16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Sunday,16:00:00,16,16-17
6559826,7869001,294,2016-08-11 17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,17:00:00,17,17-18
6559827,7869002,84,2016-09-08 19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,19:55:00,19,19-20
6559828,7869004,302,2017-08-03 14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,13.607127,9.178482,0.0,0.0,0.0,0.0,Thursday,14:00:00,14,14-15


In [36]:
def hour_to_part(hour):
    if (6<=hour<=12): return "Morning" 
    elif (12<hour<=18): return "Afternoon"
    elif (18<hour<=24): return "Evening"
    else: return "Night"
 
data['PartOfDay'] = data['Hour'].apply(hour_to_part)
data

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,...,Temperature_4,Distance_1,Distance_2,Distance_3,Distance_4,DateInWeek,Time,Hour,TimeInterval,PartOfDay
0,0,143,2017-08-12 20:30:00+00:00,3145,289058.731029,65578.168728,15.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Saturday,20:30:00,20,20-21,Evening
1,1,300,2017-07-28 14:45:00+00:00,6718,37873.175383,3813.660202,15.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Friday,14:45:00,14,14-15,Afternoon
2,5,201,2014-07-24 12:30:00+00:00,4134,97.720896,11990.293968,15.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Thursday,12:30:00,12,12-13,Morning
3,6,271,2016-01-24 18:40:00+00:00,5926,133.267974,414.211270,5.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Sunday,18:40:00,18,18-19,Afternoon
4,7,276,2015-12-11 19:05:00+00:00,6065,11391.730143,621.276731,5.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Friday,19:05:00,19,19-20,Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6559825,7869000,50,2017-01-01 16:00:00+00:00,1249,1747.282040,394.637081,15.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Sunday,16:00:00,16,16-17,Afternoon
6559826,7869001,294,2016-08-11 17:00:00+00:00,6507,53407.792966,4737.008135,10.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Thursday,17:00:00,17,17-18,Afternoon
6559827,7869002,84,2016-09-08 19:55:00+00:00,1688,997.216224,2775.127719,5.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Thursday,19:55:00,19,19-20,Evening
6559828,7869004,302,2017-08-03 14:00:00+00:00,6960,45585.744325,21284.578622,15.0,18.0,False,False,...,9.178482,0.0,0.0,0.0,0.0,Thursday,14:00:00,14,14-15,Afternoon


In [37]:
num_splits = len(data) // 100000 + 1
    
for i in range(num_splits):
    
    start_row = i * 100000
    end_row = (i + 1) * 100000
    
    split_df = data.iloc[start_row:end_row]
    
    split_df.to_csv(f'../dataset/preprocessed-training-data/power-laws-forecasting-energy-consumption-preprocessed-training-data-split_{i + 1}.csv', index=False)