In [1]:
import pandas as pd

In [2]:
# Load the example test rows and the revealed targets shipped alongside them.
test = pd.read_csv(filepath_or_buffer='downloads/example_test_files/test.csv')
revealed_targets = pd.read_csv(filepath_or_buffer='downloads/example_test_files/revealed_targets.csv')

In [3]:
# Convert both timestamp columns to pandas datetimes; naive_predict relies on
# calling .time() on these values.
test['prediction_datetime'] = pd.to_datetime(test.prediction_datetime)
revealed_targets['datetime'] = pd.to_datetime(revealed_targets.datetime)

In [4]:
def naive_predict(test_row, revealed_targets):
    """Predict a target as the mean of matching revealed targets.

    A revealed row matches when it has the same ``data_block_id``,
    ``is_consumption`` and ``prediction_unit_id`` as ``test_row`` and its
    ``datetime`` falls on the same time of day as
    ``test_row.prediction_datetime``.

    Parameters
    ----------
    test_row : row-like
        Object exposing ``data_block_id``, ``is_consumption``,
        ``prediction_unit_id`` and ``prediction_datetime`` as attributes
        (for example a row produced by ``DataFrame.apply(axis=1)``).
    revealed_targets : pandas.DataFrame
        Frame with the three key columns above, a datetime-typed
        ``datetime`` column and a numeric ``target`` column.

    Returns
    -------
    float
        Mean of the matching targets (missing values are skipped), or
        ``0.0`` when nothing matches or every matching target is missing.
    """
    same_series = (
        (revealed_targets.data_block_id == test_row.data_block_id)
        & (revealed_targets.is_consumption == test_row.is_consumption)
        & (revealed_targets.prediction_unit_id == test_row.prediction_unit_id)
    )
    # The vectorised accessor replaces a per-element Python lambda; it also
    # tolerates NaT entries, which simply never compare equal to a clock time.
    same_clock_time = (
        revealed_targets.datetime.dt.time == test_row.prediction_datetime.time()
    )
    matches = revealed_targets[same_series & same_clock_time]

    if len(matches) == 0:
        return 0.0

    mean_target = matches.target.mean()
    # If every matching target is missing the mean is NaN; fall back to the
    # same default as the "no match" case instead of emitting a NaN prediction.
    return 0.0 if pd.isna(mean_target) else mean_target

def naive_predict_batch(test_batch, revealed_targets):
    """Apply ``naive_predict`` to every row of ``test_batch``.

    Parameters
    ----------
    test_batch : pandas.DataFrame
        Rows to predict for. Must contain ``row_id`` and ``data_block_id``
        plus whatever attributes ``naive_predict`` reads from each row.
    revealed_targets : pandas.DataFrame
        Passed through unchanged to ``naive_predict`` for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id``, ``data_block_id`` and ``target``, indexed like
        ``test_batch``.
    """
    if len(test_batch) == 0:
        # On a zero-row frame DataFrame.apply(axis=1) may hand back a copy of
        # the frame instead of a Series of per-row results, which would leave
        # the output without a proper `target` column. Build it explicitly.
        target_series = pd.Series(index=test_batch.index, dtype='float64')
    else:
        target_series = test_batch.apply(
            lambda test_row: naive_predict(test_row, revealed_targets), axis=1)
    target_series.name = 'target'
    return pd.concat([test_batch[['row_id', 'data_block_id']], target_series], axis=1)

In [5]:
# Run the naive baseline over the whole example test set and display the result.
predicts = naive_predict_batch(test_batch=test, revealed_targets=revealed_targets)
predicts

Unnamed: 0,row_id,data_block_id,target
0,2005872,634,2.675
1,2005873,634,471.887
2,2005874,634,0.000
3,2005875,634,5.414
4,2005876,634,13.899
...,...,...,...
12475,2018347,637,188.167
12476,2018348,637,0.000
12477,2018349,637,31.484
12478,2018350,637,0.000


In [6]:
# Summary statistics for each column of the prediction frame.
predicts.describe()

Unnamed: 0,row_id,data_block_id,target
count,12480.0,12480.0,12480.0
mean,2012112.0,635.5,387.985572
std,3602.81,1.118079,1045.722829
min,2005872.0,634.0,0.0
25%,2008992.0,634.75,10.32275
50%,2012112.0,635.5,64.6405
75%,2015231.0,636.25,300.99725
max,2018351.0,637.0,11146.496


In [32]:
# Probe how Europe/Tallinn localisation treats the hours around two DST
# switches: with 'NaT' for both options, hours that cannot be localised
# become NaT instead of raising.
pd.Series(pd.to_datetime([
    '2022-03-27 02:00:00', '2022-03-27 03:00:00', '2022-03-27 04:00:00',
    '2021-10-31 02:00:00', '2021-10-31 03:00:00', '2021-10-31 04:00:00',
])).dt.tz_localize(tz='Europe/Tallinn', ambiguous='NaT', nonexistent='NaT')

0   2022-03-27 02:00:00+02:00
1                         NaT
2   2022-03-27 04:00:00+03:00
3   2021-10-31 02:00:00+03:00
4                         NaT
5   2021-10-31 04:00:00+02:00
dtype: datetime64[ns, Europe/Tallinn]

In [40]:
# Load the full training table; low_memory=False makes pandas read the file in
# one pass rather than in chunks when inferring column dtypes.
train = pd.read_csv('downloads/train.csv', low_memory=False)

In [41]:
# Keep the parsed timestamps in their own Series before train.datetime is
# overwritten with localized values further down.
# NOTE(review): this must run before that overwrite; re-running it afterwards
# would pick up the already-localized column instead of the raw one.
native_dt = pd.to_datetime(train.datetime)

In [43]:
# Overwrite the datetime column with Europe/Tallinn-localized timestamps;
# wall-clock hours that cannot be localized are stored as NaT.
train['datetime'] = native_dt.dt.tz_localize(
    tz='Europe/Tallinn', ambiguous='NaT', nonexistent='NaT')

In [44]:
# Rows of prediction unit 0 whose original (pre-localization) timestamp lies in
# the 02:00-04:00 window of 2022-03-27, both ends included.
train.loc[
    native_dt.between(pd.to_datetime('2022-03-27 02:00:00'),
                      pd.to_datetime('2022-03-27 04:00:00'))
    & (train.prediction_unit_id == 0)
]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
634732,0,0,1,0.812,0,2022-03-27 02:00:00+02:00,207,634732,0
634733,0,0,1,490.703,1,2022-03-27 02:00:00+02:00,207,634733,0
634866,0,0,1,,0,NaT,207,634866,0
634867,0,0,1,,1,NaT,207,634867,0
635000,0,0,1,0.213,0,2022-03-27 04:00:00+03:00,207,635000,0
635001,0,0,1,488.763,1,2022-03-27 04:00:00+03:00,207,635001,0


In [62]:
# Rows of prediction unit 0 whose original (pre-localization) timestamp lies in
# the 02:00-04:00 window of 2021-10-31, both ends included.
train.loc[
    native_dt.between(pd.to_datetime('2021-10-31 02:00:00'),
                      pd.to_datetime('2021-10-31 04:00:00'))
    & (train.prediction_unit_id == 0)
]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
178812,0,0,1,0.0,0,2021-10-31 02:00:00+03:00,60,178812,0
178813,0,0,1,166.141,1,2021-10-31 02:00:00+03:00,60,178813,0
178938,0,0,1,,0,NaT,60,178938,0
178939,0,0,1,,1,NaT,60,178939,0
179064,0,0,1,0.0,0,2021-10-31 04:00:00+02:00,60,179064,0
179065,0,0,1,174.856,1,2021-10-31 04:00:00+02:00,60,179065,0
