In [21]:
from typing import Sequence, Tuple
import pandas as pd

class MockApi:
    def __init__(self):
        '''
        YOU MUST UPDATE THE FIRST THREE LINES of this method.
        They've been intentionally left in an invalid state.

        Variables to set:
            input_paths: a list of two or more paths to the csv files to be served
            group_id_column: the column that identifies which groups of rows the API should serve.
                A call to iter_test serves all rows of all dataframes with the current group ID value.
            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
        '''
        self.input_paths: Sequence[str] = ['downloads/example_test_files/test.csv',
                                   'downloads/example_test_files/revealed_targets.csv',
                                   'downloads/example_test_files/client.csv',
                                   'downloads/example_test_files/historical_weather.csv',
                                   'downloads/example_test_files/forecast_weather.csv',
                                   'downloads/example_test_files/electricity_prices.csv',
                                   'downloads/example_test_files/gas_prices.csv',
                                   'downloads/example_test_files/sample_submission.csv']
        self.group_id_column: str = 'data_block_id'
        self.export_group_id_column: bool = False
        # iter_test is only designed to support at least two dataframes, such as test and sample_submission
        assert len(self.input_paths) >= 2

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        '''
        Loads all of the dataframes specified in self.input_paths,
        then yields all rows in those dataframes that equal the current self.group_id_column value.
        '''
        if self._status != 'initialized':

            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        dataframes = []
        for pth in self.input_paths:
            dataframes.append(pd.read_csv(pth, low_memory=False))
        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
        dataframes = [df.set_index(self.group_id_column) for df in dataframes]

        for group_id in group_order:
            self._status = 'prediction_needed'
            current_data = []
            for df in dataframes:
                cur_df = df.loc[group_id].copy()
                # returning single line dataframes from df.loc requires special handling
                if not isinstance(cur_df, pd.DataFrame):
                    cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
                    cur_df.index.name = self.group_id_column
                cur_df = cur_df.reset_index(drop=not(self.export_group_id_column))
                current_data.append(cur_df)
            yield tuple(current_data)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        with open('submission.csv', 'w') as f_open:
            pd.concat(self.predictions).to_csv(f_open, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        '''
        Accepts and stores the user's predictions and unlocks iter_test once that is done
        '''
        if self._status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if self._status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'

def make_env():
    return MockApi()

In [22]:
import datetime

def naive_predict(test_row, revealed_targets):
    last_known = revealed_targets[(revealed_targets.is_consumption == test_row.is_consumption) &
                                  (revealed_targets.prediction_unit_id == test_row.prediction_unit_id) &
                                  (revealed_targets.datetime <= test_row.prediction_datetime - datetime.timedelta(days=2))]
    
    last_known = last_known[last_known.datetime == last_known.datetime.max()]
    
    if len(last_known) == 0:
        last_known = 0
    else:
        last_known = last_known.target.mean()
    return last_known

def naive_predict_batch(test_batch, revealed_targets):
    target_series = test_batch.apply(lambda test_row: naive_predict(test_row, revealed_targets), axis=1)
    target_series.name = 'target'
    predict = pd.concat([test_batch['row_id'], target_series], axis=1)
    predict.set_index('row_id', inplace=True)
    return predict

In [23]:
env = make_env()
iter_test = env.iter_test()

count = 0
batch_prediction = None
for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    
    test['prediction_datetime'] = pd.to_datetime(test['prediction_datetime'])
    revealed_targets['datetime'] = pd.to_datetime(revealed_targets['datetime'])
    
    print(len(test), len(revealed_targets))
    
    batch_prediction = naive_predict_batch(test, revealed_targets)
    sample_prediction['target'] = 0
    env.predict(batch_prediction)
    count += 1

3120 3168
3120 3168
3120 3120
3120 3120


In [24]:
for prediction in env.predictions:
    print(len(prediction))
    print(prediction.describe())

3120
             target
count   3120.000000
mean     397.055628
std     1065.377391
min        0.000000
25%       10.917750
50%       70.727000
75%      334.855000
max    10689.082000
3120
             target
count   3120.000000
mean     384.236975
std     1039.807390
min        0.000000
25%       11.021250
50%       63.564000
75%      284.275000
max    11013.487000
3120
             target
count   3120.000000
mean     378.323915
std     1010.882205
min        0.000000
25%        9.075500
50%       61.553000
75%      276.308000
max    10265.362000
3120
             target
count   3120.000000
mean     392.325769
std     1066.244326
min        0.000000
25%        9.610000
50%       65.132000
75%      328.118750
max    11146.496000


In [25]:
batch_prediction

Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
2015232,2.073
2015233,503.735
2015234,0.000
2015235,4.986
2015236,23.590
...,...
2018347,188.167
2018348,0.000
2018349,31.484
2018350,0.000
