In [106]:
from typing import Sequence, Tuple
import pandas as pd

class MockApi:
    def __init__(self):
        """
        YOU MUST UPDATE THE FIRST THREE LINES of this method.
        They've been intentionally left in an invalid state.

        Variables to set:
            input_paths: a list of two or more paths to the csv files to be served
            group_id_column: the column that identifies which groups of rows the API should serve.
                A call to iter_test serves all rows of all dataframes with the current group ID value.
            export_group_id_column: if true, the dataframes iter_test serves will include the group_id_column values.
        """
        self.input_paths: Sequence[str] = ['downloads/example_test_files/test.csv',
                                   'downloads/example_test_files/revealed_targets.csv',
                                   'downloads/example_test_files/client.csv',
                                   'downloads/example_test_files/historical_weather.csv',
                                   'downloads/example_test_files/forecast_weather.csv',
                                   'downloads/example_test_files/electricity_prices.csv',
                                   'downloads/example_test_files/gas_prices.csv',
                                   'downloads/example_test_files/sample_submission.csv']
        self.group_id_column: str = 'data_block_id'
        self.export_group_id_column: bool = False
        # iter_test is only designed to support at least two dataframes, such as test and sample_submission
        assert len(self.input_paths) >= 2

        self._status = 'initialized'
        self.predictions = []

    def iter_test(self) -> Tuple[pd.DataFrame]:
        """
        Loads all the dataframes specified in self.input_paths,
        then yields all rows in those dataframes that equal the current self.group_id_column value.
        """
        if self._status != 'initialized':

            raise Exception('WARNING: the real API can only iterate over `iter_test()` once.')

        dataframes = []
        for pth in self.input_paths:
            df = pd.read_csv(pth, low_memory=False)
            for column in df.columns:
                if 'date' in column:
                    print(f'Transforming {pth}[{column}] from {df[column].dtype} to datetime...')
                    if 'forecast' in pth:
                        print(f'Stripping incorrect timezone adjustments')
                        df[column] = df[column].str[:-6]
                    df[column] = pd.to_datetime(df[column], utc=False).dt.tz_localize('Europe/Tallinn')
                    
            dataframes.append(df)
        group_order = dataframes[0][self.group_id_column].drop_duplicates().tolist()
        dataframes = [df.set_index(self.group_id_column) for df in dataframes]

        for group_id in group_order:
            self._status = 'prediction_needed'
            current_data = []
            for df in dataframes:
                cur_df = df.loc[group_id].copy()
                # returning single line dataframes from df.loc requires special handling
                if not isinstance(cur_df, pd.DataFrame):
                    cur_df = pd.DataFrame({a: b for a, b in zip(cur_df.index.values, cur_df.values)}, index=[group_id])
                    cur_df.index.name = self.group_id_column
                cur_df = cur_df.reset_index(drop=not self.export_group_id_column)
                current_data.append(cur_df)
            yield tuple(current_data)

            while self._status != 'prediction_received':
                print('You must call `predict()` successfully before you can continue with `iter_test()`', flush=True)
                yield None

        with open('submission.csv', 'w') as f_open:
            pd.concat(self.predictions).to_csv(f_open, index=False)
        self._status = 'finished'

    def predict(self, user_predictions: pd.DataFrame):
        """
        Accepts and stores the user's predictions and unlocks iter_test once that is done
        """
        if self._status == 'finished':
            raise Exception('You have already made predictions for the full test set.')
        if self._status != 'prediction_needed':
            raise Exception('You must get the next test sample from `iter_test()` first.')
        if not isinstance(user_predictions, pd.DataFrame):
            raise Exception('You must provide a DataFrame.')

        self.predictions.append(user_predictions)
        self._status = 'prediction_received'

def make_env():
    return MockApi()

In [107]:
import datetime

def naive_predict(test_row, new_revealed_targets):
    last_known = new_revealed_targets[(new_revealed_targets.is_consumption == test_row.is_consumption) &
                                      (new_revealed_targets.prediction_unit_id == test_row.prediction_unit_id) &
                                      (new_revealed_targets.datetime <= test_row.prediction_datetime - datetime.timedelta(days=2))]

    last_known = last_known[last_known.datetime == last_known.datetime.max()]
    
    if len(last_known) == 0:
        last_known = 0
    else:
        last_known = last_known.target.mean()
    return last_known

def naive_predict_batch(test_batch, new_revealed_targets):
    target_series = test_batch.apply(lambda test_row: naive_predict(test_row, new_revealed_targets), axis=1)
    target_series.name = 'target'
    predict = pd.concat([test_batch['row_id'], target_series], axis=1)
    predict.set_index('row_id', inplace=True)
    return predict

In [108]:
env = make_env()
iter_test = env.iter_test()

for (test, revealed_targets, client, historical_weather,
     forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    print(len(test), test.prediction_datetime.min(), test.prediction_datetime.max())
    print(len(revealed_targets), revealed_targets.datetime.min(), revealed_targets.datetime.max())
    env.predict(naive_predict_batch(test, revealed_targets))


Transforming downloads/example_test_files/test.csv[prediction_datetime] from object to datetime...
Transforming downloads/example_test_files/revealed_targets.csv[datetime] from object to datetime...
Transforming downloads/example_test_files/client.csv[date] from object to datetime...
Transforming downloads/example_test_files/historical_weather.csv[datetime] from object to datetime...
Transforming downloads/example_test_files/forecast_weather.csv[origin_datetime] from object to datetime...
Stripping incorrect timezone adjustments
Transforming downloads/example_test_files/forecast_weather.csv[forecast_datetime] from object to datetime...
Stripping incorrect timezone adjustments
Transforming downloads/example_test_files/electricity_prices.csv[forecast_date] from object to datetime...
Transforming downloads/example_test_files/electricity_prices.csv[origin_date] from object to datetime...
Transforming downloads/example_test_files/gas_prices.csv[forecast_date] from object to datetime...
Tran

In [109]:
pd.concat([prediction.describe() for prediction in env.predictions], axis=1)

Unnamed: 0,target,target.1,target.2,target.3
count,3120.0,3120.0,3120.0,3120.0
mean,397.055628,384.236975,378.323915,392.325769
std,1065.377391,1039.80739,1010.882205,1066.244326
min,0.0,0.0,0.0,0.0
25%,10.91775,11.02125,9.0755,9.61
50%,70.727,63.564,61.553,65.132
75%,334.855,284.275,276.308,328.11875
max,10689.082,11013.487,10265.362,11146.496


In [110]:
env.predictions[0]

Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
2005872,2.675
2005873,471.887
2005874,0.000
2005875,5.414
2005876,13.899
...,...
2008987,188.122
2008988,0.000
2008989,32.809
2008990,0.000
