In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Resampling

### Resampling Energy Consumption

In [191]:
# Load the raw train/test CSVs; index_col=0 uses the first CSV column as the index.
train = pd.read_csv('../data/train.csv', index_col=0)
test = pd.read_csv('../data/test.csv', index_col=0)

def resample_value(data: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Aggregate ``Value`` to daily sums, separately for every ForecastId.

    For each distinct ``ForecastId`` the matching rows are grouped by
    ``SiteId`` and resampled to calendar days ('D'), summing ``Value``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``ForecastId``, ``SiteId``, ``Timestamp`` and ``Value``
        columns. ``Timestamp`` is parsed with ``pd.to_datetime``. The input
        frame is not modified.
    verbose : bool, default True
        Print one progress line per ForecastId.

    Returns
    -------
    pd.DataFrame
        Columns ``SiteId``, ``Timestamp`` and ``Value``. The per-forecast
        pieces are stacked without renumbering, so index labels restart at 0
        for every ForecastId. An empty DataFrame is returned when there is
        nothing to resample.
    """
    daily_frames = []
    for forecast_id in data["ForecastId"].unique():
        if verbose:
            print("Resampling ForecastId: ", forecast_id)
        forecast_values = data[data["ForecastId"] == forecast_id].copy()
        forecast_values['Timestamp'] = pd.to_datetime(forecast_values['Timestamp'])
        daily = (
            forecast_values
            .set_index('Timestamp')
            .groupby('SiteId')
            .resample('D')['Value']
            .sum()
            .reset_index()
        )
        if not daily.empty:
            daily_frames.append(daily)

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(daily_frames)

# Daily per-site sums of Value for the train and test splits.
resampled_train = resample_value(train)
resampled_test = resample_value(test)


Resampling ForecastId:  1
Resampling ForecastId:  2
Resampling ForecastId:  3
Resampling ForecastId:  4
Resampling ForecastId:  5
Resampling ForecastId:  6
Resampling ForecastId:  7
Resampling ForecastId:  8
Resampling ForecastId:  9
Resampling ForecastId:  10
Resampling ForecastId:  11
Resampling ForecastId:  12
Resampling ForecastId:  13
Resampling ForecastId:  14
Resampling ForecastId:  15
Resampling ForecastId:  16
Resampling ForecastId:  17
Resampling ForecastId:  18
Resampling ForecastId:  19
Resampling ForecastId:  20
Resampling ForecastId:  21
Resampling ForecastId:  22
Resampling ForecastId:  23
Resampling ForecastId:  24
Resampling ForecastId:  25
Resampling ForecastId:  26
Resampling ForecastId:  27
Resampling ForecastId:  28
Resampling ForecastId:  29
Resampling ForecastId:  30
Resampling ForecastId:  31
Resampling ForecastId:  32
Resampling ForecastId:  33
Resampling ForecastId:  34
Resampling ForecastId:  35
Resampling ForecastId:  36
Resampling ForecastId:  37
Resampling

In [192]:
# Persist the daily frames. index = True also writes the row index as the first
# CSV column; the "Merge Data" section reads these files back with index_col=None.
resampled_train.to_csv('../data/resampled/resampled_train.csv', index = True)
resampled_test.to_csv('../data/resampled/resampled_test.csv', index = True)

### Resampling Temperature

In [190]:
# Build one daily median temperature per site from the raw weather readings.
weather = pd.read_csv('../data/weather.csv', index_col=0)

resampled_weather = weather.copy()
resampled_weather['Timestamp'] = pd.to_datetime(resampled_weather['Timestamp'])
resampled_weather = (
    resampled_weather
    # Sort so that, within each (Timestamp, SiteId) pair, the row with the
    # smallest Distance comes first and is the one kept below.
    .sort_values(['Timestamp', 'SiteId', 'Distance'])
    # The index read from the CSV is not needed any further.
    .reset_index(drop=True)
    .drop_duplicates(['Timestamp', 'SiteId'], keep="first")
    .set_index('Timestamp')
    .groupby('SiteId')
    .resample('D')['Temperature']
    .median()
)
# The result is deliberately left as a Series indexed by (SiteId, Timestamp):
# saving it together with its index yields SiteId, Timestamp and Temperature
# columns without an additional index column.
resampled_weather.head()

SiteId  Timestamp 
1       2013-12-31    -8.45
        2014-01-01    -9.25
        2014-01-02   -10.95
        2014-01-03   -13.00
        2014-01-04    -4.00
Name: Temperature, dtype: float64

In [193]:
# index = True writes the (SiteId, Timestamp) index levels as CSV columns next to Temperature.
resampled_weather.to_csv('../data/resampled/resampled_weather.csv', index = True)

## Merge Data

In [373]:
# Reload the daily frames from disk. With index_col=None every CSV column,
# including the saved index column, is read back as a regular column.
resampled_train = pd.read_csv('../data/resampled/resampled_train.csv', index_col=None)
resampled_test = pd.read_csv('../data/resampled/resampled_test.csv', index_col=None)

### Process timestamp format

In [374]:
def process_time(df):
    """Normalise ``Timestamp`` to whole days and add calendar feature columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``Timestamp`` column that ``pd.to_datetime`` can parse.
        The caller's frame is left unchanged.

    Returns
    -------
    pd.DataFrame
        A new frame with ``Timestamp`` (floored to the day) as the first
        column, followed by the remaining input columns and the added
        ``day_of_week``, ``day_of_month``, ``day_of_year``, ``month`` and
        ``year`` columns.
    """
    day_stamps = pd.to_datetime(df['Timestamp']).dt.floor('D')
    # assign() returns a new frame, so the input DataFrame is never mutated.
    out = df.assign(Timestamp=day_stamps).set_index('Timestamp')

    out['day_of_week'] = out.index.dayofweek
    out['day_of_month'] = out.index.day
    out['day_of_year'] = out.index.dayofyear
    out['month'] = out.index.month
    out['year'] = out.index.year

    # Move Timestamp back from the index into a regular (leading) column.
    return out.reset_index(level=0)

# Add the calendar feature columns to both splits.
processed_train = process_time(resampled_train)
processed_test = process_time(resampled_test)

### Process weather data

In [375]:
# Reload the daily temperatures; index_col=None keeps every CSV column as a regular column.
resampled_weather = pd.read_csv('../data/resampled/resampled_weather.csv', index_col=None)

In [376]:
# Parse Timestamp and floor it to whole days, the same normalisation process_time applies,
# so the two frames can be joined on Timestamp.
resampled_weather['Timestamp'] = pd.to_datetime(resampled_weather['Timestamp']).dt.floor('D')


In [377]:
def add_weather(df: pd.DataFrame, weather: pd.DataFrame):
    """Left-join ``weather`` onto ``df`` using (Timestamp, SiteId) as the key.

    Rows of ``df`` that repeat a (Timestamp, SiteId) pair are reduced to their
    first occurrence before joining. An AssertionError is raised if the join
    changes the number of rows.
    """
    join_keys = ['Timestamp', 'SiteId']

    deduplicated = df.drop_duplicates(join_keys, keep='first')
    merged = deduplicated.merge(weather, how='left', on=join_keys)

    # A left join may only add columns here, never rows.
    assert len(deduplicated) == len(merged), 'New Length must match original length'

    return merged

# Attach the daily Temperature column to both splits.
processed_train = add_weather(processed_train, resampled_weather)
processed_test = add_weather(processed_test, resampled_weather)

In [378]:
# Remove every training site whose Temperature is entirely missing in the
# train split or in the test split; such a site is dropped from both frames.
site_ids = processed_train["SiteId"].unique()

# Sites that have at least one non-missing Temperature, per split.
train_sites_with_temp = processed_train.loc[
    processed_train['Temperature'].notna(), 'SiteId'
].unique()
test_sites_with_temp = processed_test.loc[
    processed_test['Temperature'].notna(), 'SiteId'
].unique()

# A training site with no rows at all in the test split has no test
# temperature either, so it is dropped as well. Sites that occur only in the
# test split are left untouched.
sites_with_temp = np.intersect1d(train_sites_with_temp, test_sites_with_temp)
sites_without_temp = np.setdiff1d(site_ids, sites_with_temp)

processed_train = processed_train[~processed_train['SiteId'].isin(sites_without_temp)]
processed_test = processed_test[~processed_test['SiteId'].isin(sites_without_temp)]

### Process metadata

In [379]:
meta = pd.read_csv('../data/metadata.csv')

site_ids = set(meta['SiteId'])

# Metadata column holding the day-off flag for each day_of_week code
# (Monday=0 ... Sunday=6, the numbering of pandas' dayofweek).
day_off_columns = {
    'MondayIsDayOff': 0,
    'TuesdayIsDayOff': 1,
    'WednesdayIsDayOff': 2,
    'ThursdayIsDayOff': 3,
    'FridayIsDayOff': 4,
    'SaturdayIsDayOff': 5,
    'SundayIsDayOff': 6,
}

# Reshape the per-site row into one row per (SiteId, day_of_week) in a single
# melt instead of building and concatenating a small frame for every site.
all_meta = meta.melt(
    id_vars=['SiteId', 'BaseTemperature', 'Surface'],
    value_vars=list(day_off_columns),
    var_name='day_of_week',
    value_name='is_day_off',
)
all_meta['day_of_week'] = all_meta['day_of_week'].map(day_off_columns)
all_meta = all_meta.astype(
    {'is_day_off': float, 'BaseTemperature': float, 'Surface': float}
)
all_meta = all_meta[['SiteId', 'day_of_week', 'is_day_off', 'BaseTemperature', 'Surface']]

# Find the days off in the training and testing data.
# validate='many_to_one' makes the merge fail loudly if the metadata holds
# more than one row for a site, instead of silently duplicating data rows.
processed_train = processed_train.merge(
    all_meta, how = 'left', on = ['SiteId', 'day_of_week'], validate = 'many_to_one'
)
processed_test = processed_test.merge(
    all_meta, how = 'left', on = ['SiteId', 'day_of_week'], validate = 'many_to_one'
)

In [380]:
# Discard the 'Unnamed: 0' column from both splits; it is not used as a feature.
processed_train = processed_train.drop(columns='Unnamed: 0')
processed_test = processed_test.drop(columns='Unnamed: 0')

In [381]:
processed_train.head()

Unnamed: 0,Timestamp,SiteId,Value,day_of_week,day_of_month,day_of_year,month,year,Temperature,is_day_off,BaseTemperature,Surface
0,2014-09-03,1,909655.5,2,3,246,9,2014,20.0,0.0,18.0,1387.205119
1,2014-09-04,1,1748273.0,3,4,247,9,2014,22.25,0.0,18.0,1387.205119
2,2014-09-05,1,0.0,4,5,248,9,2014,23.65,0.0,18.0,1387.205119
3,2014-09-06,1,0.0,5,6,249,9,2014,20.0,1.0,18.0,1387.205119
4,2014-09-07,1,0.0,6,7,250,9,2014,16.5,1.0,18.0,1387.205119


### Process holidays

In [382]:
# Load the holiday calendar (semicolon-separated) and floor Timestamp to whole days.
holidays = pd.read_csv('../data/holidays.csv', delimiter=';')
holidays["Timestamp"] = pd.to_datetime(holidays["Timestamp"]).dt.floor('D')
# Timestamp intentionally stays a regular column: process_holidays reads it
# from each holiday row by name.
holidays.head()

Unnamed: 0,Timestamp,Holiday,SiteId
0,2016-12-23,Christmas Eve (Observed),1
1,2016-12-24,Christmas Eve,1
2,2017-07-04,Independence Day,1
3,2014-11-04,Election Day,1
4,2016-09-05,Labor Day,12


In [383]:
# Both splits start with the two holiday indicator columns set to 0.
for split_frame in (processed_train, processed_test):
    split_frame['is_holiday'] = 0
    split_frame['is_holiday_elsewhere'] = 0


In [385]:

def process_holidays(df: pd.DataFrame, holidays: pd.DataFrame):
    """Flag rows of ``df`` that fall on a holiday, at the row's site or elsewhere.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``Timestamp`` and ``SiteId`` columns plus the indicator
        columns ``is_holiday`` and ``is_holiday_elsewhere`` (callers
        initialise both to 0). It is modified in place.
    holidays : pd.DataFrame
        Frame with ``Timestamp`` and ``SiteId`` columns, one row per holiday.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, where

        * ``is_holiday`` is 1 when a holiday row has the row's Timestamp and
          SiteId;
        * ``is_holiday_elsewhere`` is 1 when a holiday row has the row's
          Timestamp but a different SiteId and the row itself is not flagged
          as a holiday. The two flags are never both 1.
    """
    # Each distinct (Timestamp, SiteId) pair only needs to be applied once.
    holiday_pairs = list(
        holidays[['Timestamp', 'SiteId']]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )
    row_day = df['Timestamp']
    row_site = df['SiteId']

    for day, site in holiday_pairs:
        df.loc[(row_day == day) & (row_site == site), "is_holiday"] = 1

    # The second pass needs the completed is_holiday column, hence a separate loop.
    not_own_holiday = df['is_holiday'] != 1
    for day, site in holiday_pairs:
        df.loc[
            (row_day == day) & (row_site != site) & not_own_holiday,
            "is_holiday_elsewhere"
        ] = 1

    # Clear any "elsewhere" flag on rows that are holidays at their own site.
    # This operates on the frame being processed, not on a notebook-level global.
    df.loc[
        (df['is_holiday_elsewhere'] == 1) & (df['is_holiday'] == 1),
        "is_holiday_elsewhere"
    ] = 0

    return df


# process_holidays returns the frame it was given, so df_train / df_test refer to
# the same objects as processed_train / processed_test.
df_train = process_holidays(processed_train, holidays)
df_test = process_holidays(processed_test, holidays)



In [386]:
df_train.head()

Unnamed: 0,Timestamp,SiteId,Value,day_of_week,day_of_month,day_of_year,month,year,Temperature,is_day_off,BaseTemperature,Surface,is_holiday,is_holiday_elsewhere
0,2014-09-03,1,909655.5,2,3,246,9,2014,20.0,0.0,18.0,1387.205119,0,0
1,2014-09-04,1,1748273.0,3,4,247,9,2014,22.25,0.0,18.0,1387.205119,0,0
2,2014-09-05,1,0.0,4,5,248,9,2014,23.65,0.0,18.0,1387.205119,0,0
3,2014-09-06,1,0.0,5,6,249,9,2014,20.0,1.0,18.0,1387.205119,0,0
4,2014-09-07,1,0.0,6,7,250,9,2014,16.5,1.0,18.0,1387.205119,0,0


In [387]:
# Keep only rows without missing values and with a non-zero Value, in both splits.
df_train = df_train.dropna().loc[lambda frame: frame["Value"] != 0]
df_test = df_test.dropna().loc[lambda frame: frame["Value"] != 0]

In [388]:
# Write the final feature frames; index = True includes the row index as the first CSV column.
df_train.to_csv('../data/processed/processed_train.csv', index = True)
df_test.to_csv('../data/processed/processed_test.csv', index = True)