# Import Libraries & Data

In [1]:
import pandas as pd
import numpy as np

import collections

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
directory = '../../data_PowerLaws_ForecastingEnergyConsumption/'

df_holidays = pd.read_csv(directory + 'holidays.csv', parse_dates=['Date'])
df_metadata = pd.read_csv(directory + 'metadata.csv')
df_submission_format = pd.read_csv(directory + 'submission_format.csv', parse_dates=['Timestamp'])
df_submission_frequency = pd.read_csv(directory + 'submission_frequency.csv')
df_train = pd.read_csv(directory + 'train.csv', parse_dates=['Timestamp'])
df_weather = pd.read_csv(directory + 'weather.csv', parse_dates=['Timestamp'])

In [None]:
# df_holidays: drop the unnecessary 'Unnamed: 0' column (presumably a stray index column
# saved with the CSV — TODO confirm).
# errors='ignore' keeps the cell idempotent: df_holidays is overwritten here, so re-running
# the cell after the column is already gone would otherwise raise KeyError.
df_holidays = df_holidays.drop('Unnamed: 0', axis=1, errors='ignore')

In [None]:
# df_metadata: cast each weekday "is day off" flag column to int
day_off_flag_columns = [
    'MondayIsDayOff', 'TuesdayIsDayOff', 'WednesdayIsDayOff', 'ThursdayIsDayOff',
    'FridayIsDayOff', 'SaturdayIsDayOff', 'SundayIsDayOff',
]
for flag_column in day_off_flag_columns:
    df_metadata[flag_column] = df_metadata[flag_column].astype(int)

In [None]:
# convert ForecastPeriodNS into whole minutes (60000000000 units per minute)
# there are three unique forecast time periods: 1440, 60, and 15 minutes
period_in_minutes = df_submission_frequency['ForecastPeriodNS'] / 60000000000
df_submission_frequency['ForecastPeriodMin'] = period_in_minutes.astype('int64')

In [None]:
# df_weather: drop the unnecessary 'Unnamed: 0' column (presumably a stray index column
# saved with the CSV — TODO confirm).
# errors='ignore' keeps the cell idempotent: df_weather is overwritten here, so re-running
# the cell after the column is already gone would otherwise raise KeyError.
df_weather = df_weather.drop('Unnamed: 0', axis=1, errors='ignore')

# Build ML DataFrame

**Steps to Creating ML DataFrame**

- Training data
    - Account for NaN (Interpolation)
- Merge
    - Meta data
    - Holidays
    - Weather
- Feature Engineering
    - Categorical Feature(s) Get Dummies for Holidays
    - Lagged features: t-1...

## [D] Create DataFrame

* FROM df_train

In [None]:
df_n_1 = df_train

In [None]:
# shape of df_n_1
shapes = collections.OrderedDict()
shapes['df_n_1'] = df_n_1.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

In [None]:
# sort values, inspect... see NaN requires interpolation
df_n_1 = df_n_1.sort_values(['SiteId', 'ForecastId', 'Timestamp'], axis=0, ascending=[True,True,True])
df_n_1.head()

In [None]:
# 6,974 unique ForecastId's
df_n_1['ForecastId'].unique().size

In [None]:
# 86,601 number of NaN values in training data set before interpolation
df_n_1['Value'].isnull().value_counts()

In [None]:
# 226: number of SiteIds with at least one NaN Value before interpolation.
# Count the distinct SiteIds among the NaN rows directly. The previous
# groupby('SiteId').sum()['Value'].isnull().sum() relied on the sum of an all-NaN group
# being NaN; since pandas 0.22 that sum is 0, so the expression silently returns 0.
df_n_1.loc[df_n_1['Value'].isnull(), 'SiteId'].nunique()

In [None]:
# 2,227: number of ForecastIds with at least one NaN Value before interpolation.
# Count the distinct ForecastIds among the NaN rows directly. The previous
# groupby('ForecastId').sum()['Value'].isnull().sum() relied on the sum of an all-NaN group
# being NaN; since pandas 0.22 that sum is 0, so the expression silently returns 0.
df_n_1.loc[df_n_1['Value'].isnull(), 'ForecastId'].nunique()

In [None]:
# 90; 360; 964: unique number of data points per ForecastId in training data
df_n_1.groupby(['ForecastId'], as_index=False).agg({'Timestamp':'count'})['Timestamp'].unique()

## [D] Add ForecastPeriodMin

* FROM df_submission_frequency

In [None]:
df_n_2 = df_n_1.copy(deep=True)

In [None]:
df_n_2 = df_n_2.merge(df_submission_frequency, on='ForecastId', how='inner')
df_n_2 = df_n_2.drop(['ForecastPeriodNS'], axis=1)
df_n_2.head()

In [None]:
# shape of df_n_2
shapes['df_n_2'] = df_n_2.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

In [None]:
# ForecastPeriodMin value_counts by submission frequency
df_n_2['ForecastPeriodMin'].value_counts()

## [D] Add df_metadata

In [None]:
df_n_3 = df_n_2.copy(deep=True)

In [None]:
df_metadata.head(2)

In [None]:
df_metadata.shape

### Add Surface and BaseTemperature

In [None]:
# merge Surface and BaseTemperature to ml dataframe
df_n_3 = df_n_3.merge(df_metadata[['SiteId', 'Surface', 'BaseTemperature']], on='SiteId', how='left')
df_n_3.head(2)

### Add isDayOff

In [None]:
# add Weekday to ml dataframe 
# first step is to create a dictionary of isDayOff using Weekday
df_n_3['Weekday'] = df_n_3['Timestamp'].dt.weekday
df_n_3.head(2)

In [None]:
# Build a {(SiteId, weekday): isDayOff} lookup used to attach isDayOff to the ml dataframe.
# Weekday codes match Series.dt.weekday: 0 = Monday ... 6 = Sunday, which is the order of
# day_off_columns below.
# Plain int() replaces scalar.astype(int): values taken from a row may be plain Python
# scalars (no .astype method) when the frame holds mixed dtypes, whereas int() works for
# both Python and numpy scalars.
day_off_columns = [
    'MondayIsDayOff', 'TuesdayIsDayOff', 'WednesdayIsDayOff', 'ThursdayIsDayOff',
    'FridayIsDayOff', 'SaturdayIsDayOff', 'SundayIsDayOff',
]
dict_metadata = {}

for row in df_metadata[['SiteId'] + day_off_columns].itertuples(index=False, name=None):
    sid = int(row[0])
    # row[1:] holds the seven flags in Monday..Sunday order
    for weekday, is_day_off in enumerate(row[1:]):
        dict_metadata[sid, weekday] = int(is_day_off)

In [None]:
# add isDayOff to the ml dataframe by looking each (SiteId, Weekday) pair up in dict_metadata,
# then discard the helper Weekday column
site_weekday_pairs = zip(df_n_3['SiteId'], df_n_3['Weekday'])
df_n_3['isDayOff'] = [dict_metadata[pair] for pair in site_weekday_pairs]
df_n_3 = df_n_3.drop('Weekday', axis=1)
df_n_3.head()

In [None]:
# shape of df_n_3
shapes['df_n_3'] = df_n_3.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

## [D] Add df_holidays

In [None]:
df_n_4 = df_n_3.copy(deep=True)

In [None]:
# derive a calendar-date column (time of day removed) to use as merge key with df_holidays
df_n_4['Date'] = df_n_4['Timestamp'].dt.date

In [None]:
# create df_holidays_dates: one row per (SiteId, Date) holiday with isHoliday = 1, for merging.
# The timestamps are reduced to plain dates BEFORE de-duplicating, so two holiday entries
# that fall on the same calendar day can never survive as separate rows and duplicate
# ml-dataframe rows in the later left merge.
# assign() builds new frames instead of writing columns into a derived slice.
df_holidays_dates = (df_holidays[['SiteId', 'Date']]
                     .assign(Date=lambda frame: frame['Date'].dt.date)
                     .drop_duplicates()
                     .assign(isHoliday=1))
df_holidays_dates.head()

In [None]:
# merge df_holidays_dates with dataframe
df_n_4 = df_n_4.merge(df_holidays_dates, on=['SiteId', 'Date'], how='left')
df_n_4 = df_n_4.drop('Date', axis=1)
df_n_4.head(2)

In [None]:
# rows with no matching holiday came out of the left merge as NaN: mark them 0,
# then store isHoliday as an integer flag
df_n_4['isHoliday'] = df_n_4['isHoliday'].fillna(value=0).astype(int)
df_n_4.head(2)

In [None]:
print("percent of days are Holidays: {}".format(round(df_n_4['isHoliday'].sum() / df_n_4['isHoliday'].count(),2)))

In [None]:
# shape of df_n_4
shapes['df_n_4'] = df_n_4.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

## Add df_weather

In [None]:
df_n_5 = df_n_4.copy(deep=True)

### Inspect df_weather

In [None]:
# inspect weather data
# see multiple Temperature for single SiteId & Timestamp combination
# need a single SiteId & Timestamp combination to merge with ml dataframe
df_weather.head()

In [None]:
# shape of weather data
df_weather.shape

In [None]:
# keep only the weather rows whose SiteId also occurs in the training dataframe,
# then show the shape of the filtered df_weather
train_site_ids = df_n_5['SiteId'].unique()
df_weather = df_weather[df_weather['SiteId'].isin(train_site_ids)]
df_weather.shape

In [None]:
# check if all SiteIds in weather data are also found in the train data
# 52 fewer unique SiteIds in both weather data & train data than in train data alone (215 vs. 267)
# later, put these 52 SiteIds without weather data in separate ML DataFrame
df_weather['SiteId'].unique().size

### Keep Nearest Temperature

In [None]:
# example: view first 5 datapoints for SiteId 1
# see multiple Temperature for single SiteId & Timestamp combination
# need a single SiteId & Timestamp combination to merge with ml dataframe
df_weather[(df_weather['SiteId']==1)].head()

In [None]:
# create dataframe of nearest temperatures by SiteId and Timestamp:
# sort by Distance within each (SiteId, Timestamp) and keep the first entry of each group
# inspect df_weather_nearest to compare above and see we took the nearest Temperature
# NOTE(review): GroupBy.first() returns the first NON-NULL value of each column, not the
# first row. If the nearest station has a NaN Temperature, the value is taken from a farther
# station while the other columns may still come from the nearest one — confirm this
# fallback is intended.
df_weather_nearest = (df_weather.sort_values(['SiteId', 'Timestamp', 'Distance'])
                      .groupby(['SiteId', 'Timestamp'], as_index=False).first())
df_weather_nearest.head()

In [None]:
# df_weather_nearest shape
df_weather_nearest.shape

In [None]:
# check if there are NaN in Temperature... good, there are 0 NaN
# NOTE(review): df_weather_nearest was built with GroupBy.first(), which skips nulls, so a
# count of 0 here only shows that every (SiteId, Timestamp) group had at least one reading.
df_weather_nearest.Temperature.isnull().sum()

### Resample Weather by 15min Intervals

In [None]:
def multi_index_resample(df, time_bins='15T'):
    """Resample every site's temperature series onto a regular time grid.

    Each group of rows sharing the same index value (the SiteId) is re-indexed by
    'Timestamp', resampled to ``time_bins`` and linearly interpolated; the per-site
    results are then stacked back into one frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the SiteId and which has a datetime 'Timestamp'
        column and a numeric 'Temperature' column.
    time_bins : str, default '15T'
        pandas offset alias of the target grid. '15T' means 15 minutes; newer pandas
        releases prefer the equivalent spelling '15min'.

    Returns
    -------
    pandas.DataFrame
        Columns ['SiteId', 'Timestamp', 'Temperature'] with a fresh RangeIndex. Sites
        appear in order of first appearance in ``df``. An empty input yields an empty
        frame with those three columns.
    """
    output_columns = ['SiteId', 'Timestamp', 'Temperature']
    df_list = []

    # groupby splits the frame once; the previous df[df.index == i] mask rescanned the
    # whole frame for every site. sort=False keeps the order of first appearance, matching
    # iteration over df.index.unique().
    for site_id, df_site in df.groupby(level=0, sort=False):
        # NOTE(review): Resampler.interpolate works from the values that land on the target
        # grid; in several pandas versions readings at off-grid timestamps are discarded
        # before interpolating — confirm the weather timestamps are grid-aligned.
        df_rs = (df_site.set_index('Timestamp')
                 .resample(time_bins)
                 .interpolate(method='linear')
                 .reset_index())
        df_rs['SiteId'] = site_id
        df_list.append(df_rs)

    # pd.concat raises ValueError on an empty list, so handle "no sites" explicitly
    if not df_list:
        return pd.DataFrame(columns=output_columns)

    df_return = pd.concat(df_list, ignore_index=True)
    return df_return[output_columns]

In [None]:
# call multi_index_resample on df_weather_nearest (default time_bins, i.e. a 15-minute grid)
# `mask` is a list of column names used for column selection here, not a boolean mask
# the frame is indexed by SiteId because multi_index_resample splits its input by index value
mask = ['SiteId', 'Timestamp', 'Temperature']
df_weather_nearest_resample15 = multi_index_resample(df_weather_nearest[mask].set_index('SiteId'))
df_weather_nearest_resample15.head()

In [None]:
# Confirmed: 215 SiteIds in resampled weather data
df_weather_nearest_resample15.SiteId.unique().size

In [None]:
# shape of df_weather_nearest_resample15
df_weather_nearest_resample15.shape

### Merge Resampled Weather Data

In [None]:
# left-merge the resampled nearest-station temperature (not an average) onto the dataframe
# by SiteId and Timestamp; rows without a matching weather record get NaN Temperature
df_n_5 = df_n_5.merge(df_weather_nearest_resample15, on=['SiteId', 'Timestamp'], how='left')

In [None]:
# inspect dataframe
df_n_5.head(2)

In [None]:
# shape of df_n_5
shapes['df_n_5'] = df_n_5.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

### Inspect NaN temperatures

In [None]:
# 1,708,262 data points are missing temperature values
df_n_5['Temperature'].isnull().value_counts()

In [None]:
# associated with 91 SiteId having 1 or more NaN Temperature values in training data
SiteId_with_null_temps = df_n_5[df_n_5['Temperature'].isnull()]['SiteId'].unique()
SiteId_with_null_temps.size

In [None]:
# associated with 1,832 ForecastId having 1 or more NaN Temperature values in training data
ForecastId_with_null_temps = df_n_5[df_n_5['Temperature'].isnull()]['ForecastId'].unique()
ForecastId_with_null_temps.size

In [None]:
# inspect where dataframe temperature is null (i.e. no df_weather corresponding to df_train data)
# visual inspection shows most values for these SiteId's are NaN (where NaN=1)
# weather will not be a useable feature for these SiteIds... drop  
nrows = SiteId_with_null_temps.size
ncols = 1

#fig, ax = plt.subplots(ncols=ncols, nrows=nrows, sharex=True, sharey=True, figsize=(12,256), dpi=80, facecolor='w', edgecolor='k')
#for i in range(nrows):
#    sid = SiteId_with_null_temps[i]
#    section = df_n_5[df_n_5['SiteId']==sid]
#    ax[i].plot(section['Timestamp'], section['Temperature'].isnull().astype(int))
#    ax[i].set_title("SiteId: {}".format(sid));

#plt.tight_layout()

In [None]:
# now by calculations show % NaN weather values by SiteId
# restrict to the SiteIds that have at least one NaN Temperature (copy so the new column
# is not written into df_n_5)
df_SiteId_with_null_temps = df_n_5[df_n_5['SiteId'].isin(SiteId_with_null_temps)].copy()
df_SiteId_with_null_temps['isNull_Temp'] = df_SiteId_with_null_temps['Temperature'].isnull()
# per SiteId: 'sum' = number of NaN temperature rows, 'count' = total number of rows
# (the result has two-level columns: ('isNull_Temp', 'sum') and ('isNull_Temp', 'count'))
df_null_temps = df_SiteId_with_null_temps.groupby('SiteId').agg({'isNull_Temp':['sum', 'count']})
# Percent_Null is stored as a fraction between 0 and 1, not as a percentage
df_null_temps['isNull_Temp', 'Percent_Null'] = (df_null_temps['isNull_Temp', 'sum'] 
                                                / df_null_temps['isNull_Temp', 'count'])
df_null_temps.head()

In [None]:
# this shows that 79 of 91 SiteId's have 100% NaN weather values
# later, let's put these 79 SiteId's + 6 with high %_Null in separate Machine Learning dataframe
# inspect SiteId 180 & 93 for ForecastId's with weather data
# the bottom 6 SiteId's seems reasonable to keep because the % NaN < 5% which is low
# we can impute those values
df_null_temps.sort_values([('isNull_Temp', 'Percent_Null')], ascending=False).tail(15)

In [None]:
# 85 SiteId's with no weather data
# keep_SiteId: sites whose share of NaN temperatures is at most 5%
# no_weather_SiteId: every other site that has NaN temperatures
low_null_share = df_null_temps[('isNull_Temp', 'Percent_Null')] <= 0.05
keep_SiteId = df_null_temps.index[low_null_share].tolist()
no_weather_SiteId = [site for site in SiteId_with_null_temps if site not in keep_SiteId]

len(no_weather_SiteId)

# Interpolate Values

In [None]:
df_n_6 = df_n_5.copy(deep=True)

## Inspect NaN Values (before interpolation)

In [None]:
# create dataframe to inspect the distribution of NaN Value across ForecastId
# Nulls   = number of NaN Value rows per ForecastId
# Entries = number of rows (non-null Timestamps) per ForecastId
# Column-wise aggregation replaces groupby(...).apply(lambda ...) on the whole frame: it
# avoids one Python call per group and the pandas deprecation of apply() operating on the
# grouping columns.
Nulls = df_n_6['Value'].isnull().astype(int).groupby(df_n_6['ForecastId']).sum()
Entries = df_n_6.groupby('ForecastId')['Timestamp'].count()
df_NaN = pd.concat([Nulls, Entries], axis=1)
df_NaN.columns = ['Nulls', 'Entries']
df_NaN.head()

In [None]:
# merge with df_submission_frequency to chart by submission frequency
df_NaN = df_NaN.merge(df_submission_frequency, left_index=True, right_on='ForecastId')
df_NaN = df_NaN.drop('ForecastPeriodNS', axis=1)
df_NaN = df_NaN[['ForecastId', 'ForecastPeriodMin', 'Nulls', 'Entries']]
df_NaN['percent_NaN'] = df_NaN['Nulls'] / df_NaN['Entries']
df_NaN.head()

In [None]:
df_NaN['percent_NaN'].describe()

In [None]:
df_NaN.sort_values('percent_NaN', ascending=False).head()

In [None]:
# if we just delete all NaN values, we delete rows 86,601 rows (~1% of data)
nan_row_count = df_NaN['Nulls'].sum()
nan_row_share = round(nan_row_count / df_NaN['Entries'].sum(), 2)
print("Potential to Remove \t Rows: {} \t Percent of Data: {}".format(nan_row_count, nan_row_share))

In [None]:
# if we delete all Forecasts that contain one or more NaN Values, we delete 2,081,362 rows (~30% data)
affected_entries = df_NaN[df_NaN['percent_NaN'] > 0].Entries.sum()
affected_share = round(affected_entries / df_NaN['Entries'].sum(), 2)
print("Potential to Remove \t Rows: {} \t Percent of Data: {}".format(affected_entries, affected_share))

In [None]:
# histogram of ForecastId's with NaN values greater than 1% of series data
# takeaway: a lot of ForecastId's have a < 5% of missing data, while a few have more than 5%
above_one_percent = df_NaN.loc[df_NaN['percent_NaN'] > 0.01, 'percent_NaN']

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(above_one_percent, bins=100)
ax.set_title("Train.csv: Histogram of Percent NaN by ForecastId", size=12)
ax.set_xlabel("Percent NaN Value Data", size=12)
ax.set_ylabel("Number of ForecastId's", size=12)
fig.tight_layout()
fig.savefig("EDA_hist_percent_NaNbyForecastId_greaterthan1percent.png")

## [D] Interpolate NaN Values

In [None]:
### VALIDATED - BUT ONLY WORKS MOVING FORWARD, NOT BACKFILL ###
### https://github.com/pandas-dev/pandas/issues/10420 ###

# Value_Int = Value linearly interpolated within each (SiteId, ForecastId) series.
# NaNs located before the first observed value of a series are left as NaN.
# transform() is used instead of apply(): it always returns a result aligned to the
# original row index. With pandas >= 2.0, apply() prepends the group keys to the index, so
# assigning its result back as a column fails.
df_n_6['Value_Int'] = (df_n_6.groupby(['SiteId', 'ForecastId'])['Value']
                       .transform(lambda x: x.interpolate(method='linear')))
df_n_6.head(10)

In [None]:
# 25,480 number of NaN values in training data set after interpolation
df_n_6.Value_Int.isnull().value_counts()

In [None]:
# 58: number of SiteIds with at least one NaN Value_Int after interpolation.
# Count the distinct SiteIds among the NaN rows directly. The previous
# groupby('SiteId').sum()['Value_Int'].isnull().sum() relied on the sum of an all-NaN group
# being NaN; since pandas 0.22 that sum is 0, so the expression silently returns 0.
df_n_6.loc[df_n_6['Value_Int'].isnull(), 'SiteId'].nunique()

In [None]:
# 104: number of ForecastIds with at least one NaN Value_Int after interpolation.
# Count the distinct ForecastIds among the NaN rows directly. The previous
# groupby('ForecastId').sum()['Value_Int'].isnull().sum() relied on the sum of an all-NaN
# group being NaN; since pandas 0.22 that sum is 0, so the expression silently returns 0.
df_n_6.loc[df_n_6['Value_Int'].isnull(), 'ForecastId'].nunique()

### Inspect NaN Values (after interpolation)

In [None]:
# create dataframe to inspect the distribution of NaN Value_Int across ForecastId
# Nulls   = number of NaN Value_Int rows per ForecastId (after interpolation)
# Entries = number of rows (non-null Timestamps) per ForecastId
# Column-wise aggregation replaces groupby(...).apply(lambda ...) on the whole frame: it
# avoids one Python call per group and the pandas deprecation of apply() operating on the
# grouping columns.
Nulls_2 = df_n_6['Value_Int'].isnull().astype(int).groupby(df_n_6['ForecastId']).sum()
Entries_2 = df_n_6.groupby('ForecastId')['Timestamp'].count()
df_NaN_2 = pd.concat([Nulls_2, Entries_2], axis=1)
df_NaN_2.columns = ['Nulls', 'Entries']
df_NaN_2.head()

In [None]:
# merge with df_submission_frequency to chart by frequencies
df_NaN_2 = df_NaN_2.merge(df_submission_frequency, left_index=True, right_on='ForecastId')
df_NaN_2 = df_NaN_2.drop('ForecastPeriodNS', axis=1)
df_NaN_2 = df_NaN_2[['ForecastId', 'ForecastPeriodMin', 'Nulls', 'Entries']]
df_NaN_2['percent_NaN'] = df_NaN_2['Nulls'] / df_NaN_2['Entries']
df_NaN_2.head()

In [None]:
df_NaN_2['percent_NaN'].describe()

In [None]:
df_NaN_2.sort_values('percent_NaN', ascending=False).tail()

In [None]:
# in the case of ForecastId == 1378, 2 NaN Value_Int remain after interpolation
df_n_6[df_n_6['ForecastId'] == 1378].loc[:,'Value_Int'].isnull().sum()

In [None]:
# in the case of ForecastId == 1378, these NaN Value_Int are located at beginning of data
# this is due to the linear interpolation algorithm which is range bound by data
df_n_6[df_n_6['ForecastId'] == 1378].head(5)

In [None]:
# Ex: ForecastId 608 starts 12/28 but NaN through 01/04 06

x1 = df_n_6[df_n_6['ForecastId'] == 608]['Timestamp']
y1 = df_n_6[df_n_6['ForecastId'] == 608]['Value']

plt.figure(figsize=(12,4))
plt.plot(x1, y1);

In [None]:
# Histogram of ForecastId's with NaN values greater than 1% of series data
# (computed on df_NaN_2, i.e. on Value_Int)
above_one_percent_2 = df_NaN_2.loc[df_NaN_2['percent_NaN'] > 0.01, 'percent_NaN']

fig, ax = plt.subplots(figsize=(6,4))
ax.hist(above_one_percent_2, bins=100)
ax.set_title("Train.csv: Histogram of Percent NaN by ForecastId", size=12)
ax.set_xlabel("Percent NaN Value Data", size=12)
ax.set_ylabel("Number of ForecastId's", size=12)
fig.tight_layout()
fig.savefig("EDA_hist_percent_NaNbyForecastId_2_greaterthan1percent.png")

In [None]:
# if we just delete all NaN values, we delete rows 25,480 rows (~0% of data)
print("Potential to Remove \t Rows: {} \t Percent of Data: {}".format(df_NaN_2['Nulls'].sum(), round(df_NaN_2['Nulls'].sum() / df_NaN_2['Entries'].sum(), 2)))

In [None]:
# if we delete all Forecasts that contain one or more NaN Values, we delete 96,426 rows (~1% data)
print("Potential to Remove \t Rows: {} \t Percent of Data: {}".format(df_NaN_2[df_NaN_2['percent_NaN'] > 0].Entries.sum(), round(df_NaN_2[df_NaN_2['percent_NaN'] > 0].Entries.sum() / df_NaN_2['Entries'].sum(), 2)))

# [D] Add Lagged Features

In [None]:
df_n_7 = df_n_6.copy(deep=True)

In [None]:
# Add lagged feature(s): Value_Lag_k is Value_Int shifted k rows within its
# (SiteId, ForecastId) series, for k = 1..4
for lag in range(1, 5):
    lag_column = 'Value_Lag_{}'.format(lag)
    df_n_7[lag_column] = df_n_7.groupby(['SiteId', 'ForecastId'])['Value_Int'].shift(lag)
df_n_7.head()

In [None]:
# shape of df_n_7
shapes['df_n_7'] = df_n_7.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

# Feature Engineering

In [None]:
df_n_8 = df_n_7.copy(deep=True)

In [None]:
df_n_8.head(2)

## Temperature Features

In [None]:
# add difference of (outside) Temperature to BaseTemperature, plus their ratio
# NOTE(review): the ratio is inf/NaN wherever BaseTemperature is 0 — confirm that no site
# has a BaseTemperature of 0 before using Temp_Div_BaseTemp as a feature
df_n_8['Temp_Minus_BaseTemp'] = df_n_8['Temperature'] - df_n_8['BaseTemperature']
df_n_8['Temp_Div_BaseTemp'] = df_n_8['Temperature'] / df_n_8['BaseTemperature']
df_n_8.head(2)

## Add Month & Quarter Features

In [None]:
# add month features to ml dataframe: one indicator column Month_1 ... Month_12
# Declaring all twelve months as categories guarantees twelve dummy columns even when a
# month is absent from the data; assigning the raw get_dummies output to twelve hard-coded
# names fails with a length mismatch in that case.
# dtype is pinned to uint8 so the indicators are numeric on every pandas version (newer
# releases default to bool).
month_categories = df_n_8['Timestamp'].dt.month.astype(
    pd.CategoricalDtype(categories=list(range(1, 13))))
month_dummies = pd.get_dummies(month_categories, prefix='Month', dtype=np.uint8)
df_n_8[list(month_dummies.columns)] = month_dummies
df_n_8.head(2)

In [None]:
# add quarter features to ml dataframe: one indicator column Quarter_1 ... Quarter_4
# Declaring all four quarters as categories guarantees four dummy columns even when a
# quarter is absent from the data; assigning the raw get_dummies output to four hard-coded
# names fails with a length mismatch in that case.
# dtype is pinned to uint8 so the indicators are numeric on every pandas version (newer
# releases default to bool).
quarter_categories = df_n_8['Timestamp'].dt.quarter.astype(
    pd.CategoricalDtype(categories=list(range(1, 5))))
quarter_dummies = pd.get_dummies(quarter_categories, prefix='Quarter', dtype=np.uint8)
df_n_8[list(quarter_dummies.columns)] = quarter_dummies
df_n_8.head(2)

# Pickle ML DataFrame 

## Inspect Final DataFrame

In [None]:
df_n_8.head(2)

In [None]:
# shape of df_n_8
shapes['df_n_8'] = df_n_8.shape
for k,v in shapes.items(): 
    print("{}: \t {}".format(k,v))

## Pickle DataFrame

In [None]:
# Persist the ML dataframe as a CSV file inside the data directory (not a pickle, despite
# the section title). to_csv writes the row index as an extra first column by default.
df_n_8.to_csv(directory + 'df_n_8.csv')

# Earlier pickle-based variants, kept for reference:
# df_n_8.to_pickle('df_n_8.pkl')
#df_n_7[~df_n_7['SiteId'].isin(no_weather_SiteId)].to_pickle('mldataframe.pkl')
#df_n_7[df_n_7['SiteId'].isin(no_weather_SiteId)].to_pickle('mldataframe_noweather.pkl')

# Regression DataFrame

In [None]:
df_n_9 = df_n_8.copy(deep=True)

In [None]:
#df_sparse_n_9 = pd.SparseDataFrame(df_n_9)
#df_sparse_n_9.head(2)

## SiteId Sparse Features

In [None]:
df_sid_dummies = pd.get_dummies(df_n_9['SiteId'])
df_sid_dummies.shape

#df_sparse_sid_dummies = pd.SparseDataFrame(df_sid_dummies)
#df_sparse_sid_dummies.shape

In [None]:
# prefix every SiteId dummy column with "SiteId_" so the column names are self-describing
col_names = ["SiteId_" + str(c_name) for c_name in df_sid_dummies.columns]
df_sid_dummies.columns = col_names

In [None]:
df_n_9 = df_n_9.merge(df_sid_dummies, left_index=True, right_index=True, how='left')

## Time of Day Sparse Features

In [None]:
# df_sparse_n_9['HourMin'] = df_sparse_n_9['Timestamp'].dt.hour.astype(str) + "_" + df_sparse_n_9['Timestamp'].dt.minute.astype(str)
df_n_9['HourMin'] = df_n_9['Timestamp'].dt.hour.astype(str) + "_" + df_n_9['Timestamp'].dt.minute.astype(str)

In [None]:
df_hm_dummies = pd.get_dummies(df_n_9['HourMin'])
df_hm_dummies.shape

# df_sparse_hm_dummies = pd.SparseDataFrame(df_hm_dummies)
# df_sparse_hm_dummies.shape

In [None]:
# prefix every hour/minute dummy column with "HourMin_" so the column names are self-describing
col_names = ["HourMin_" + str(c_name) for c_name in df_hm_dummies.columns]
df_hm_dummies.columns = col_names

In [None]:
# some data is offset by 1min, starting at 00hr:01min instead of 00hr:00min
for c in col_names:
    print(c)

In [None]:
df_n_9 = df_n_9.merge(df_hm_dummies, left_index=True, right_index=True, how='left')
df_n_9.shape

# df_sparse_n_9 = df_sparse_n_9.merge(df_sparse_hm_dummies, left_index=True, right_index=True, how='left')
# df_sparse_n_9.shape

In [None]:
df_n_9.head(2)
# df_sparse_n_9.head(2)

## Pickle Sparse DataFrame

In [None]:
# df_sparse_n_9 = pd.SparseDataFrame(df_n_9)

In [None]:
df_n_9.to_csv(directory + 'df_n_9.csv')
#df_n_9.to_pickle('df_n_9.pkl')

# Prepare Submission Set

# Baseline Forecasts

* Note: where ML forecast is incomplete fill-in forecast values with the best available Baseline Forecast

In [None]:
# predict average 15min energy consumption by ForecastId
df_submission_format.head(2)

In [None]:
df_submission_format.shape

In [None]:
df_train.head(2)

## Submission_1

* forecast a single average value of training data for each ForecastId
* fillna(0)

In [None]:
df_train_avg_forecastid = (df_train.groupby(['ForecastId'], as_index=False).agg({'Value':'mean'}))
df_train_avg_forecastid.head()

In [None]:
df_submission_1 = df_submission_format.merge(df_train_avg_forecastid, on='ForecastId', how='left')
df_submission_1.head(2)

In [None]:
df_submission_1 = df_submission_1.drop('Value_x', axis=1)
df_submission_1 = df_submission_1.rename(columns={'Value_y':'Value'})
df_submission_1.head(2)

In [None]:
df_submission_1.shape

In [None]:
# 1,920 NaN Values
df_submission_1['Value'].isnull().sum()

In [None]:
# fillna with 0
df_submission_1['Value'] = df_submission_1['Value'].fillna(value=0)

In [None]:
# 0 NaN Values
df_submission_1.Value.isnull().sum()

In [None]:
# set index to obs_id to match submission format
df_submission_1 = df_submission_1.set_index('obs_id')
df_submission_1.head(2)

In [None]:
# create submission_1.csv
# drivendata : 0.007459
# df_submission_1.to_csv('submission_1.csv')

## Submission_2

* forecast the average hour:minute energy consumption for each ForecastId 
    * (i.e. 96 points per 15min interval submission frequency)
    * (i.e. 24 points per 60min interval submission frequency)
    * (i.e.  1 point per 1440min interval submission frequency (same result as submission_1))
* fillna(0)

In [None]:
# create df_train_2 with hour:min data
df_train_2 = df_train.copy(deep=True)
df_train_2['Hour'] = df_train_2['Timestamp'].dt.hour
df_train_2['Minute'] = df_train_2['Timestamp'].dt.minute
df_train_2.head(2)

In [None]:
# get average value per time submission frequency by ForecastId
df_train_avg_forecastid_hourmin = (df_train_2.groupby(['ForecastId','Hour','Minute'], as_index=False).agg({'Value':'mean'}))
df_train_avg_forecastid_hourmin.head()

In [None]:
df_train_avg_forecastid_hourmin.shape

In [None]:
# create df_submission_format_2 with hour:min data
df_submission_format_2 = df_submission_format.copy(deep=True)
df_submission_format_2['Timestamp'] = pd.to_datetime(df_submission_format_2['Timestamp'])
df_submission_format_2['Hour'] = df_submission_format_2['Timestamp'].dt.hour
df_submission_format_2['Minute'] = df_submission_format_2['Timestamp'].dt.minute
df_submission_format_2.head(2)

In [None]:
df_submission_2 = df_submission_format_2.merge(df_train_avg_forecastid_hourmin, on=['ForecastId','Hour','Minute'], how='left')
df_submission_2.head(2)

In [None]:
df_submission_2 = df_submission_2.drop(['Value_x', 'Hour', 'Minute'], axis=1)
df_submission_2 = df_submission_2.rename(columns={'Value_y':'Value'})
df_submission_2.head(2)

In [None]:
df_submission_2.shape

In [None]:
# 2,482 NaN Values
df_submission_2['Value'].isnull().sum()

In [None]:
# fillna with 0
df_submission_2['Value'] = df_submission_2['Value'].fillna(value=0)

In [None]:
# 0 NaN Values
df_submission_2['Value'].isnull().sum()

In [None]:
# set index to obs_id to match submission format
df_submission_2 = df_submission_2.set_index('obs_id')
df_submission_2.head(2)

In [None]:
# create submission_2.csv
# drivendata score: 0.005652
# df_submission_2.to_csv('submission_2.csv')

## df_submission_3

* create a distribution of daily energy consumption for various SiteId's
* make two distributions: 1. non-holiday / non-off-day, 2. holiday or off-day
* then add a multiplier to the distributions based upon the season of year
    * (i.e. 96 points per 15min interval submission frequency)
    * (i.e. 24 points per 60min interval submission frequency)
    * (i.e.  1 point per 1440min interval submission frequency (same result as submission_1))
* fillna(0)

In [None]:
df_n_8.head(1)

In [None]:
# create df_n_submission_3 with hour:min data
df_n_submission_3 = df_n_8.copy(deep=True)
df_n_submission_3['Hour'] = df_n_submission_3['Timestamp'].dt.hour
df_n_submission_3['Minute'] = df_n_submission_3['Timestamp'].dt.minute
df_n_submission_3.head(2)

In [None]:
drop_columns = ['Value', 'Value_Lag_1', 'Value_Lag_2', 'Value_Lag_3', 'Value_Lag_4', 'Temp_Minus_BaseTemp',]
df_n_submission_3 = df_n_submission_3.drop(drop_columns, axis=1)
df_n_submission_3.head(2)

In [None]:
# 1 when the row falls on a regular day off and/or a holiday, 0 otherwise
day_off_or_holiday = (df_n_submission_3['isDayOff'] + df_n_submission_3['isHoliday']) > 0
df_n_submission_3['isDayOffOrHoliday'] = day_off_or_holiday.astype(int)
df_n_submission_3.head(2)

In [None]:
# average interpolated value (Value_Int) per SiteId, isDayOff flag and hour:minute slot,
# restricted to the series with a 15-minute forecast period
# (the grouping key is SiteId, not ForecastId)
# NOTE(review): this groups on isDayOff although isDayOffOrHoliday was just computed and the
# plan for this submission mentions "holiday or off-day" — confirm which flag is intended
mask = (df_n_submission_3['ForecastPeriodMin'] == 15)
df_s_3_split_avg_forecastid_hourmin_15 = (df_n_submission_3[mask].groupby(['SiteId','isDayOff','Hour','Minute'], as_index=False).agg({'Value_Int':'mean'}))
df_s_3_split_avg_forecastid_hourmin_15.head()

In [None]:
# these are the SiteId that have ForecastPeriodMin == 15
sid_fpm_15 = df_s_3_split_avg_forecastid_hourmin_15['SiteId'].unique()
sid_fpm_15

In [None]:
# Plot the average intraday profile of one site: working days vs. days off.
# The averages live in df_s_3_split_avg_forecastid_hourmin_15; the name without the _15
# suffix is never defined in this notebook and raised NameError.
sid = 41
for day_off_flag, line_label in ((0, 'isDayOff = 0'), (1, 'isDayOff = 1')):
    mask = ((df_s_3_split_avg_forecastid_hourmin_15['SiteId'] == sid)
            & (df_s_3_split_avg_forecastid_hourmin_15['isDayOff'] == day_off_flag))
    # reset_index puts both curves on the same x axis (position of the slot within the day)
    # instead of plotting them side by side at their original row numbers
    (df_s_3_split_avg_forecastid_hourmin_15[mask]['Value_Int']
     .reset_index(drop=True)
     .plot(label=line_label, legend=True))

# TO DO
* timeseries tutorial analytics vidhya
* start the presentation: create appendix EDA slides

**Today's Goals**
* Time Series as Index
* Fill NaN / Ignore in train data
* Understand train vs. submission time periods
    * Use Forecast values as inputs for future forecasts
* feature engineering for temp v. surface area v. ppl in building (stand in is vacation days)
* 
    