In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Readings and weather observations use their first CSV column as the row index.
train, test, weather = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/train.csv', '../data/test.csv', '../data/weather.csv')
)

# Site metadata keeps the default integer index; the holidays file is ';'-separated.
meta = pd.read_csv('../data/metadata.csv')
holidays = pd.read_csv('../data/holidays.csv', sep=';')

## Resampling

### Resampling Energy Consumption

In [None]:
def resample_value(data):
    """Aggregate each forecast's readings into one row per calendar day.

    The previous implementation called ``resample('D').sum()`` without keeping
    the result, so the returned frame was never resampled.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``ForecastId`` and ``Timestamp`` columns. ``Value`` is
        summed per day when present; every other column keeps the first entry
        seen on that day.

    Returns
    -------
    pd.DataFrame
        Daily rows of every forecast, stacked in order of first appearance and
        indexed by ``Timestamp``. An empty frame is returned when there is
        nothing to resample.
    """
    daily_frames = []
    for forecast_id in data["ForecastId"].unique():
        print("Resampling ForecastId: ", forecast_id)
        forecast_values = data[data["ForecastId"] == forecast_id].copy()
        if forecast_values.empty:
            # e.g. a NaN id never equals itself, so there is nothing to bin
            continue
        forecast_values['Timestamp'] = pd.to_datetime(forecast_values['Timestamp'])
        forecast_values = forecast_values.reset_index().set_index('Timestamp')

        daily_bins = forecast_values.resample('D')
        # Identifier-like columns must not be summed: keep the first entry per day.
        daily = daily_bins.first()
        if 'Value' in daily.columns:
            # min_count=1 keeps days without any reading as NaN instead of 0.
            daily['Value'] = daily_bins['Value'].sum(min_count=1)

        # Days without readings still get a row; carry the identifiers into
        # them and restore the dtype that the NaN placeholders widened.
        for id_column in ('SiteId', 'ForecastId'):
            if id_column in daily.columns:
                daily[id_column] = (
                    daily[id_column].ffill().astype(forecast_values[id_column].dtype)
                )

        daily_frames.append(daily)

    if not daily_frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated frame per forecast.
    return pd.concat(daily_frames)

# Run the same resampling on the training frame first, then the testing frame.
resampled_train, resampled_test = (
    resample_value(raw_frame) for raw_frame in (train, test)
)


In [None]:
# Write both resampled frames to disk, keeping the index as the first CSV column.
for daily_frame, csv_path in (
    (resampled_train, '../data/resampled/resampled_train.csv'),
    (resampled_test, '../data/resampled/resampled_test.csv'),
):
    daily_frame.to_csv(csv_path, index=True)


### Resampling Temperature

In [None]:
# Parse the timestamps once so that sorting and daily binning are chronological.
weather_parsed = weather.assign(Timestamp=pd.to_datetime(weather['Timestamp']))

# Keep a single observation per (Timestamp, SiteId): the one with the smallest
# Distance. The earlier version re-assigned `resampled_weather` from the raw
# `weather` frame right after this step, which threw the de-duplication away.
weather_nearest = (
    weather_parsed
    .sort_values(['Timestamp', 'SiteId', 'Distance'])
    .drop_duplicates(['Timestamp', 'SiteId'], keep='first')
)

# Daily median temperature per site. Grouping by SiteId keeps the sites apart,
# and the result is assigned (the earlier `drop`/`resample` results were discarded).
resampled_weather = (
    weather_nearest
    .groupby(['SiteId', pd.Grouper(key='Timestamp', freq='D')])
    .agg(Temperature=('Temperature', 'median'), Distance=('Distance', 'min'))
    .reset_index()
    .set_index('Timestamp')
)
resampled_weather.head()

In [None]:
# Persist the weather frame; its index is written as the first CSV column.
resampled_weather.to_csv(
    path_or_buf='../data/resampled/resampled_weather.csv',
    index=True,
)

## Merge Data

In [40]:
# Reload the three resampled CSVs with a fresh integer index (no index column).
resampled_train, resampled_test, resampled_weather = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (
        '../data/resampled/resampled_train.csv',
        '../data/resampled/resampled_test.csv',
        '../data/resampled/resampled_weather.csv',
    )
)

In [41]:
# Preview the first five rows of the reloaded test frame.
resampled_test.head(n=5)

Unnamed: 0,Timestamp,obs_id,SiteId,ForecastId,Value
0,2015-08-29 00:00:00,1677832,1,1,7413780.0
1,2015-08-30 00:00:00,5379616,1,1,8927612.0
2,2015-08-31 00:00:00,496261,1,1,7288439.0
3,2015-09-01 00:00:00,4567147,1,1,8399679.0
4,2015-09-02 00:00:00,3684873,1,1,7576456.0


In [42]:
def process_time(df):
    """Return a copy of ``df`` with calendar features derived from ``Timestamp``.

    Unlike the previous version, the caller's frame is left untouched: the
    parsed timestamps are written to a new frame only, so re-running the cell
    always starts from the same input.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``Timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh integer index. ``Timestamp`` (parsed) is the
        first column, followed by the remaining input columns and the added
        ``wday`` (Monday == 0), ``mday``, ``yday``, ``month`` and ``year``.
    """
    # reset_index returns a new object, so nothing below touches `df` itself.
    frame = df.reset_index(drop=True)
    timestamps = pd.to_datetime(frame['Timestamp'])

    frame = frame.drop(columns='Timestamp').assign(
        wday=timestamps.dt.dayofweek,
        mday=timestamps.dt.day,
        yday=timestamps.dt.dayofyear,
        month=timestamps.dt.month,
        year=timestamps.dt.year,
    )
    # Keep the historical column layout: Timestamp leads the frame.
    frame.insert(0, 'Timestamp', timestamps)

    return frame

# Derive the calendar features for the training frame, then the testing frame.
processed_train, processed_test = map(
    process_time, (resampled_train, resampled_test)
)

In [44]:
# The calendar features (wday, mday, yday, month, year) are on the frame
# returned by process_time, so preview that one rather than its input.
processed_train.head()

Unnamed: 0,Timestamp,obs_id,SiteId,ForecastId,Value,wday,mday,yday,month,year
0,2014-09-03,744519,1,1,909655.5,2,3,246,9,2014
1,2014-09-04,7627564,1,1,1748273.0,3,4,247,9,2014
2,2014-09-05,7034705,1,1,,4,5,248,9,2014
3,2014-09-06,5995486,1,1,,5,6,249,9,2014
4,2014-09-07,7326510,1,1,,6,7,250,9,2014


## Process weather data

In [46]:
def add_weather(df: pd.DataFrame, weather: pd.DataFrame):
    """Left-join one weather row onto every reading by ``Timestamp`` and ``SiteId``.

    The weather table is reduced to a single row per key *before* the merge
    (preferring the smallest ``Distance`` when that column exists). The
    previous version de-duplicated the merged result instead, which also
    removed legitimate readings that share a timestamp and site. Both
    ``Timestamp`` columns are parsed so that keys reloaded from CSV as text
    match datetime keys.

    Parameters
    ----------
    df : pd.DataFrame
        Readings with ``Timestamp`` and ``SiteId`` columns.
    weather : pd.DataFrame
        Weather observations with ``Timestamp`` and ``SiteId`` columns.

    Returns
    -------
    pd.DataFrame
        ``df`` (row order preserved, ``Timestamp`` parsed) with the weather
        columns appended; readings without a weather match get NaN.

    Raises
    ------
    AssertionError
        If the merged frame does not have exactly ``len(df)`` rows.
    """
    join_keys = ['Timestamp', 'SiteId']

    weather_lookup = weather.assign(Timestamp=pd.to_datetime(weather['Timestamp']))
    if 'Distance' in weather_lookup.columns:
        # Sort so that keep='first' below retains the closest observation.
        weather_lookup = weather_lookup.sort_values(join_keys + ['Distance'])
    weather_lookup = weather_lookup.drop_duplicates(join_keys, keep='first')

    readings = df.assign(Timestamp=pd.to_datetime(df['Timestamp']))
    merged = pd.merge(
        readings, weather_lookup, how='left', on=join_keys, validate='many_to_one'
    )

    assert len(df) == len(merged), 'New Length must match original length'

    return merged

# Attach the resampled weather to the training frame, then the testing frame.
processed_train, processed_test = (
    add_weather(feature_frame, resampled_weather)
    for feature_frame in (processed_train, processed_test)
)

In [None]:
# Preview the first five training rows after the weather merge.
processed_train.head(n=5)

## Process metadata

In [None]:
# Day-off flag columns ordered Monday..Sunday, so that a column's position in
# this list is the `wday` number (0..6) used below.
day_off_columns = [
    'MondayIsDayOff',
    'TuesdayIsDayOff',
    'WednesdayIsDayOff',
    'ThursdayIsDayOff',
    'FridayIsDayOff',
    'SaturdayIsDayOff',
    'SundayIsDayOff',
]

site_ids = set(meta['SiteId'])

# Reshape the metadata from one row per site into one row per (site, weekday).
# This replaces the per-site loop, which called float() on one-element Series
# (deprecated in recent pandas) and re-concatenated the result on every pass.
all_meta = (
    meta
    .drop_duplicates('SiteId')
    .melt(
        id_vars=['SiteId', 'BaseTemperature', 'Surface'],
        value_vars=day_off_columns,
        var_name='day_column',
        value_name='off',
    )
)
all_meta['wday'] = all_meta['day_column'].map(
    {column: position for position, column in enumerate(day_off_columns)}
)
all_meta = (
    all_meta
    .astype({'off': float, 'BaseTemperature': float, 'Surface': float})
    .sort_values(['SiteId', 'wday'])
    [['SiteId', 'wday', 'off', 'BaseTemperature', 'Surface']]
    .reset_index(drop=True)
)

# Find the days off in the training and testing data
# Left-joins the per-weekday site metadata onto the readings by SiteId and wday.
# NOTE(review): the merged training frame is bound to `resampled_train`, while
# the test merge overwrites `test` -- confirm which name later cells should read.
# NOTE(review): both merges need a `wday` column on the left-hand frame; the
# visible cells only add it to `processed_train` / `processed_test` -- TODO confirm.
resampled_train = train.merge(all_meta, how = 'left', on = ['SiteId', 'wday'])
test = test.merge(all_meta, how = 'left', on = ['SiteId', 'wday'])

In [None]:


# A cell only renders its last bare expression, so show both previews explicitly.
display(train.head(), test.head())

## Process weather data

In [None]:
def add_weather(df: pd.DataFrame, weather: pd.DataFrame):
    """Left-join the nearest weather observation onto every reading.

    For each ``(Timestamp, SiteId)`` only the weather row with the smallest
    ``Distance`` is kept, and that reduction happens *before* the merge. The
    previous version de-duplicated the merged result instead, which also
    removed legitimate readings that share a timestamp and site. Both
    ``Timestamp`` columns are parsed to datetimes.

    Parameters
    ----------
    df : pd.DataFrame
        Readings with ``Timestamp`` and ``SiteId`` columns.
    weather : pd.DataFrame
        Weather observations with ``Timestamp``, ``SiteId`` and ``Distance``
        columns.

    Returns
    -------
    pd.DataFrame
        Every row of ``df`` with the weather columns appended (NaN where no
        observation matches), sorted by ``Timestamp``, ``SiteId`` and
        ``Distance`` as before.

    Raises
    ------
    AssertionError
        If the merged frame does not have exactly ``len(df)`` rows.
    """
    join_keys = ['Timestamp', 'SiteId']

    # Sort by Distance so that keep='first' retains the closest observation.
    nearest_weather = (
        weather
        .assign(Timestamp=pd.to_datetime(weather['Timestamp']))
        .sort_values(join_keys + ['Distance'])
        .drop_duplicates(join_keys, keep='first')
    )

    readings = df.assign(Timestamp=pd.to_datetime(df['Timestamp']))
    merged = readings.merge(
        nearest_weather, how='left', on=join_keys, validate='many_to_one'
    )
    merged = merged.sort_values(join_keys + ['Distance'])

    assert len(df) == len(merged), 'New Length must match original length'

    return merged

# Join the raw weather observations onto the training frame, then the testing frame.
train, test = (add_weather(readings, weather) for readings in (train, test))

In [None]:
# Number of missing entries in each training column.
train.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# site = 2
# train_df = train[train['SiteId'] == site].sort_values(['Timestamp', 'Distance'])
# test_df = test[test['SiteId'] == site].sort_values(['Timestamp', 'Distance'])
# # train_df['Timestamp'].max()
# # test_df['Timestamp'].min()
# train_df = train_df[train_df['Timestamp'] < test_df['Timestamp'].min()]
# site_values = train_df.groupby(['year', 'month', 'mday'])['Value'].sum()
# site_values.reset_index()
# train_df = train_df.drop_duplicates(['year', 'month', 'mday']).merge(site_values, how = 'left', on = ['year', 'month', 'mday'])

# train_df.columns

In [None]:
def process(forecast_id):
    """Build the cleaned training and testing frames for one forecast.

    Reads the notebook-level ``train`` and ``test`` frames and, for the given
    ``ForecastId``:

    1. keeps one row per timestamp (the one with the smallest ``Distance``);
    2. keeps only training rows that precede the first test timestamp;
    3. fills missing ``Value`` with the training median (0 if every value is
       missing);
    4. fills missing ``Temperature`` with the training median, or drops the
       column from both frames when either frame has no temperature at all;
    5. drops ``Distance`` and ``ForecastId`` and adds ``time_diff``, the gap
       to the previous row (zero for the first row).

    Parameters
    ----------
    forecast_id
        Value of the ``ForecastId`` column to select.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(train_df, test_df)``. Two empty frames are returned when no
        training row precedes the first test timestamp.
    """
    print("Current forecast_id ", forecast_id)

    def _one_row_per_timestamp(frame):
        # Parse first so that ordering, comparison and diff are chronological;
        # the trailing copy makes the later column assignments safe.
        selected = frame[frame['ForecastId'] == forecast_id]
        return (
            selected
            .assign(Timestamp=pd.to_datetime(selected['Timestamp']))
            .sort_values(['Timestamp', 'Distance'])
            .drop_duplicates(['Timestamp'], keep='first')
            .copy()
        )

    test_df = _one_row_per_timestamp(test)
    train_df = _one_row_per_timestamp(train)

    # Filter to only use past training data
    train_df = train_df[train_df['Timestamp'] < test_df['Timestamp'].min()].copy()

    if train_df.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Impute the missing values; the imputer is only fitted when there is at
    # least one observed value to take a median from.
    if train_df['Value'].isna().all():
        train_df['Value'] = 0
    else:
        # np.nan instead of np.NaN: the capitalised alias is gone in NumPy 2.
        value_median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        train_df['Value'] = value_median_imputer.fit_transform(train_df[['Value']]).ravel()

    # If either frame has no temperature at all, drop it from both
    if train_df['Temperature'].isna().all() or test_df['Temperature'].isna().all():
        train_df = train_df.drop(labels='Temperature', axis=1)
        test_df = test_df.drop(labels='Temperature', axis=1)
    # Otherwise impute the missing temperatures with the training median
    else:
        temp_median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        temp_median_imputer.fit(train_df[['Temperature']])
        train_df['Temperature'] = temp_median_imputer.transform(train_df[['Temperature']]).ravel()
        test_df['Temperature'] = temp_median_imputer.transform(test_df[['Temperature']]).ravel()

    # Drop columns
    train_df = train_df.drop(columns=['Distance', 'ForecastId'])
    test_df = test_df.drop(columns=['Distance', 'ForecastId'])

    # The first row has no predecessor; use a zero Timedelta (not the integer
    # 0) so the column keeps a timedelta dtype.
    no_gap = pd.Timedelta(0)
    train_df['time_diff'] = train_df['Timestamp'].diff().fillna(no_gap)
    test_df['time_diff'] = test_df['Timestamp'].diff().fillna(no_gap)

    return train_df, test_df

# Despite the name, these are ForecastId values; sorted so that the order of
# the stacked output is the same on every run.
site_list = sorted(set(train['ForecastId']))

# Collect the per-forecast frames and concatenate once at the end. The earlier
# loop seeded the accumulator with the first frame and then concatenated that
# same frame again, so the first forecast's rows appeared twice.
train_parts = []
test_parts = []

for forecast_id in site_list:
    train_df, test_df = process(forecast_id)
    if not train_df.empty:
        train_parts.append(train_df)
    if not test_df.empty:
        test_parts.append(test_df)

processed_train = pd.concat(train_parts) if train_parts else pd.DataFrame()
processed_test = pd.concat(test_parts) if test_parts else pd.DataFrame()
