In [1]:
import pandas as pd
import numpy as np
import holidays
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
from os import path

In [2]:
# Produces a pickled dataframe with columns
# ['building_id', 'meter', 'site_id', 'timestamp', 'holiday'].
# This allows fast merges in the main file,
# but we are likely to want to add other columns or
# filter by building id later.

In [3]:
# Compact dtypes applied while parsing each CSV.
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {
        'site_id': np.int8,
        'building_id': np.uint16,
        # float16 tops out at 65504 and only holds integers exactly up to 2048,
        # so floor areas are read as float32 to avoid overflow / rounding.
        'square_feet': np.float32,
        'year_built': np.float16,
        'floor_count': np.float16,
    },
}

# Search both candidate input directories; when a file exists in more than
# one of them, the directory listed last wins.
file_loc = {}
for dir_path in ['../input/ashrae-energy-prediction/', '../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata', 'weather_train', 'weather_test', 'train', 'test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name] = dir_path + name + '.csv'

# Fail with a readable message instead of a bare KeyError on file_loc[...].
missing = [name for name in ('building_metadata', 'train', 'test') if name not in file_loc]
if missing:
    raise FileNotFoundError('Could not locate input CSV(s): ' + ', '.join(missing))

# Load only after the directory search has finished. Reading inside the loop
# would fail on the first pass when the data lives only in the second
# directory, and would parse every file twice otherwise.
building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])



In [4]:
# Stack the train and test rows (minus their non-shared columns) into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: `all` shadows the Python builtin; the name is kept because later cells use it.
all = pd.concat([
    train.drop(['meter_reading'], axis=1),
    test.drop(['row_id'], axis=1),
])

In [5]:
# Site ids grouped by country, each paired with the holiday calendar used for it.
# United States
in_us = [0, 2, 3, 4, 6, 8, 9, 10, 13, 14]
us_cal = holidays.US()

# Canada
in_ca = [7, 11]
ca_cal = holidays.CA()

# United Kingdom
in_uk = [1, 5]
uk_cal = holidays.UK()

# Ireland
in_ie = [12]
ie_cal = holidays.IE()

def holidayName(timestamp, site_id):
    """Look up `timestamp` in the holiday calendar chosen for `site_id`.

    Sites listed in `in_ca`, `in_uk` and `in_ie` use the Canadian, UK and
    Irish calendars respectively; every other site id falls back to the US
    calendar. Returns whatever the calendar's `.get` returns for the
    timestamp.
    """
    if site_id in in_ca:
        return ca_cal.get(timestamp)
    if site_id in in_uk:
        return uk_cal.get(timestamp)
    if site_id in in_ie:
        return ie_cal.get(timestamp)
    # Anything not matched above is treated as a US site.
    return us_cal.get(timestamp)
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class createHolidaysDF(TransformerMixin):
    """Transformer that adds a categorical ``holiday`` column to a frame.

    The input frame must contain ``site_id`` and ``timestamp`` columns. The
    holiday name for each row comes from ``holidayName(timestamp, site_id)``;
    rows with no holiday end up as NaN in the resulting category column.
    """

    def transform(self, df, **transform_params):
        """Return a copy of ``df`` with a parsed ``timestamp`` and a ``holiday`` column.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with ``site_id`` and ``timestamp`` columns. It is not modified.
        **transform_params
            Accepted for pipeline compatibility; unused.

        Returns
        -------
        pandas.DataFrame
            Same rows, index and columns as ``df``, with ``timestamp`` converted
            by ``pd.to_datetime`` and ``holiday`` added as a category column.
        """
        # Work on a shallow copy so the new columns are attached to the copy
        # rather than to the caller's frame, without duplicating the data.
        out = df.copy(deep=False)
        out['timestamp'] = pd.to_datetime(out['timestamp'])

        # The holiday only depends on the site and the calendar day, so look it
        # up once per distinct (site_id, day) pair instead of once per row.
        keys = pd.DataFrame({
            'site_id': out['site_id'].reset_index(drop=True),
            'day': out['timestamp'].dt.normalize().reset_index(drop=True),
        })
        lookup = keys.drop_duplicates().reset_index(drop=True)
        lookup['holiday'] = [
            holidayName(day, site_id)
            for site_id, day in zip(lookup['site_id'], lookup['day'])
        ]

        # A left merge keeps the row order of `keys`, and `lookup` has one row
        # per key, so the result lines up positionally with `out`.
        names = keys.merge(lookup, on=['site_id', 'day'], how='left')['holiday']
        out['holiday'] = names.values
        out['holiday'] = out['holiday'].astype('category')
        return out

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

# Quick test on the first 100,000 rows: New Year's Day should be marked on
# the 1st, with NaN for the 2nd.
print(
    createHolidaysDF()
    .transform(
        all.head(100000).merge(building, how='left', on='building_id')
    )[['building_id', 'meter', 'timestamp', 'holiday']]
)


       building_id  meter           timestamp         holiday
0                0      0 2016-01-01 00:00:00  New Year's Day
1                1      0 2016-01-01 00:00:00  New Year's Day
2                2      0 2016-01-01 00:00:00  New Year's Day
3                3      0 2016-01-01 00:00:00  New Year's Day
4                4      0 2016-01-01 00:00:00  New Year's Day
5                5      0 2016-01-01 00:00:00  New Year's Day
6                6      0 2016-01-01 00:00:00  New Year's Day
7                7      0 2016-01-01 00:00:00  New Year's Day
8                8      0 2016-01-01 00:00:00  New Year's Day
9                9      0 2016-01-01 00:00:00  New Year's Day
10              10      0 2016-01-01 00:00:00  New Year's Day
11              11      0 2016-01-01 00:00:00  New Year's Day
12              12      0 2016-01-01 00:00:00  New Year's Day
13              13      0 2016-01-01 00:00:00  New Year's Day
14              14      0 2016-01-01 00:00:00  New Year's Day
15      

In [6]:
# Attach building metadata to every train/test row, label holidays, and keep
# only the columns needed downstream.
holiday_df = createHolidaysDF().transform(
    all.merge(building, how='left', on='building_id')
)[['building_id', 'meter', 'site_id', 'timestamp', 'holiday']]

# Show the distinct holiday labels that were produced.
print(holiday_df['holiday'].unique())

[New Year's Day, NaN, New Year Holiday [Scotland], New Year Holiday [Scotland] (Observed), Martin Luther King, Jr. Day, ..., Veterans Day (Observed), New Year Holiday [Scotland], New Year's Day (O..., St. Patrick's Day [Northern Ireland] (Observed), Canada Day (Observed), St. Patrick's Day (Observed)]
Length: 41
Categories (40, object): [New Year's Day, New Year Holiday [Scotland], New Year Holiday [Scotland] (Observed), Martin Luther King, Jr. Day, ..., New Year Holiday [Scotland], New Year's Day (O..., St. Patrick's Day [Northern Ireland] (Observed), Canada Day (Observed), St. Patrick's Day (Observed)]


In [7]:
# Save the holiday-labelled frame as a pickle. The destination is hard-coded to
# the first candidate input directory, whichever directory the CSVs came from.
holiday_df.to_pickle('../input/ashrae-energy-prediction/holiday_df.pickle')