In [1]:
import pandas as pd
import numpy as np
import holidays
from sklearn.pipeline import Pipeline, FeatureUnion, TransformerMixin
from os import path

In [2]:
# Produces a pickled dataframe with columns 
# ['building_id','site_id','timestamp','holiday'].
# This allows fast merges in the main file
# but we are likely to want to add other columns or
# filters by building id

In [3]:
file_dtype = {
    'train': {'building_id': np.int16, 'meter': np.int8, 'meter_reading': np.float32},
    'test': {'building_id': np.int16, 'meter': np.int8},
    'building_metadata': {'site_id': np.int8, 'building_id': np.uint16, 'square_feet': np.float16, 'year_built': np.float16, 'floor_count': np.float16},
}

file_loc = {}    
for dir_path in ['../input/ashrae-energy-prediction/','../input/_ashrae-energy-prediction/']:
    for name in ['building_metadata','weather_train','weather_test','train','test']:
        if path.exists(dir_path + name + '.csv'):
            file_loc[name]= dir_path + name + '.csv'
    
    building = pd.read_csv(file_loc['building_metadata'], dtype=file_dtype['building_metadata'])
    train = pd.read_csv(file_loc['train'], dtype=file_dtype['train'])
    test = pd.read_csv(file_loc['test'], dtype=file_dtype['test'])



In [25]:
all = train.drop(['meter_reading'], axis=1).append(test.drop(['row_id'], axis=1))
all = all.merge(building, on='building_id', how='left')
all = all[['site_id','timestamp']]
all = all.drop_duplicates(subset=['site_id','timestamp'])
print(all)

          site_id            timestamp
0               0  2016-01-01 00:00:00
103             1  2016-01-01 00:00:00
166             2  2016-01-01 00:00:00
453             3  2016-01-01 00:00:00
722             4  2016-01-01 00:00:00
804             5  2016-01-01 00:00:00
893             6  2016-01-01 00:00:00
959             7  2016-01-01 00:00:00
1001            8  2016-01-01 00:00:00
1069            9  2016-01-01 00:00:00
1375           10  2016-01-01 00:00:00
1423           11  2016-01-01 00:00:00
1434           12  2016-01-01 00:00:00
1470           13  2016-01-01 00:00:00
1778           14  2016-01-01 00:00:00
2064           15  2016-01-01 00:00:00
2301            0  2016-01-01 01:00:00
2404            1  2016-01-01 01:00:00
2467            2  2016-01-01 01:00:00
2752            3  2016-01-01 01:00:00
3021            4  2016-01-01 01:00:00
3103            5  2016-01-01 01:00:00
3192            6  2016-01-01 01:00:00
3258            7  2016-01-01 01:00:00
3300            8  2016-0

In [26]:
in_us = [0,2,3,4,6,8,9,10,13,14,15]
in_ca = [7,11]
in_uk = [1,5]
in_ie = [12]
us_cal =  holidays.US()
ca_cal = holidays.CA()
ie_cal = holidays.IE()
uk_cal = holidays.UK()

def holidayName(timestamp, site_id):
    if site_id in in_ca:
        return ca_cal.get(timestamp)
    elif site_id in in_uk:
        return uk_cal.get(timestamp)
    elif site_id in in_ie:
        return ie_cal.get(timestamp)
    else:
        return us_cal.get(timestamp)
    
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114483#latest-660771
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/114874#latest-660970
class createHolidaysDF(TransformerMixin):
    def transform(self, df, **transform_params):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['holiday'] = df.apply(lambda x: holidayName(x.timestamp, x.site_id), axis=1)
        #df['holiday'] = df.apply(lambda x: all_holidays.get(x))
 #       df['holiday'] = df.apply(lambda x: holidayName(x.timestamp, x.site_id))
        df['holiday'] = df['holiday'].astype('category')
        return df

    def fit(self, X, y=None, **fit_params):
        return self

# quick test you should see new years days marked on the 1st and NaN for the 2nd
print(createHolidaysDF().transform(all.head(100000)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


          site_id           timestamp         holiday
0               0 2016-01-01 00:00:00  New Year's Day
103             1 2016-01-01 00:00:00  New Year's Day
166             2 2016-01-01 00:00:00  New Year's Day
453             3 2016-01-01 00:00:00  New Year's Day
722             4 2016-01-01 00:00:00  New Year's Day
804             5 2016-01-01 00:00:00  New Year's Day
893             6 2016-01-01 00:00:00  New Year's Day
959             7 2016-01-01 00:00:00  New Year's Day
1001            8 2016-01-01 00:00:00  New Year's Day
1069            9 2016-01-01 00:00:00  New Year's Day
1375           10 2016-01-01 00:00:00  New Year's Day
1423           11 2016-01-01 00:00:00  New Year's Day
1434           12 2016-01-01 00:00:00  New Year's Day
1470           13 2016-01-01 00:00:00  New Year's Day
1778           14 2016-01-01 00:00:00  New Year's Day
2064           15 2016-01-01 00:00:00  New Year's Day
2301            0 2016-01-01 01:00:00  New Year's Day
2404            1 2016-01-01

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [27]:
holiday_df = createHolidaysDF().transform(all).dropna(axis=0)
print(holiday_df['holiday'].unique())

[New Year's Day, New Year Holiday [Scotland], New Year Holiday [Scotland] (Observed), Martin Luther King, Jr. Day, Washington's Birthday, ..., Veterans Day (Observed), New Year Holiday [Scotland], New Year's Day (O..., St. Patrick's Day [Northern Ireland] (Observed), Canada Day (Observed), St. Patrick's Day (Observed)]
Length: 40
Categories (40, object): [New Year's Day, New Year Holiday [Scotland], New Year Holiday [Scotland] (Observed), Martin Luther King, Jr. Day, ..., New Year Holiday [Scotland], New Year's Day (O..., St. Patrick's Day [Northern Ireland] (Observed), Canada Day (Observed), St. Patrick's Day (Observed)]


In [31]:
print(holiday_df.sample(100))

          site_id           timestamp  \
8101061         1 2016-05-30 02:00:00   
45230426       10 2017-01-02 14:00:00   
32000385        3 2018-05-28 20:00:00   
45247176       10 2017-01-16 14:00:00   
29839895        3 2017-07-04 06:00:00   
54071317       14 2017-09-04 05:00:00   
54628021       14 2017-11-23 18:00:00   
25272384        2 2017-09-04 03:00:00   
49772977       13 2018-01-15 20:00:00   
10107736       10 2016-07-04 17:00:00   
17403757        8 2016-11-11 16:00:00   
26100947        2 2018-01-01 14:00:00   
19920729        0 2016-12-26 19:00:00   
45619526       10 2017-11-23 12:00:00   
54542485       14 2017-11-11 09:00:00   
54543925       14 2017-11-11 14:00:00   
8125158         4 2016-05-30 12:00:00   
46522894       12 2017-08-07 08:00:00   
38132778        7 2017-07-03 09:00:00   
27550           0 2016-01-01 12:00:00   
19889155       11 2016-12-26 05:00:00   
18143691       10 2016-11-24 21:00:00   
45605076       10 2017-11-11 11:00:00   
21498102        

In [32]:
holiday_df.to_pickle('../input/ashrae-energy-prediction-pickles/holiday_df.pickle')