# Extract Data that is not included in Dataset
Extract time series information such as 
   * day of week 
   * season
   * holidays by country
   
Only run once, which is why I am putting in jupyter notebook

In [2]:
import pandas as pd
import numpy as np
import holidays
from datetime import date, datetime

In [3]:
input_dir = "../data/RE-Europe/"
output_dir = "../data/processed/"

## Data Extraction
Day of week using date time. Seasons using predefined map. Holidays using node countries and holidays package 

In [4]:
nodes = pd.read_csv(input_dir + "Static_data/network_nodes.csv")
nodes.head()

Unnamed: 0,ID,name,country,voltage,latitude,longitude
0,1,P-1,POR,380,42.094674,-8.113982
1,2,P-2,POR,380,41.587435,-8.326462
2,3,P-3,POR,380,41.585994,-6.722728
3,4,P-4,POR,380,41.597684,-6.342848
4,5,P-5,POR,380,41.373957,-7.350949


In [7]:
# generate raw time series for covered period
date_df = df = pd.DataFrame(
        {'Time': pd.date_range('2012-01-01', '2015-01-01', freq='1H', closed='left')}
     )
dates = pd.to_datetime(date_df['Time'], unit = "d").dt.date
date_df['date'] = dates

dates = dates.unique()
date_df

Unnamed: 0,Time,date
0,2012-01-01 00:00:00,2012-01-01
1,2012-01-01 01:00:00,2012-01-01
2,2012-01-01 02:00:00,2012-01-01
3,2012-01-01 03:00:00,2012-01-01
4,2012-01-01 04:00:00,2012-01-01
...,...,...
26299,2014-12-31 19:00:00,2014-12-31
26300,2014-12-31 20:00:00,2014-12-31
26301,2014-12-31 21:00:00,2014-12-31
26302,2014-12-31 22:00:00,2014-12-31


### Extract Day of Week and Seasons

In [8]:
# define season information
Y=2012
seasons = [('winter', (date(Y,  1,  1),  date(Y,  3, 20))),
           ('spring', (date(Y,  3, 21),  date(Y,  6, 20))),
           ('summer', (date(Y,  6, 21),  date(Y,  9, 22))),
           ('autumn', (date(Y,  9, 23),  date(Y, 12, 20))),
           ('winter', (date(Y, 12, 21),  date(Y, 12, 31)))]

def get_season(date, seasons):
    # update season 
    Y = date.year
    seasons = [(season, (start.replace(year = Y), end.replace(year=Y))) 
               for season, (start, end) in seasons]
    
    # extract season
    return [season for season, (start, end) in seasons
                if start <= date <= end][0]

get_season(date_df.loc[1000, "date"], seasons)

'winter'

In [9]:
# extract day of week, month, year and season
date_df['dow'] = date_df.date.apply(lambda x: x.weekday())
date_df['month'] = date_df.date.apply(lambda x: x.month)
date_df['year'] = date_df.date.apply(lambda x: x.year)
date_df['hour'] = date_df['Time'].dt.hour
date_df['season'] = date_df.date.apply(lambda x: get_season(x, seasons))
date_df.head()

Unnamed: 0,Time,date,dow,month,year,hour,season
0,2012-01-01 00:00:00,2012-01-01,6,1,2012,0,winter
1,2012-01-01 01:00:00,2012-01-01,6,1,2012,1,winter
2,2012-01-01 02:00:00,2012-01-01,6,1,2012,2,winter
3,2012-01-01 03:00:00,2012-01-01,6,1,2012,3,winter
4,2012-01-01 04:00:00,2012-01-01,6,1,2012,4,winter


### Extract Holidays

In [10]:
# map country tags to work versions that work with holidays package
country_dict = {'POR': "PT",
             'ESP': "ES",
             'FRA': "FRA",
             'BEL': "BE",
             'CHE': "CH",
             'LUX': "LU",
             'NLD': "NL",
             'ITA': "IT",
             'DEU': "DE",
             'AUT': "AT",
             'DNK': "DK",
             'CZE': "CZ",
             'POL': "PL",
             'HUN': "HU",
             'SVK': "SK",
             'SVN': "SI",
             'HRV': "HR",
             'GRC': "GR",
             'ALB': None,
             'MKD': None,
             'BGR': "BG",
             'MNE': None,
             'BIH': None,
             'SRB': "RS",
             'ROU': None}

In [11]:
# generate our holiday lookup dictionary
holiday_dict = {}
countries = list(nodes.country.unique())

for y in [2012,2013,2014]:
    for c in countries:
        try: 
            hol = list(holidays.CountryHoliday(country_dict[c],years=y).keys())
            holiday_dict[c] += hol
        except KeyError:
            holiday_dict[c] = hol

holiday_dict

{'POR': [datetime.date(2012, 1, 1),
  datetime.date(2012, 4, 6),
  datetime.date(2012, 4, 8),
  datetime.date(2012, 6, 7),
  datetime.date(2012, 10, 5),
  datetime.date(2012, 11, 1),
  datetime.date(2012, 12, 1),
  datetime.date(2012, 4, 25),
  datetime.date(2012, 5, 1),
  datetime.date(2012, 6, 10),
  datetime.date(2012, 8, 15),
  datetime.date(2012, 12, 8),
  datetime.date(2012, 12, 25),
  datetime.date(2013, 1, 1),
  datetime.date(2013, 3, 29),
  datetime.date(2013, 3, 31),
  datetime.date(2013, 4, 25),
  datetime.date(2013, 5, 1),
  datetime.date(2013, 6, 10),
  datetime.date(2013, 8, 15),
  datetime.date(2013, 12, 8),
  datetime.date(2013, 12, 25),
  datetime.date(2014, 1, 1),
  datetime.date(2014, 4, 18),
  datetime.date(2014, 4, 20),
  datetime.date(2014, 4, 25),
  datetime.date(2014, 5, 1),
  datetime.date(2014, 6, 10),
  datetime.date(2014, 8, 15),
  datetime.date(2014, 12, 8),
  datetime.date(2014, 12, 25)],
 'ESP': [datetime.date(2012, 1, 1),
  datetime.date(2012, 1, 6),
  d

In [12]:
# now for each country generate the time series values
# I will then loop through every node / country combination and add the columns
holiday_time_dict = {}
for c in countries:
    holiday_time_dict[c] = date_df.date.apply(lambda x: 1 if x in holiday_dict[c] else 0)
    
holiday_time_dict

{'POR': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300    0
 26301    0
 26302    0
 26303    0
 Name: date, Length: 26304, dtype: int64, 'ESP': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300    0
 26301    0
 26302    0
 26303    0
 Name: date, Length: 26304, dtype: int64, 'FRA': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300    0
 26301    0
 26302    0
 26303    0
 Name: date, Length: 26304, dtype: int64, 'BEL': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300    0
 26301    0
 26302    0
 26303    0
 Name: date, Length: 26304, dtype: int64, 'CHE': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300    0
 26301    0
 26302    0
 26303    0
 Name: date, Length: 26304, dtype: int64, 'LUX': 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 26299    0
 26300   

In [13]:
for i, row in nodes.iterrows():
    node_id = row.ID
    country = row.country
    date_df[str(node_id)] = holiday_time_dict[country]
    
date_df

Unnamed: 0,Time,date,dow,month,year,hour,season,1,2,3,...,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514
0,2012-01-01 00:00:00,2012-01-01,6,1,2012,0,winter,1,1,1,...,1,1,1,1,1,1,1,0,1,1
1,2012-01-01 01:00:00,2012-01-01,6,1,2012,1,winter,1,1,1,...,1,1,1,1,1,1,1,0,1,1
2,2012-01-01 02:00:00,2012-01-01,6,1,2012,2,winter,1,1,1,...,1,1,1,1,1,1,1,0,1,1
3,2012-01-01 03:00:00,2012-01-01,6,1,2012,3,winter,1,1,1,...,1,1,1,1,1,1,1,0,1,1
4,2012-01-01 04:00:00,2012-01-01,6,1,2012,4,winter,1,1,1,...,1,1,1,1,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,2014-12-31 19:00:00,2014-12-31,2,12,2014,19,winter,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26300,2014-12-31 20:00:00,2014-12-31,2,12,2014,20,winter,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26301,2014-12-31 21:00:00,2014-12-31,2,12,2014,21,winter,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26302,2014-12-31 22:00:00,2014-12-31,2,12,2014,22,winter,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# drop date column and export 
date_df.drop(columns = "date", inplace = True)
date_df.to_csv(output_dir + "holidays and seasons.csv", index = False)