# import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Holidays data

In [61]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [62]:
# Load the raw holidays file; the path is relative to this notebook's folder.
holidays_csv_path = '../../data/01_raw/holidays.csv'
holidays = pd.read_csv(holidays_csv_path)

## Columns and shape

In [63]:
# Peek at the first rows of the freshly loaded frame.
holidays.head()

Unnamed: 0,DT,HOL_NM
0,2018-01-01T00:00:00.000Z,New Year's Day
1,2018-01-01T00:00:00.000Z,New Year's Day
2,2018-01-01T00:00:00.000Z,New Year's Day
3,2018-01-01T00:00:00.000Z,New Year's Day
4,2018-01-01T00:00:00.000Z,New Year's Day


In [64]:
# Inspect the column dtypes as read from the CSV, before any conversion.
holidays.dtypes

DT        object
HOL_NM    object
dtype: object

In [65]:
# Parse the DT strings into pandas timestamps so date arithmetic works later.
holidays = holidays.assign(DT=pd.to_datetime(holidays["DT"]))

In [66]:
# Re-display the first rows to check how DT looks after the conversion.
holidays.head()

Unnamed: 0,DT,HOL_NM
0,2018-01-01 00:00:00+00:00,New Year's Day
1,2018-01-01 00:00:00+00:00,New Year's Day
2,2018-01-01 00:00:00+00:00,New Year's Day
3,2018-01-01 00:00:00+00:00,New Year's Day
4,2018-01-01 00:00:00+00:00,New Year's Day


In [67]:
# Confirm that DT now carries a datetime dtype.
holidays.dtypes

DT        datetime64[ns, UTC]
HOL_NM                 object
dtype: object

In [68]:
# Row/column count before de-duplication.
holidays.shape

(32220, 2)

In [69]:
# Make sure holiday names are plain strings before any string handling.
holidays = holidays.astype({"HOL_NM": "str"})

In [70]:
# Preview the frame after casting HOL_NM to str.
holidays.head()

Unnamed: 0,DT,HOL_NM
0,2018-01-01 00:00:00+00:00,New Year's Day
1,2018-01-01 00:00:00+00:00,New Year's Day
2,2018-01-01 00:00:00+00:00,New Year's Day
3,2018-01-01 00:00:00+00:00,New Year's Day
4,2018-01-01 00:00:00+00:00,New Year's Day


## Dropping duplicate rows from the holidays data

In [71]:
# Keep only the first occurrence of each fully identical row.
is_repeated_row = holidays.duplicated(keep="first")
holidays = holidays.loc[~is_repeated_row]

In [72]:
# Preview the frame after dropping duplicate rows (original row labels are kept).
holidays.head()

Unnamed: 0,DT,HOL_NM
0,2018-01-01 00:00:00+00:00,New Year's Day
59,2018-01-06 00:00:00+00:00,Epiphany
113,2018-03-30 00:00:00+00:00,Good Friday
176,2018-05-01 00:00:00+00:00,Labour Day
226,2018-08-15 00:00:00+00:00,Assumption Day


In [73]:
# Row/column count after de-duplication.
holidays.shape

(90, 2)

## Checks

### Check that each holiday type has a single, consistent name

In [74]:
# List the distinct holiday names to spot spelling variants of the same holiday.
holidays["HOL_NM"].unique()

array(["New Year's Day", 'Epiphany', 'Good Friday', 'Labour Day',
       'Assumption Day', 'Fiesta Nacional de España', 'All Saints Day',
       'Constitution Day', 'Immaculate Conception', 'Christmas Day'],
      dtype=object)

## Feature engineering

In [75]:
### Assigning each date to its week (labelled by that week's Monday)

In [76]:
# Label every holiday with the Monday (00:00) of its week. The weekday number
# (Monday == 0) is subtracted as a day offset for the whole column at once via
# the .dt accessor instead of a per-row lambda. normalize() drops any
# time-of-day so two holidays in the same week always share one Week key.
holidays['Week'] = (
    holidays['DT'].dt.normalize()
    - pd.to_timedelta(holidays['DT'].dt.weekday, unit='D')
)

In [77]:
# Preview the frame with the new Week column.
holidays.head()

Unnamed: 0,DT,HOL_NM,Week
0,2018-01-01 00:00:00+00:00,New Year's Day,2018-01-01 00:00:00+00:00
59,2018-01-06 00:00:00+00:00,Epiphany,2018-01-01 00:00:00+00:00
113,2018-03-30 00:00:00+00:00,Good Friday,2018-03-26 00:00:00+00:00
176,2018-05-01 00:00:00+00:00,Labour Day,2018-04-30 00:00:00+00:00
226,2018-08-15 00:00:00+00:00,Assumption Day,2018-08-13 00:00:00+00:00


In [78]:
# Sanity check: the only weekday present in Week should be 0 (Monday).
holidays["Week"].dt.weekday.unique()

array([0], dtype=int32)

All the `Week` dates fall on a Monday.

### Now several holidays can share the same week (duplicate `Week` values). What can we do with them?

#### Multiple possibilities:
- Generate a flag column that marks weeks containing a holiday
- Add the number of holidays in each week
- Generate a dichotomous (0/1) feature for each holiday type
- ...

In [79]:
# Distinct holiday names, in order of first appearance.
type_of_holiday = pd.unique(holidays["HOL_NM"])

In [80]:
# Turn every holiday name into a compact identifier that can be used as a
# column name: drop the Spanish connector " de ", remove remaining spaces,
# replace "ñ" with "n" and strip apostrophes. The replacements run in that
# order on the whole column at once, so the cell does not depend on a
# previously computed list of names and is safe to re-run.
holidays["HOL_NM"] = (
    holidays["HOL_NM"]
    .str.replace(" de ", "", regex=False)
    .str.replace(" ", "", regex=False)
    .str.replace("ñ", "n", regex=False)
    .str.replace("'", "", regex=False)
)

In [81]:
# Distinct holiday names after normalisation.
holidays["HOL_NM"].unique()

array(['NewYearsDay', 'Epiphany', 'GoodFriday', 'LabourDay',
       'AssumptionDay', 'FiestaNacionalEspana', 'AllSaintsDay',
       'ConstitutionDay', 'ImmaculateConception', 'ChristmasDay'],
      dtype=object)

In [82]:
# One 0/1 indicator column per holiday type, named after the holiday.
# The comparison is evaluated on the whole column at once and cast to int64,
# giving the same 1/0 values as a per-row check.
for column in holidays["HOL_NM"].unique():
    holidays[column] = (holidays["HOL_NM"] == column).astype("int64")

In [83]:
# Display the frame with the per-holiday indicator columns.
holidays

Unnamed: 0,DT,HOL_NM,Week,NewYearsDay,Epiphany,GoodFriday,LabourDay,AssumptionDay,FiestaNacionalEspana,AllSaintsDay,ConstitutionDay,ImmaculateConception,ChristmasDay
0,2018-01-01 00:00:00+00:00,NewYearsDay,2018-01-01 00:00:00+00:00,1,0,0,0,0,0,0,0,0,0
59,2018-01-06 00:00:00+00:00,Epiphany,2018-01-01 00:00:00+00:00,0,1,0,0,0,0,0,0,0,0
113,2018-03-30 00:00:00+00:00,GoodFriday,2018-03-26 00:00:00+00:00,0,0,1,0,0,0,0,0,0,0
176,2018-05-01 00:00:00+00:00,LabourDay,2018-04-30 00:00:00+00:00,0,0,0,1,0,0,0,0,0,0
226,2018-08-15 00:00:00+00:00,AssumptionDay,2018-08-13 00:00:00+00:00,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4734,2026-10-12 00:00:00+00:00,FiestaNacionalEspana,2026-10-12 00:00:00+00:00,0,0,0,0,0,1,0,0,0,0
4795,2026-11-01 00:00:00+00:00,AllSaintsDay,2026-10-26 00:00:00+00:00,0,0,0,0,0,0,1,0,0,0
4868,2026-12-06 00:00:00+00:00,ConstitutionDay,2026-11-30 00:00:00+00:00,0,0,0,0,0,0,0,1,0,0
4927,2026-12-08 00:00:00+00:00,ImmaculateConception,2026-12-07 00:00:00+00:00,0,0,0,0,0,0,0,0,1,0


In [84]:
# Count how many holiday rows fall in each week and broadcast that count back
# onto every row of the week. Picking a single column first makes transform()
# return a Series aligned with holidays' index, rather than relying on how a
# frame-wide transform treats "size".
holidays["number_of_holidays"] = holidays.groupby("Week")["Week"].transform("size")

Removing unnecessary columns

In [85]:
# The daily date is no longer needed once every row carries its Week.
holidays = holidays.drop(columns="DT")

In [86]:
# The holiday name is now encoded by the indicator columns, so drop it.
holidays = holidays.drop(columns="HOL_NM")

and aggregating all the features generated in the steps above by the week column

In [87]:
# Collapse to one row per week by taking the column-wise maximum within each
# week (an indicator becomes 1 if any row of the week had it set).
# as_index=False keeps Week as an ordinary column with a fresh 0..n-1 index.
holidays = holidays.groupby("Week", as_index=False).max()

In [88]:
# Display the weekly aggregate (one row per week).
holidays

Unnamed: 0,Week,NewYearsDay,Epiphany,GoodFriday,LabourDay,AssumptionDay,FiestaNacionalEspana,AllSaintsDay,ConstitutionDay,ImmaculateConception,ChristmasDay,number_of_holidays
0,2018-01-01 00:00:00+00:00,1,1,0,0,0,0,0,0,0,0,2
1,2018-03-26 00:00:00+00:00,0,0,1,0,0,0,0,0,0,0,1
2,2018-04-30 00:00:00+00:00,0,0,0,1,0,0,0,0,0,0,1
3,2018-08-13 00:00:00+00:00,0,0,0,0,1,0,0,0,0,0,1
4,2018-10-08 00:00:00+00:00,0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
76,2026-10-12 00:00:00+00:00,0,0,0,0,0,1,0,0,0,0,1
77,2026-10-26 00:00:00+00:00,0,0,0,0,0,0,1,0,0,0,1
78,2026-11-30 00:00:00+00:00,0,0,0,0,0,0,0,1,0,0,1
79,2026-12-07 00:00:00+00:00,0,0,0,0,0,0,0,0,1,0,1


In [89]:
# Every remaining row is a week with at least one holiday, so the flag is 1.
# NOTE(review): presumably non-holiday weeks get 0 after a later join with the
# full calendar; that step is not visible here, so confirm.
holidays = holidays.assign(is_holiday=1)

In [90]:
# Display the weekly holiday features including the is_holiday flag.
holidays

Unnamed: 0,Week,NewYearsDay,Epiphany,GoodFriday,LabourDay,AssumptionDay,FiestaNacionalEspana,AllSaintsDay,ConstitutionDay,ImmaculateConception,ChristmasDay,number_of_holidays,is_holiday
0,2018-01-01 00:00:00+00:00,1,1,0,0,0,0,0,0,0,0,2,1
1,2018-03-26 00:00:00+00:00,0,0,1,0,0,0,0,0,0,0,1,1
2,2018-04-30 00:00:00+00:00,0,0,0,1,0,0,0,0,0,0,1,1
3,2018-08-13 00:00:00+00:00,0,0,0,0,1,0,0,0,0,0,1,1
4,2018-10-08 00:00:00+00:00,0,0,0,0,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,2026-10-12 00:00:00+00:00,0,0,0,0,0,1,0,0,0,0,1,1
77,2026-10-26 00:00:00+00:00,0,0,0,0,0,0,1,0,0,0,1,1
78,2026-11-30 00:00:00+00:00,0,0,0,0,0,0,0,1,0,0,1,1
79,2026-12-07 00:00:00+00:00,0,0,0,0,0,0,0,0,1,0,1,1
