In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the competition's train and test splits into DataFrames.
train, test = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-jan-2022/train.csv",
        "../input/tabular-playground-series-jan-2022/test.csv",
    ),
)

In [None]:
# Distinct values of the `country` column in the training split.
train["country"].unique()

In [None]:
# Same check on the test split: distinct values of `country`.
test["country"].unique()

In [None]:
# Smallest and largest value of the `date` column in the training split.
(train["date"].min(), train["date"].max())

In [None]:
# Smallest and largest value of the `date` column in the test split.
(test["date"].min(), test["date"].max())

# Festivities

Festivities are important in time series analysis. If the data is realistic, they should also be important in this competition. Let's take a look at how we can get them for the different countries.

In [None]:
# Accumulator for [date, holiday name, country] rows; the cells below append to it.
holiday_list = []

In [None]:
import holidays
import dateutil.easter as easter

# Print every Finnish holiday the `holidays` package reports for 2015-2019
# and record it as a [date, name, country] row.
print("--- FINLAND ---")
finland_calendar = holidays.Finland(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for holiday_date, holiday_name in finland_calendar.items():
    print(str(holiday_date), holiday_name)
    holiday_list.append([holiday_date, holiday_name, "Finland"])

In [None]:
# Print every Norwegian holiday the `holidays` package reports for 2015-2019
# and record it as a [date, name, country] row.
print("--- NORWAY ---")
norway_calendar = holidays.Norway(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for holiday_date, holiday_name in norway_calendar.items():
    print(str(holiday_date), holiday_name)
    holiday_list.append([holiday_date, holiday_name, "Norway"])

In [None]:
# Print and record the Swedish holidays for 2015-2019, one [date, name, country] row each.
print("--- SWEDEN ---")
sweden_calendar = holidays.Sweden(years=[2015, 2016, 2017, 2018, 2019], observed=True)
for holiday_date, raw_name in sweden_calendar.items():
    # Entries whose whole name is 'Söndag' are skipped entirely.
    if raw_name == 'Söndag':
        continue
    # For the remaining entries, strip a ", Söndag" suffix from the name if present.
    holiday_name = raw_name.replace(", Söndag", "")
    print(str(holiday_date), holiday_name)
    holiday_list.append([holiday_date, holiday_name, "Sweden"])

Let's add some special dates and events that are meaningful for the competition.

In [None]:
# Hand-picked dates and events, appended after the official calendars.
# Every row has the shape [datetime.date, label, country].
nordic_countries = ['Finland', 'Sweden', 'Norway']
competition_years = [2015, 2016, 2017, 2018, 2019]

# Last week of the year: 24-31 December, labelled day 1 through day 8, for all countries.
holiday_list.extend(
    [pd.to_datetime(f"{yr}-12-{dom}").date(), f"Last week of the year (day {offset + 1})", nation]
    for yr in competition_years
    for offset, dom in enumerate(range(24, 32))
    for nation in nordic_countries
)

# Swedish Rock Concert: (first day, last day, year) in June, Sweden only.
# NOTE(review): 2018 is the only span covering five days (6-10) rather than four — confirm.
rock_concert_spans = [(3, 6, 2015), (8, 11, 2016), (7, 10, 2017), (6, 10, 2018), (5, 8, 2019)]
holiday_list.extend(
    [pd.to_datetime(f"{yr}-6-{dom}").date(), f"Swedish Rock Concert (day {offset + 1})", "Sweden"]
    for first_day, last_day, yr in rock_concert_spans
    for offset, dom in enumerate(range(first_day, last_day + 1))
)

# Last Wednesday of June, one hard-coded date per year, for all countries.
last_june_wednesdays = ['2015-06-24', '2016-06-29', '2017-06-28', '2018-06-27', '2019-06-26']
holiday_list.extend(
    [pd.to_datetime(day_text).date(), "Last Wednesday of June", nation]
    for day_text in last_june_wednesdays
    for nation in nordic_countries
)

# First Sunday of November, one hard-coded date per year, for all countries.
first_november_sundays = ['2015-11-1', '2016-11-6', '2017-11-5', '2018-11-4', '2019-11-3']
holiday_list.extend(
    [pd.to_datetime(day_text).date(), "First Sunday of November", nation]
    for day_text in first_november_sundays
    for nation in nordic_countries
)

# Independence Day of Finland: 6 December of each year, Finland only.
holiday_list.extend(
    [pd.to_datetime(f"{yr}-12-6").date(), "Independence Day of Finland", 'Finland']
    for yr in competition_years
)

# Easter: one date per year from dateutil's easter.easter, for all countries.
easter_date = [easter.easter(yr) for yr in competition_years]
holiday_list.extend(
    [pd.to_datetime(easter_day).date(), "Easter", nation]
    for easter_day in easter_date
    for nation in nordic_countries
)
    

Finally, let's turn all the dates into a pandas DataFrame.

In [None]:
# Build the table from the collected rows; when several rows share a
# (date, country) pair, only the one appended first is kept.
holidays_df = (
    pd.DataFrame(holiday_list, columns=['date', 'holiday', 'country'])
    .drop_duplicates(subset=['date', 'country'], keep='first')
)
# Write the result to a CSV file in the working directory.
holidays_df.to_csv("nordic_holidays.csv")

## Happy Kaggling!