In [1]:
from pathlib import Path
import pandas as pd
import re

from jours_feries_france import JoursFeries
from vacances_scolaires_france import SchoolHolidayDates

In [2]:
# Load the training set from the data directory next to the notebook folder.
data = pd.read_parquet(Path("../data", "train.parquet"))

In [9]:
# Quick size check of the loaded frame: (number of rows, number of columns).
data.shape

(455163, 11)

In [3]:
def _encode_dates(X):
    """
    """
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [4]:
# Encode the timestamps, then list the distinct years covered by the data.
encoded_dates = _encode_dates(data[["date"]])
encoded_dates["year"].unique()

array([2020, 2021])

In [5]:
def _additional_date_variables(X):
    """
    """
    X = X.copy()  # modify a copy of X

    # add seasons
    seasons = {1: "winter", 2: "winter", 3: "spring", 4: "spring", 
               5: "spring", 6: "summer", 7: "summer", 8: "summer", 
               9: "autumn", 10: "autumn", 11: "autumn", 12: "winter"}
    X.loc[:, "season"] = X["date"].dt.month.map(seasons)

    public_holidays = []
    school_holidays = {}
    for year in X["date"].dt.year.unique():
        public_holidays.extend(JoursFeries.for_year(year).values())
        school_holidays.update(SchoolHolidayDates().holidays_for_year_and_zone(year, 'C'))
    
    # add public holidays
    X.loc[:, "public_holiday"] = X["date"].isin(public_holidays)

    # add school holidays names
    school_holidays_name = {k: re.sub("\s+|'", '_', 
                                      re.sub('[éë]', 'e', v['nom_vacances'].lower())) 
                            for k, v in school_holidays.items() if v['vacances_zone_c']}
    X.loc[:, "school_holiday_name"] = X["date"].map(school_holidays_name)

    # add school holidays
    school_holidays_bool = [k for k,v in school_holidays.items() if v['vacances_zone_c']]
    X.loc[:, "school_holiday"] = X["date"].isin(school_holidays_bool)

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [6]:
# Build the season / holiday features from the timestamps and preview the first rows.
additional_dates = _additional_date_variables(data[["date"]])
additional_dates.head()

Unnamed: 0,season,public_holiday,school_holiday_name,school_holiday
48321,autumn,False,,False
48324,autumn,False,,False
48327,autumn,False,,False
48330,autumn,False,,False
48333,autumn,False,,False


In [10]:
# Put the raw columns and both derived feature frames side by side
# (the derived frames were built from `data`, so the indexes line up).
full_data = pd.concat((data, encoded_dates, additional_dates), axis="columns")

In [15]:
# Column names, non-null counts and dtypes of the combined frame.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455163 entries, 48321 to 928462
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 455163 non-null  category      
 1   counter_name               455163 non-null  category      
 2   site_id                    455163 non-null  int64         
 3   site_name                  455163 non-null  category      
 4   bike_count                 455163 non-null  float64       
 5   date                       455163 non-null  datetime64[ns]
 6   counter_installation_date  455163 non-null  datetime64[ns]
 7   counter_technical_id       455163 non-null  category      
 8   latitude                   455163 non-null  float64       
 9   longitude                  455163 non-null  float64       
 10  log_bike_count             455163 non-null  float64       
 11  year                       455163 non-null  int6

In [14]:
# Pairwise correlation of the numeric/boolean columns only. The frame also
# holds category, datetime and string columns; restricting explicitly avoids
# the numeric_only deprecation warning and the error raised by pandas >= 2.0.
full_data.corr(numeric_only=True)

  full_data.corr()


Unnamed: 0,site_id,bike_count,latitude,longitude,log_bike_count,year,month,day,weekday,hour,public_holiday,school_holiday
site_id,1.0,-0.004168,-0.127437,-0.18588,0.01889,0.088022,-0.069248,0.005392,0.000962,-2.4e-05,0.000696,0.003605
bike_count,-0.004168,1.0,0.13003,0.015162,0.738974,-0.025228,0.027991,-0.008853,-0.076188,0.128626,-0.022529,-0.070846
latitude,-0.127437,0.13003,1.0,0.153146,0.108289,-0.010695,0.008137,-0.000164,-0.000148,-1.5e-05,-0.000113,-0.000538
longitude,-0.18588,0.015162,0.153146,1.0,0.044736,-0.013573,0.009238,0.00176,-0.00027,-1.8e-05,-0.000259,-0.001089
log_bike_count,0.01889,0.738974,0.108289,0.044736,1.0,-0.019968,0.048162,-0.012793,-0.056949,0.268903,-0.040998,-0.128896
year,0.088022,-0.025228,-0.010695,-0.013573,-0.019968,1.0,-0.855011,-0.031391,0.009201,-0.00053,0.003997,0.019296
month,-0.069248,0.027991,0.008137,0.009238,0.048162,-0.855011,1.0,-0.000195,-0.01166,0.000325,0.001684,0.008122
day,0.005392,-0.008853,-0.000164,0.00176,-0.012793,-0.031391,-0.000195,1.0,0.015379,-6e-05,-0.020085,0.042054
weekday,0.000962,-0.076188,-0.000148,-0.00027,-0.056949,0.009201,-0.01166,0.015379,1.0,7.9e-05,0.00163,0.010972
hour,-2.4e-05,0.128626,-1.5e-05,-1.8e-05,0.268903,-0.00053,0.000325,-6e-05,7.9e-05,1.0,-0.058126,-0.187804
