In [92]:
!pip install skrub -U
!pip install holidays -U
!pip install jours_feries_france -U
#!pip freeze

Collecting jours_feries_france
  Downloading jours_feries_france-0.7.0.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jours_feries_france
  Building wheel for jours_feries_france (setup.py) ... [?25ldone
[?25h  Created wheel for jours_feries_france: filename=jours_feries_france-0.7.0-py3-none-any.whl size=4979 sha256=c70b6dec8638d52e1b4af0ce5ab0f07f173233adec27a4867afd53fe28fe8145
  Stored in directory: /Users/rafaelvalente/Library/Caches/pip/wheels/6c/91/19/bc9f25ded8d3f5b48cfa3a01cfce9cf767fc027988e4eee9fb
Successfully built jours_feries_france
Installing collected packages: jours_feries_france
Successfully installed jours_feries_france-0.7.0


In [93]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import holidays
import utils
from skrub import TableVectorizer
from jours_feries_france import JoursFeries

## Import the data

In [94]:
# Training table (contains the bike counts), indexed by timestamp.
train_data = pd.read_parquet(Path("data") / "train.parquet").set_index("date")

# External data: parse the timestamp strings, then index by them as well.
external_data = pd.read_csv(Path("data") / "external_data.csv")
external_data = external_data.assign(
    date=pd.to_datetime(external_data["date"])
).set_index("date")

## Data Preprocessing


### Removing duplicate rows
Only external_data has duplicated rows.

In [95]:
# Remove duplicates and keep the first occurrence.
# `drop_duplicates` ignores the index, and `date` is the index here. Move it
# back into the columns first so the timestamp is part of the duplicate key;
# otherwise two different timestamps carrying identical measurements would be
# treated as duplicates of each other.
external_data = (
    external_data.reset_index().drop_duplicates(keep="first").set_index("date")
)

### Handling Missing Values
Only external_data has missing values.

#### Option 1: Drop features with >=50% missing values

In [96]:
# A feature is discarded when its share of missing values reaches the threshold.
threshold = 0.5
bool_drop = external_data.isna().mean().ge(threshold)

# Keep both name lists around: what was removed and what survived.
dropped_feat_ext = external_data.columns[bool_drop]
selected_feat_ext = external_data.columns[~bool_drop]

external_data = external_data.drop(columns=dropped_feat_ext)

print(f"Features dropped: {len(dropped_feat_ext)}")

Features dropped: 20


Instead of dropping features, we could also impute the missing values: with the mean, median, or mode of the column, or with a model that predicts them.

## Merge datasets based on Date

In [105]:
# Upsample the external data to an hourly grid.
# The new hourly rows are filled by linear interpolation between observations;
# `.ffill()` (forward fill) or `.bfill()` (backward fill) on the resampler are
# the two alternatives.
external_data_resampled = (
    external_data.resample("h")
    .interpolate(method="linear")
    .reset_index()  # turn the `date` index back into a regular column
)

In [110]:
# Inner join on the timestamp: only rows whose `date` exists in both tables remain.
train_merged = train_data.merge(external_data_resampled, how="inner", on="date")
train_merged

Unnamed: 0,date,counter_id,counter_name,site_id,site_name,bike_count,counter_installation_date,coordinates,counter_technical_id,latitude,...,rr1,rr3,rr6,rr12,rr24,nnuage1,ctype1,hnuage1,nnuage2,hnuage2
0,2020-09-01 02:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
1,2020-09-01 03:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
2,2020-09-01 04:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
3,2020-09-01 15:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,2.000000,8.000000,1700.000000,5.000000,2300.000000
4,2020-09-01 18:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,1.000000,8.000000,1700.000000,7.000000,2400.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,2021-09-09 06:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,-0.1,1.200000,1.2,7.000000,7.000000,120.000000,6.000000,2520.000000
496823,2021-09-09 10:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.0,0.766667,1.2,5.666667,7.166667,520.000000,6.333333,1610.000000
496824,2021-09-09 15:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.0,0.000000,1.2,3.000000,8.000000,1380.000000,4.000000,1560.000000
496825,2021-09-09 22:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.4,0.600000,1.4,4.000000,3.666667,3713.333333,4.000000,6006.666667


## Feature Selection

In [109]:
# Features of train.parquet that are kept.
selected_columns = [
    "counter_technical_id", "latitude", "longitude",
    "counter_id", "site_id", "date",
]

# Features of train.parquet that are left out.
dropped_columns = [
    "counter_name", "site_name", "bike_count",
    "counter_installation_date", "coordinates",
]

## Feature Engineering

#### Encode dates

In [161]:
def _encode_dates(X):
    """Expand the ``date`` column of ``X`` into calendar features.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a datetime-like ``date`` column (it is accessed through
        the ``.dt`` accessor).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` without the ``date`` column and with these columns
        added: ``year``, ``month``, ``day``, ``weekday`` (1 = Monday ...
        7 = Sunday), ``hour``, ``is_weekend`` (1 when ``weekday`` > 5) and
        ``is_holiday`` (1 when the calendar day is in ``holidays.FR()``).
    """
    X = X.copy()  # work on a copy so the caller's frame is left untouched
    dates = X["date"]

    # Plain calendar components of the timestamp.
    X["year"] = dates.dt.year
    X["month"] = dates.dt.month
    X["day"] = dates.dt.day
    X["weekday"] = dates.dt.weekday + 1  # shift pandas' 0-6 to 1-7
    X["hour"] = dates.dt.hour

    X["is_weekend"] = (X["weekday"] > 5).astype(int)

    # Build the French holiday calendar once and query it once per distinct
    # calendar day. Creating a new `holidays.FR()` for every row is very slow
    # on frames with hundreds of thousands of rows.
    fr_holidays = holidays.FR()
    calendar_days = dates.dt.date
    holiday_flags = {day: int(day in fr_holidays) for day in calendar_days.unique()}
    X["is_holiday"] = calendar_days.map(holiday_flags).astype(int)

    return X.drop(columns=["date"])

In [162]:
# `_encode_dates` drops the `date` column, so guard the call: re-running this
# cell on an already-encoded frame would otherwise raise a KeyError.
if "date" in train_merged.columns:
    train_merged = _encode_dates(train_merged)

### Rescale

### Encode 

#### Encode Weather

## Selecting features with ANOVA

# Modelling

Data for modelling

In [188]:
# Columns fed to the model, in the order the model sees them.
train_columns = [
    # Columns coming from the training table
    "counter_id", "site_id", "counter_technical_id", "latitude", "longitude",
    # Columns coming from the external data
    "numer_sta", "pmer", "tend", "cod_tend", "dd", "ff", "t", "td", "u", "vv",
    "ww", "w1", "w2", "n", "nbas", "hbas", "cl", "cm", "ch", "pres",
    "tend24", "raf10", "rafper", "per", "etat_sol", "ht_neige", "ssfrai",
    "perssfrai", "rr1", "rr3", "rr6", "rr12", "rr24",
    "nnuage1", "ctype1", "hnuage1", "nnuage2", "hnuage2",
    # Calendar features produced by `_encode_dates`
    "is_holiday", "year", "month", "day", "weekday", "hour", "is_weekend",
]

In [189]:
# Feature matrix (the selected columns) and target (log of the bike count).
X = train_merged.loc[:, train_columns]
y = train_merged.loc[:, "log_bike_count"]

Train-test split

In [190]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
# NOTE(review): the split is random and ignores the timestamps — confirm that a
# chronological hold-out is not what is wanted for this data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

Model with Vectorized Table

In [191]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

# TableVectorizer prepares the mixed-type columns; the gradient-boosting
# regressor is fitted on its output.
# `random_state` is fixed because, with more than 10,000 training samples, the
# regressor enables early stopping by default and carves out its internal
# validation set at random. Without a seed the fitted model and the scores
# reported in later cells change from run to run.
model = make_pipeline(
    TableVectorizer(),
    HistGradientBoostingRegressor(random_state=42),
)

model.fit(X_train, y_train)

In [192]:
from sklearn.metrics import root_mean_squared_error

# Predict once per split, then report the error on each.
train_predictions = model.predict(X_train)
valid_predictions = model.predict(X_test)

print(f"Train set, RMSE={root_mean_squared_error(y_train, train_predictions):.2f}")
print(f"Valid set, RMSE={root_mean_squared_error(y_test, valid_predictions):.2f}")

Train set, RMSE=0.47
Valid set, RMSE=0.48


In [197]:
rmse = root_mean_squared_error(y_test, model.predict(X_test))
print(f"Root Mean Squared Error: {rmse}")

test_data = pd.read_parquet(Path("data") / "final_test.parquet")
# Left join so that every test row is kept, in its original order: the
# submission uses the row position as `Id`, and an inner join would silently
# drop rows whose timestamp is missing from the external data.
# `validate` raises if the external data holds the same timestamp twice, which
# would otherwise multiply test rows.
test_data_merged = test_data.merge(
    external_data_resampled, on="date", how="left", validate="many_to_one"
)
# Apply the same date encoding and the same column order as the training features.
test_data_merged = _encode_dates(test_data_merged)[X.columns]

Root Mean Squared Error: 0.47590429723419136


In [198]:
# Print the column dtypes and non-null counts of the prepared test features.
test_data_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51440 entries, 0 to 51439
Data columns (total 50 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   counter_id            51440 non-null  category
 1   site_id               51440 non-null  int64   
 2   counter_technical_id  51440 non-null  category
 3   latitude              51440 non-null  float64 
 4   longitude             51440 non-null  float64 
 5   numer_sta             51440 non-null  float64 
 6   pmer                  51440 non-null  float64 
 7   tend                  51440 non-null  float64 
 8   cod_tend              51440 non-null  float64 
 9   dd                    51440 non-null  float64 
 10  ff                    51440 non-null  float64 
 11  t                     51440 non-null  float64 
 12  td                    51440 non-null  float64 
 13  u                     51440 non-null  float64 
 14  vv                    51440 non-null  float64 
 15  ww

In [199]:
# Predict on the prepared test features, using the training column order.
submission = model.predict(test_data_merged[X.columns])
print(submission.shape)

# One `log_bike_count` column; the positional index is written out as `Id`.
pd.DataFrame({"log_bike_count": submission}).rename_axis("Id").to_csv(
    "submission2_081224.csv"
)

(51440,)
