In [1]:
!pip install skrub -U
!pip install holidays -U
!pip install jours_feries_france -U
#!pip freeze



In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import holidays
import utils
from skrub import TableVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error

## Import the data

In [3]:
# Bike-counter readings, indexed by their timestamp.
train_data = pd.read_parquet(Path("data") / "train.parquet").set_index("date")

# External table: parse the timestamp column, then use it as the index.
external_data = pd.read_csv(Path("data") / "external_data.csv")
external_data = external_data.assign(
    date=pd.to_datetime(external_data["date"])
).set_index("date")

## Data Preprocessing


### Removing duplicate rows
Only external_data has duplicated rows.

In [4]:
# Remove duplicated rows and keep the first occurrence.
# `drop_duplicates` compares column values only and ignores the index, and
# "date" is the index at this point. The timestamp is therefore moved back into
# the columns for the comparison, so that two observations with identical
# measurements at different times are not treated as duplicates.
external_data = (
    external_data.reset_index()
    .drop_duplicates(keep="first")
    .set_index("date")
)

### Handling Missing Values
Only external_data has missing values.

#### Option 1: Drop features with >=50% missing values

In [5]:
# Columns whose share of missing values reaches the threshold are removed.
threshold = 0.5
missing_share = external_data.isna().mean()
bool_drop = missing_share.ge(threshold)

# Column labels on each side of the threshold.
dropped_feat_ext = external_data.columns[bool_drop]
selected_feat_ext = external_data.columns[~bool_drop]

external_data = external_data.drop(columns=dropped_feat_ext)

print(f"Features dropped: {len(dropped_feat_ext)}")
print(dropped_feat_ext)

Features dropped: 20
Index(['niv_bar', 'geop', 'tn12', 'tn24', 'tx12', 'tx24', 'tminsol', 'sw',
       'tw', 'phenspe1', 'phenspe2', 'phenspe3', 'phenspe4', 'ctype2',
       'nnuage3', 'ctype3', 'hnuage3', 'nnuage4', 'ctype4', 'hnuage4'],
      dtype='object')


### Handling Outliers
3 Options:
1. Manual filter
2. Using quantiles
3. Isolation Forest

In [6]:
# Preview of the numeric (np.number) columns of the bike-counter table.
df_numerical = train_data.select_dtypes(include=[np.number])
df_numerical.head()

Unnamed: 0_level_0,site_id,bike_count,latitude,longitude,log_bike_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-01 02:00:00,100007049,0.0,48.846028,2.375429,0.0
2020-09-01 03:00:00,100007049,1.0,48.846028,2.375429,0.693147
2020-09-01 04:00:00,100007049,0.0,48.846028,2.375429,0.0
2020-09-01 15:00:00,100007049,4.0,48.846028,2.375429,1.609438
2020-09-01 18:00:00,100007049,9.0,48.846028,2.375429,2.302585


In [7]:
# Preview of the numeric (np.number) columns of the external table.
# `df_numerical` is rebound here, replacing any value it held before.
df_numerical = external_data.select_dtypes(include=[np.number])
df_numerical.head()

Unnamed: 0_level_0,numer_sta,pmer,tend,cod_tend,dd,ff,t,td,u,vv,...,rr1,rr3,rr6,rr12,rr24,nnuage1,ctype1,hnuage1,nnuage2,hnuage2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:00:00,7149,100810,80,1,270,1.8,272.75,272.15,96,990,...,0.0,0.0,0.0,0.0,2.0,1.0,6.0,600.0,,
2021-01-01 03:00:00,7149,100920,110,3,300,1.7,271.25,270.95,98,210,...,0.0,0.0,0.0,0.0,1.2,1.0,6.0,1500.0,2.0,3000.0
2021-01-01 06:00:00,7149,100950,30,3,290,2.6,271.95,271.65,98,3660,...,0.0,0.0,0.0,0.0,1.0,3.0,6.0,480.0,4.0,2000.0
2021-01-01 09:00:00,7149,101100,150,2,280,1.7,272.45,272.05,97,3500,...,0.0,0.2,0.2,0.2,0.2,1.0,6.0,1740.0,3.0,2800.0
2021-01-01 12:00:00,7149,101110,30,0,50,1.0,276.95,274.15,82,8000,...,0.0,0.0,0.2,0.2,0.2,1.0,8.0,330.0,4.0,570.0


## Merge datasets based on Date

In [9]:
# Upsample the external table to an hourly grid.
# Alternatives that were considered: forward fill (.ffill()), backward fill
# (.bfill()) and interpolate(method="linear"). Time-weighted interpolation is
# the variant kept here.
# NOTE(review): every column is interpolated, including ones that look like
# category codes (the merged preview shows values such as ctype1 = 7.166667);
# confirm whether those should be forward-filled instead.
external_data_resampled = (
    external_data.resample("h")
    .interpolate(method="time")
    .reset_index()  # turn the "date" index back into a regular column
)

In [10]:
# Attach the hourly external features to each counter reading. With an inner
# join, readings whose timestamp has no match on the right are discarded.
train_merged = train_data.merge(external_data_resampled, how="inner", on="date")
train_merged

Unnamed: 0,date,counter_id,counter_name,site_id,site_name,bike_count,counter_installation_date,coordinates,counter_technical_id,latitude,...,rr1,rr3,rr6,rr12,rr24,nnuage1,ctype1,hnuage1,nnuage2,hnuage2
0,2020-09-01 02:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
1,2020-09-01 03:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
2,2020-09-01 04:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,,,,,
3,2020-09-01 15:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,2.000000,8.000000,1700.000000,5.000000,2300.000000
4,2020-09-01 18:00:00,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,0.0,0.0,0.0,0.000000,0.0,1.000000,8.000000,1700.000000,7.000000,2400.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,2021-09-09 06:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,-0.1,1.200000,1.2,7.000000,7.000000,120.000000,6.000000,2520.000000
496823,2021-09-09 10:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.0,0.766667,1.2,5.666667,7.166667,520.000000,6.333333,1610.000000
496824,2021-09-09 15:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.0,0.000000,1.2,3.000000,8.000000,1380.000000,4.000000,1560.000000
496825,2021-09-09 22:00:00,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,0.0,0.0,0.4,0.600000,1.4,4.000000,3.666667,3713.333333,4.000000,6006.666667


## Feature Selection

In [13]:
# Columns of train.parquet kept as candidate features.
selected_columns = [
    "counter_technical_id", "latitude", "longitude",
    "counter_id", "site_id", "date",
]  # fmt: skip

# Columns set aside.
dropped_columns = [
    "counter_name",
    "site_name",
    "bike_count",  # raw count; the modelling cells use log_bike_count as target
    "counter_installation_date",
    "coordinates",  # string form of latitude/longitude in the merged preview
]

## Feature Engineering

#### Encode dates

In [11]:
def _encode_dates(X):
    """Expand the ``date`` column of ``X`` into calendar features.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a datetime-like ``date`` column (it is read through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` without ``date`` and with these integer columns added:
        ``year``, ``month``, ``day``, ``weekday`` (1 = Monday ... 7 = Sunday),
        ``hour``, ``is_weekend`` (1 when ``weekday`` > 5) and ``is_holiday``
        (1 when the calendar day is contained in ``holidays.FR()``).
    """
    X = X.copy()  # never mutate the caller's frame

    stamps = X["date"].dt

    # Plain calendar components of the timestamp.
    X["year"] = stamps.year
    X["month"] = stamps.month
    X["day"] = stamps.day
    X["weekday"] = stamps.weekday + 1  # pandas counts Monday as 0
    X["hour"] = stamps.hour

    X["is_weekend"] = (X["weekday"] > 5).astype(int)

    # Build the French holiday calendar once and test every distinct calendar
    # day once, instead of constructing a new calendar object for each row.
    fr_holidays = holidays.FR()
    days = stamps.date
    holiday_flag = {day: int(day in fr_holidays) for day in days.unique()}
    X["is_holiday"] = days.map(holiday_flag).astype(int)

    return X.drop(columns=["date"])

In [12]:
# _encode_dates drops the "date" column, so encoding an already-encoded frame
# raises a KeyError. Only encode while "date" is still present, which keeps
# this cell safe to re-run.
if "date" in train_merged.columns:
    train_merged = _encode_dates(train_merged)

### Rescale

### Encode 

#### Encode Weather

## Selecting features with ANOVA

# Modelling
1. SGD Regressor (>100k samples)
2. Lasso / ElasticNet (<100k samples + few features should be important)
3. RidgeRegression / SVR(kernel='linear') (<100k samples + many features should be important)

Data for modelling

In [13]:
train_columns = [
    # --- counter identity and location ---
    "counter_id",
    # "site_id",               (left out)
    # "counter_technical_id",  (left out)
    "latitude",
    "longitude",
    # --- calendar features produced by the date encoding ---
    "is_holiday",
    "year",
    "month",
    "day",
    "weekday",
    "hour",
    "is_weekend",
    # --- external weather columns ---
    "ff",  # mean wind speed over 10 min
    "t",  # temperature (K)
    # "u"  -> humidity; correlated with present weather (ww, w1, w2) and ground state (etat_sol)
    "vv",  # horizontal visibility
    "ww",  # present weather
    # "w1" -> past weather 1; correlated with w2 and ww, so ww is kept
    # "w2" -> past weather 2; correlated with w1 and ww, so ww is kept
    "n",  # total cloud cover
    "etat_sol",  # ground state
    "ht_neige",  # total snow depth
    "rr1",  # precipitation over 1 h
    # "rr3"  -> precipitation over 3 h; correlated with the other rr*, only rr1 is kept
    # "rr6"  -> precipitation over 6 h; correlated with the other rr*, only rr1 is kept
    # "rr12" -> precipitation over 12 h; correlated with the other rr*, only rr1 is kept
    # "rr24" -> precipitation over 24 h; less correlated with rr1, but its information is in etat_sol
]

In [14]:
# Target (log-scale count) and feature matrix restricted to train_columns.
y = train_merged["log_bike_count"]
X = train_merged.loc[:, train_columns]

Train-test split

In [15]:
# Random 80/20 hold-out split with a fixed seed.
# NOTE(review): rows are shuffled irrespective of time, so hold-out hours are
# interleaved with training hours; a chronological split may give a more honest
# estimate for the final test set. Confirm.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

Model with Vectorized Table

In [16]:
# TableVectorizer turns the mixed-type frame into numeric features for the
# gradient-boosting regressor. The regressor uses randomness internally
# (binning subsample, validation split for early stopping), so it is seeded to
# make the fit, and the scores printed below, reproducible across runs.
model = make_pipeline(
    TableVectorizer(),
    HistGradientBoostingRegressor(random_state=42),
)

model.fit(X_train, y_train)

In [17]:
# RMSE on the training part and on the hold-out part, shown with two decimals.
train_rmse = root_mean_squared_error(y_train, model.predict(X_train))
valid_rmse = root_mean_squared_error(y_test, model.predict(X_test))
print(f"Train set, RMSE={train_rmse:.2f}")
print(f"Valid set, RMSE={valid_rmse:.2f}")

Train set, RMSE=0.47
Valid set, RMSE=0.48


In [18]:
# Hold-out RMSE at full precision.
rmse = root_mean_squared_error(y_test, model.predict(X_test))
print(f"Root Mean Squared Error: {rmse}")

test_data = pd.read_parquet(Path("data") / "final_test.parquet")

# Left join: every test row must survive so that the submission has exactly one
# prediction per test row, in the original order. An inner join would silently
# drop rows whose timestamp has no match on the right. `validate` guards
# against duplicated timestamps on the right multiplying rows.
test_data_merged = test_data.merge(
    external_data_resampled, on="date", how="left", validate="many_to_one"
)
assert len(test_data_merged) == len(test_data), "merge changed the number of test rows"

# Same date encoding and column selection as the training features.
test_data_merged = _encode_dates(test_data_merged)[X.columns]

Root Mean Squared Error: 0.4761325190197334


In [20]:
# Predict on the prepared test rows and write a CSV with the row position as
# "Id" and the prediction as "log_bike_count".
submission = model.predict(test_data_merged[X.columns])
print(submission.shape)
submission_frame = pd.DataFrame({"log_bike_count": submission}).rename_axis("Id")
submission_frame.to_csv("submission5_081224.csv")

(51440,)
