In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn


In [2]:
# Load the training set from the project's data directory and preview it.
train_path = "data/train.parquet"
data = pd.read_parquet(train_path)
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [3]:
# Ensure 'date' is a datetime column, then derive calendar features from it
# for the training data.
data['date'] = pd.to_datetime(data['date'])

# New column name -> attribute of the `.dt` accessor that provides its values.
# Insertion order is the order in which the columns are appended to `data`.
date_parts = {
    'hour': 'hour',
    'day_of_week': 'dayofweek',
    'day': 'day',
    'month': 'month',
    'year': 'year',
}
for column_name, dt_attribute in date_parts.items():
    data[column_name] = getattr(data['date'].dt, dt_attribute)


In [4]:
# Count the missing entries in every column of the training frame.
missing_values = data.isna().sum()

# Show a preview of the frame together with the per-column null counts.
# (A tuple is displayed as plain text rather than as a rich table.)
(data.head(), missing_values)

(                counter_id              counter_name    site_id  \
 48321  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48324  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48327  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48330  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48333  100007049-102007049  28 boulevard Diderot E-O  100007049   
 
                   site_name  bike_count                date  \
 48321  28 boulevard Diderot         0.0 2020-09-01 02:00:00   
 48324  28 boulevard Diderot         1.0 2020-09-01 03:00:00   
 48327  28 boulevard Diderot         0.0 2020-09-01 04:00:00   
 48330  28 boulevard Diderot         4.0 2020-09-01 15:00:00   
 48333  28 boulevard Diderot         9.0 2020-09-01 18:00:00   
 
       counter_installation_date         coordinates counter_technical_id  \
 48321                2013-01-18  48.846028,2.375429          Y2H15027244   
 48324                2013-01-18  48.846028,2.3754

In [5]:
# Load the final test set (no target columns) and preview it.
test_path = "data/final_test.parquet"
test_data = pd.read_parquet(test_path)
test_data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [6]:
# Parse the test 'date' column once and reuse the parsed series to build the
# same calendar features as for the training data.
test_dates = pd.to_datetime(test_data['date'])
test_data['date'] = test_dates

test_data['hour'] = test_dates.dt.hour
test_data['day_of_week'] = test_dates.dt.dayofweek
test_data['day'] = test_dates.dt.day
test_data['month'] = test_dates.dt.month
test_data['year'] = test_dates.dt.year

# Count the missing entries in every column of the test frame.
test_missing_values = test_data.isna().sum()

# Show a preview of the transformed test frame with the null counts.
(test_data.head(), test_missing_values)

(            counter_id              counter_name    site_id  \
 0  100007049-102007049  28 boulevard Diderot E-O  100007049   
 1  100007049-102007049  28 boulevard Diderot E-O  100007049   
 2  100007049-102007049  28 boulevard Diderot E-O  100007049   
 3  100007049-102007049  28 boulevard Diderot E-O  100007049   
 4  100007049-102007049  28 boulevard Diderot E-O  100007049   
 
               site_name                date counter_installation_date  \
 0  28 boulevard Diderot 2021-09-10 01:00:00                2013-01-18   
 1  28 boulevard Diderot 2021-09-10 13:00:00                2013-01-18   
 2  28 boulevard Diderot 2021-09-10 17:00:00                2013-01-18   
 3  28 boulevard Diderot 2021-09-10 19:00:00                2013-01-18   
 4  28 boulevard Diderot 2021-09-10 22:00:00                2013-01-18   
 
           coordinates counter_technical_id   latitude  longitude  hour  \
 0  48.846028,2.375429          Y2H15027244  48.846028   2.375429     1   
 1  48.846028,2.37

In [7]:
from sklearn.model_selection import train_test_split



# Model inputs (column order matters: it is the order seen by the pipeline)
# and the regression target.
features = [
    'counter_name', 'site_name',
    'hour', 'day_of_week', 'month', 'year',
    'latitude', 'longitude',
    'day',
]
target = 'log_bike_count'

# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
# NOTE(review): this is a random shuffle split of hourly observations, so the
# validation score may be optimistic compared with a time-based split - confirm
# this is the intended evaluation protocol.
feature_frame = data[features]
target_series = data[target]
X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces, in the order returned by the split.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((397461, 9), (99366, 9), (397461,), (99366,))

In [8]:
from sklearn.metrics import mean_squared_error

In [21]:
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# Creating a pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from skrub import TableVectorizer
from xgboost import XGBRegressor

# Model: skrub's TableVectorizer (default settings) feeding an XGBoost regressor.
#
# Earlier experiments that are no longer part of this cell:
#   * a ColumnTransformer that one-hot encoded the date parts and the
#     categorical columns ("counter_name", "site_name");
#   * a HistGradientBoostingRegressor(random_state=42) baseline scored with RMSE.

# Hyper-parameters are fixed by hand here.
xgb_regressor = XGBRegressor(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.1,
    random_state=42,
    colsample_bytree=0.8,
    subsample=0.8,
)

pipeline = Pipeline(
    steps=[
        ("vectorizer", TableVectorizer()),
        ("regressor", xgb_regressor),
    ]
)

# Fit the vectorizer and the regressor on the training split.
pipeline.fit(X_train, y_train)

In [22]:
# Validation RMSE of the fitted skrub + XGBoost pipeline.
# The `_hgb` suffix is a leftover name; the predictions come from `pipeline`.
y_val_pred_hgb = pipeline.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred_hgb)
rmse_skrub = np.sqrt(val_mse)
rmse_skrub

np.float64(0.3555559263198925)

In [23]:
# Predict the target for the test set and store it on the test frame.
test_data_predictions = pipeline.predict(test_data[features])
test_data['log_bike_count'] = test_data_predictions

# Submission table: the test frame's index as 'Id', then the predicted value.
predictions_output = (
    test_data[['log_bike_count']]
    .assign(Id=test_data.index)
    .loc[:, ['Id', 'log_bike_count']]
)

# Write the submission without the DataFrame index (it is already in 'Id').
output_file_path = 'data/Test_Predictions_xgb_skrub2.csv'
predictions_output.to_csv(output_file_path, index=False)

In [None]:
# Hyper-parameter search for the skrub + XGBoost pipeline.
#
# The `regressor__` prefix routes each parameter to the "regressor" step.
# Grid size: 3 * 3 * 3 * 2 * 2 = 108 candidates, i.e. 540 fits with 5 folds.
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0],
}

# Use a dedicated name for the search template. Rebinding `pipeline` here would
# replace the already-fitted model with an unfitted one (GridSearchCV fits
# clones, never the estimator it is given), breaking any later re-run of the
# validation / prediction cells.
search_pipeline = Pipeline(
    steps=[
        ("vectorizer", TableVectorizer()),
        ("regressor", XGBRegressor(random_state=42)),
    ]
)

# Exhaustive search scored with negative MSE (higher is better).
grid_search = GridSearchCV(
    estimator=search_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only; the validation split stays unseen.
grid_search.fit(X_train, y_train)

# Best parameter combination and its score. `best_score_` is the mean
# cross-validated negative MSE, so negate it before taking the square root.
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

best_params, best_rmse

Fitting 5 folds for each of 108 candidates, totalling 540 fits
