In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
# Load the training set and preview its first five rows.
data = pd.read_parquet(path="data/train.parquet")
data.head(n=5)

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [3]:
# Print the column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[us]
 6   counter_installation_date  496827 non-null  datetime64[us]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  float64  

In [4]:
# Number of distinct values per column (column-wise is the default axis).
data.nunique()

counter_id                     56
counter_name                   56
site_id                        30
site_name                      30
bike_count                    998
date                         8974
counter_installation_date      22
coordinates                    30
counter_technical_id           30
latitude                       30
longitude                      30
log_bike_count                998
dtype: int64

In [5]:
# Ten busiest counters by total bike count.
# Both grouping keys are categorical, so observed=True restricts the result to
# the (site, counter) pairs that actually occur instead of the full category
# cross-product, and silences the pandas FutureWarning about the changing
# default of `observed`. The top-10 rows are unaffected: pairs that never
# occur would only contribute zero sums.
(
    data.groupby(["site_name", "counter_name"], observed=True)["bike_count"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame()
)

  data.groupby(["site_name", "counter_name"])["bike_count"].sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,bike_count
site_name,counter_name,Unnamed: 2_level_1
Totem 73 boulevard de Sébastopol,Totem 73 boulevard de Sébastopol S-N,1992227.0
Totem 64 Rue de Rivoli,Totem 64 Rue de Rivoli O-E,1551730.0
Totem 73 boulevard de Sébastopol,Totem 73 boulevard de Sébastopol N-S,1497912.0
67 boulevard Voltaire SE-NO,67 boulevard Voltaire SE-NO,1130565.0
Totem 64 Rue de Rivoli,Totem 64 Rue de Rivoli E-O,1015875.0
27 quai de la Tournelle,27 quai de la Tournelle SE-NO,984914.0
Quai d'Orsay,Quai d'Orsay E-O,948357.0
Totem Cours la Reine,Totem Cours la Reine O-E,894937.0
Face au 48 quai de la marne,Face au 48 quai de la marne SO-NE,884907.0
Face au 48 quai de la marne,Face au 48 quai de la marne NE-SO,833657.0


In [6]:
# Make sure 'date' is a datetime column, then derive calendar features from it
# for the training data.
data["date"] = pd.to_datetime(data["date"])

# New column name -> attribute of the `.dt` accessor it is read from.
for column, dt_attr in {
    "hour": "hour",
    "day_of_week": "dayofweek",
    "day": "day",
    "month": "month",
    "year": "year",
}.items():
    data[column] = getattr(data["date"].dt, dt_attr)


In [7]:
# Count the null entries in each column of the training frame.
missing_values = data.isna().sum()

# Show a preview of the augmented frame together with the null counts.
(data.head(), missing_values)

(                counter_id              counter_name    site_id  \
 48321  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48324  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48327  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48330  100007049-102007049  28 boulevard Diderot E-O  100007049   
 48333  100007049-102007049  28 boulevard Diderot E-O  100007049   
 
                   site_name  bike_count                date  \
 48321  28 boulevard Diderot         0.0 2020-09-01 02:00:00   
 48324  28 boulevard Diderot         1.0 2020-09-01 03:00:00   
 48327  28 boulevard Diderot         0.0 2020-09-01 04:00:00   
 48330  28 boulevard Diderot         4.0 2020-09-01 15:00:00   
 48333  28 boulevard Diderot         9.0 2020-09-01 18:00:00   
 
       counter_installation_date         coordinates counter_technical_id  \
 48321                2013-01-18  48.846028,2.375429          Y2H15027244   
 48324                2013-01-18  48.846028,2.3754

In [8]:
# Load the final test set and preview its first five rows.
test_data = pd.read_parquet(path="data/final_test.parquet")
test_data.head(n=5)

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [9]:
# Make sure 'date' is a datetime column, then derive the same calendar
# features for the test data.
test_data["date"] = pd.to_datetime(test_data["date"])

# New column name -> attribute of the `.dt` accessor it is read from.
for column, dt_attr in {
    "hour": "hour",
    "day_of_week": "dayofweek",
    "day": "day",
    "month": "month",
    "year": "year",
}.items():
    test_data[column] = getattr(test_data["date"].dt, dt_attr)

# Count the null entries in each column of the test frame.
test_missing_values = test_data.isna().sum()

# Show a preview of the augmented test frame together with the null counts.
(test_data.head(), test_missing_values)

(            counter_id              counter_name    site_id  \
 0  100007049-102007049  28 boulevard Diderot E-O  100007049   
 1  100007049-102007049  28 boulevard Diderot E-O  100007049   
 2  100007049-102007049  28 boulevard Diderot E-O  100007049   
 3  100007049-102007049  28 boulevard Diderot E-O  100007049   
 4  100007049-102007049  28 boulevard Diderot E-O  100007049   
 
               site_name                date counter_installation_date  \
 0  28 boulevard Diderot 2021-09-10 01:00:00                2013-01-18   
 1  28 boulevard Diderot 2021-09-10 13:00:00                2013-01-18   
 2  28 boulevard Diderot 2021-09-10 17:00:00                2013-01-18   
 3  28 boulevard Diderot 2021-09-10 19:00:00                2013-01-18   
 4  28 boulevard Diderot 2021-09-10 22:00:00                2013-01-18   
 
           coordinates counter_technical_id   latitude  longitude  hour  \
 0  48.846028,2.375429          Y2H15027244  48.846028   2.375429     1   
 1  48.846028,2.37

In [10]:
from sklearn.model_selection import train_test_split



# Model inputs and the regression target.
features = [
    "counter_name",
    "site_name",
    "hour",
    "day_of_week",
    "month",
    "year",
    "latitude",
    "longitude",
    "day",
]
target = "log_bike_count"

# Random 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    data[features],
    data[target],
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((397461, 9), (99366, 9), (397461,), (99366,))

In [11]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# # Initialize and train the linear regression model

# linear_model = LinearRegression()

# linear_model.fit(X_train, y_train)



# # Make predictions on the validation set

# y_val_pred = linear_model.predict(X_val)



# # Calculate RMSE for the linear regression model

# rmse_linear = np.sqrt(mean_squared_error(y_val, y_val_pred))



# rmse_linear

In [12]:
from sklearn.ensemble import RandomForestRegressor



# # Initialize and train the random forest regressor

# rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# rf_model.fit(X_train, y_train)



# # Make predictions on the validation set

# y_val_pred_rf = rf_model.predict(X_val)



# # Calculate RMSE for the random forest model

# rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))



# rmse_rf

In [13]:
from xgboost import XGBRegressor



# # Initialize and train the XGBoost model

# xgb_model = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)

# xgb_model.fit(X_train, y_train)



# # Make predictions on the validation set

# y_val_pred_xgb = xgb_model.predict(X_val)



# # Calculate RMSE for the XGBoost model

# rmse_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))



# rmse_xgb

In [14]:
# from lightgbm import LGBMRegressor

# # Initialize and train the LightGBM model
# lgbm_model = LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
# lgbm_model.fit(X_train, y_train)

# # Make predictions on the validation set
# y_val_pred_lgbm = lgbm_model.predict(X_val)

# # Calculate RMSE for the LightGBM model
# rmse_lgbm = np.sqrt(mean_squared_error(y_val, y_val_pred_lgbm))

# rmse_lgbm


In [15]:
# from sklearn.tree import DecisionTreeRegressor

# # Initialize and train the decision tree model
# dt_model = DecisionTreeRegressor(max_depth=10, random_state=42)
# dt_model.fit(X_train, y_train)

# # Make predictions on the validation set
# y_val_pred_dt = dt_model.predict(X_val)

# # Calculate RMSE for the decision tree model
# rmse_dt = np.sqrt(mean_squared_error(y_val, y_val_pred_dt))

# rmse_dt


In [16]:
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# # Define the parameter grid for n_estimators and max_depth
# param_grid = {
#     "regressor__n_estimators": [50, 100, 200],  # Number of trees in the forest
#     "regressor__max_depth": [None, 10, 20, 30],  # Depth of the trees
# }

# # Create the pipeline
# pipeline = Pipeline(
#     steps=[
#         ("regressor", RandomForestRegressor(random_state=42)),
#     ]
# )

# # Set up the GridSearchCV with cross-validation
# grid_search = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     cv=5,  # 5-fold cross-validation
#     scoring="neg_mean_squared_error",
#     n_jobs=-1,  # Use all available cores
#     verbose=2,  # More output to monitor progress
# )

# # Fit the GridSearchCV to find the best parameters
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# best_params = grid_search.best_params_
# best_score = -grid_search.best_score_

# print("Best Parameters:", best_params)
# print("Best Cross-Validation RMSE:", np.sqrt(best_score))

In [17]:
# Creating a pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing: the calendar features and both location labels are one-hot
# encoded; handle_unknown="ignore" encodes categories unseen at fit time as
# all zeros instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

date_cols = ["year", "month", "day", "day_of_week", "hour"]

# Only the columns listed in the transformers reach the model:
# remainder="drop" (the ColumnTransformer default, spelled out here) discards
# every other input column.
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ],
    remainder="drop",
)

# Gradient-boosted regressor with a fixed seed.
xgb_model = XGBRegressor(random_state=42)

# Encoding followed by the regressor, exposed as a single estimator.
pipeline = Pipeline([("preprocessor", preprocessor), ("regressor", xgb_model)])

In [21]:
# Hyper-parameter grid for the "regressor" (XGBRegressor) step of the pipeline.
# This is an exhaustive search: 5 * 8 * 6 * 2 * 2 = 960 candidates, each fitted
# once per CV fold (4800 fits in total), so expect a long run time.
param_grid = {
    "regressor__n_estimators": [50, 100, 200, 300, 500],
    "regressor__max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "regressor__learning_rate": [0.001, 0.01, 0.1, 0.20, 0.25, 0.30],
    "regressor__subsample": [0.8, 1.0],
    "regressor__colsample_bytree": [0.8, 1.0],
}

# Cross-validated grid search over the whole pipeline, so the encoders are
# refitted inside every fold.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1,  # use all available cores
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE across folds for the best candidate;
# flip the sign and take the square root to report it as an RMSE.
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

best_params, best_rmse

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


({'regressor__colsample_bytree': 0.8,
  'regressor__learning_rate': 0.1,
  'regressor__max_depth': 15,
  'regressor__n_estimators': 500,
  'regressor__subsample': 0.8},
 np.float64(0.3624106704473188))

In [22]:
# test_data_predictions = lgbm_model.predict(test_data[features])
# # Add predictions to the test dataset

# test_data['log_bike_count'] = test_data_predictions

# predictions_output = test_data[['log_bike_count']].copy()

# predictions_output.insert(0, 'Id', predictions_output.index)

# # Save to CSV file

# output_file_path = 'data/Test_Predictions.csv'

# predictions_output.to_csv(output_file_path, index=False)


In [25]:
# Score the test set with the pipeline refitted on the best grid-search
# parameters.
best_estimator = grid_search.best_estimator_
test_data_predictions = best_estimator.predict(test_data[features])

# Keep the predictions on the test frame as well.
test_data["log_bike_count"] = test_data_predictions

# Submission table: the row index as 'Id' followed by the predicted target.
predictions_output = test_data.loc[:, ["log_bike_count"]].copy()
predictions_output.insert(loc=0, column="Id", value=predictions_output.index)

# Write the submission file without the DataFrame index.
output_file_path = "data/Test_Predictions_xgb_encoded2.csv"
predictions_output.to_csv(output_file_path, index=False)