In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project .py files (e.g. my_utils) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from skrub import TableVectorizer

import external_data.external_data_functions as ext
import my_utils

# Import data

In [11]:
# Load the training features (X) and target (y) through the project helper, then
# enrich the features with the project's external data.
# NOTE(review): `_merge_external_data` is underscore-prefixed (private by convention)
# and what it merges is not visible from this notebook — confirm it keeps row order.
X, y = my_utils.get_train_data()
X = ext._merge_external_data(X)

In [None]:
# Quick sanity check of the merged features (assumes X is a pandas DataFrame):
# column names, dtypes, non-null counts and memory usage.
X.info()

# Pipeline

In [4]:
# Split (X, y) into a training part and a validation part with the project helper.
# NOTE(review): the unpack order is (X_train, y_train, X_valid, y_valid), which differs
# from sklearn's train_test_split (X_train, X_test, y_train, y_test) — presumably this
# matches the helper's return order; verify in my_utils.
X_train, y_train, X_valid, y_valid = my_utils.train_test_split_temporal(X, y)

In [5]:
# Feature-engineering functions from my_utils, wrapped so they can act as pipeline steps.
date_encoder = FunctionTransformer(my_utils._encode_dates)
cyclic_features = FunctionTransformer(my_utils.create_time_features)

# Integer-code the identifier columns. Categories that were not present at fit time
# (e.g. an id missing from an early time-series CV fold, or appearing only in the
# test set) are mapped to -1 instead of raising an error at predict time.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)
categorical_cols = ["counter_id", "site_id"]

# Only the categorical columns are transformed; all other columns pass through unchanged.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_cols),
    ],
    remainder="passthrough",
)

# ExtraTreesRegressor gave the best score so far (original author's note).
# A fixed random_state makes the fitted model, and therefore every reported score,
# reproducible across kernel restarts.
regressor = ExtraTreesRegressor(random_state=42)

# Alternative regressor that was tried:
# xgb.XGBRegressor(
#     max_depth=7,
#     colsample_bytree=0.8,
#     learning_rate=0.3,
#     min_child_weight=5,
#     subsample=0.6,
# )

# Full modelling pipeline: date encoding -> time features -> column preprocessing -> regressor.
pipe = Pipeline(
    [
        ("date_encoding", date_encoder),
        ("cyclic_features", cyclic_features),
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ]
)

In [None]:
# Fit the whole pipeline on the full data set (X, y); this fitted `pipe` is the one the
# later cells save to disk and use to predict on the test set.
# NOTE(review): X_valid / y_valid were split off from this same (X, y), so a `pipe`
# fitted here has presumably already seen the validation rows. Fit on the training
# split only (line below) when an unbiased validation score is needed.
pipe.fit(X, y)
# pipe.fit(X_train, y_train)

In [None]:
# pipe.predict(X_valid)

In [None]:
# `pipe` is fitted on the full (X, y), from which X_valid was split off, so scoring it
# on X_valid would leak the validation rows into training and report an optimistic
# error. Fit an unfitted clone on the training split only and evaluate that instead;
# `pipe` itself is left untouched for saving / submission.
eval_pipe = clone(pipe).fit(X_train, y_train)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_train = np.sqrt(mean_squared_error(y_train, eval_pipe.predict(X_train)))
rmse_valid = np.sqrt(mean_squared_error(y_valid, eval_pipe.predict(X_valid)))

print(f"Train set, RMSE={rmse_train:.2f}")
print(f"Valid set, RMSE={rmse_valid:.2f}")

In [None]:
from joblib import dump

# Persist the fitted pipeline to disk so it can be reloaded later without re-training.
model_file = "trained_pipeline.joblib"
dump(pipe, model_file)

# GridSearch

In [None]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# The grid below tunes XGBoost hyper-parameters (min_child_weight, subsample,
# colsample_bytree, learning_rate). `pipe` ends in an ExtraTreesRegressor, which has
# none of these, so searching over `pipe` directly fails with
# "ValueError: Invalid parameter ...". Search over a copy of the pipeline whose
# final step is an XGBRegressor instead; `pipe` itself is not modified.
xgb_pipe = clone(pipe).set_params(regressor=xgb.XGBRegressor())

# Keys use the "<step name>__<parameter>" syntax to reach into the "regressor" step.
param_grid = {
    "regressor__max_depth": [5, 6, 7],
    "regressor__min_child_weight": [1, 3, 5],
    "regressor__subsample": [0.6, 0.8, 1.0],
    "regressor__colsample_bytree": [0.6, 0.8, 1.0],
    "regressor__learning_rate": [0.01, 0.1, 0.3],
}

# To switch to RandomizedSearchCV, pass `param_distributions=param_grid` and e.g.
# `n_iter=50` instead of `param_grid=param_grid`.
grid_search = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",  # negated MSE: higher (closer to 0) is better
    cv=my_utils.get_cv(X, y),  # CV strategy supplied by the project helper (presumably a splitter; verify)
    verbose=4,  # To display progress
    n_jobs=-1,  # Use all available processors
)

# Exhaustive search: 3**5 = 243 parameter combinations, each fitted once per CV fold.
grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Get the best pipeline (refitted on the full X, y with the best parameters)
best_pipeline = grid_search.best_estimator_

# Cross-validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Forward-chaining splits: each fold trains on earlier rows and tests on later ones.
# NOTE(review): this assumes X_train is ordered chronologically — confirm.
cv = TimeSeriesSplit(n_splits=6)

# scikit-learn scorers follow a "greater is better" convention, so error metrics are
# exposed negated ("neg_..."). Flip the sign to report actual RMSE values.
scores = cross_val_score(
    pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
rmse_per_fold = -scores

print("RMSE: ", rmse_per_fold)
print(f"RMSE (all folds): {rmse_per_fold.mean():.3} ± {rmse_per_fold.std():.3}")

# Test set

In [7]:
# Read the test features from data/final_test.parquet (path relative to the working directory).
test_data = pd.read_parquet(Path("data") / "final_test.parquet")

In [8]:
# Apply the same external-data merge that was applied to the training features.
X_test = ext._merge_external_data(test_data)

In [None]:
from joblib import load

# Load the trained pipeline that was saved to disk with joblib.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load
# files produced by this notebook or another trusted source.
loaded_model = load("trained_pipeline.joblib")  # per the original note: saved with ExtraTreesRegressor()

# Predict the target for the test features with the reloaded pipeline.
predictions = loaded_model.predict(X_test)

In [9]:
# Predict with the in-memory `pipe` rather than the pipeline reloaded from disk.
# NOTE(review): this rebinds `predictions`, replacing the value computed from
# `loaded_model` if that cell was run first — keep only one of the two in a final run.
predictions = pipe.predict(X_test)

In [10]:
# Build the submission table: a 0..n-1 positional id per test row plus the prediction.
# NOTE(review): assumes the rows of X_test are still in the order of the original test
# file — verify that ext._merge_external_data preserves row order.
row_ids = X_test.reset_index().index
submission = pd.DataFrame({"Id": row_ids, "log_bike_count": predictions})

# Write without the DataFrame index so the file has exactly the two columns above.
submission.to_csv("submission.csv", index=False)