# SGD Regressor Training

This Jupyter Notebook presents a detailed implementation of training a Stochastic Gradient Descent (SGD) Regressor on the Bike Sharing dataset to accurately predict bike rental demand. The dataset contains information such as weather conditions, time of day, and historical rental data, making it an excellent resource for studying bike usage patterns and forecasting future demand.
Data preprocessing has already been performed.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder


# Boolean features: no imputers are configured, so the inner ColumnTransformer
# only forwards its input (remainder="passthrough") to the encoder.
bool_imputers = list()

bool_pipeline = Pipeline(
    [
        # Cast the values to the generic object dtype before encoding.
        ("cast_type", FunctionTransformer(lambda frame: frame.astype(object))),
        ("imputers", ColumnTransformer(transformers=bool_imputers, remainder="passthrough")),
        # One-hot encode, dropping the first level of each column.
        ("onehot", SklearnOneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

bool_transformers = [
    ("boolean", bool_pipeline, ["workingday", "holiday", "yr"]),
]

### Numerical Columns

Missing values in numerical columns are imputed with the mean by default.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# A single imputer (SimpleImputer with its default settings) covers every
# numerical column listed below.
num_imputers = [
    (
        "impute_mean",
        SimpleImputer(),
        [
            "atemp", "casual", "cnt", "holiday", "hum", "instant", "mnth",
            "registered", "season", "weathersit", "weekday", "windspeed",
            "workingday", "yr",
        ],
    ),
]

numerical_pipeline = Pipeline(
    [
        # Coerce every column to numeric; values that cannot be parsed become NaN
        # (errors='coerce') and are then handled by the imputer step.
        ("converter", FunctionTransformer(lambda frame: frame.apply(pd.to_numeric, errors='coerce'))),
        ("imputers", ColumnTransformer(transformers=num_imputers)),
        ("standardizer", StandardScaler()),
    ]
)

numerical_transformers = [
    (
        "numerical",
        numerical_pipeline,
        [
            "weathersit", "windspeed", "instant", "casual", "workingday",
            "holiday", "season", "atemp", "weekday", "registered", "mnth",
            "cnt", "hum", "yr",
        ],
    ),
]

### Categorical Columns

#### Low-Cardinality Categoricals
Convert each low-cardinality categorical column into multiple binary columns through one-hot encoding.
For each input categorical column (string or numeric), the number of output columns is equal to the number of unique values in the input column.

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# No imputers are configured for the categorical columns, so the inner
# ColumnTransformer only forwards its input (remainder="passthrough").
one_hot_imputers = []

one_hot_pipeline = Pipeline(steps=[
    ("imputers", ColumnTransformer(one_hot_imputers, remainder="passthrough")),
    # scikit-learn's OneHotEncoder does not accept handle_unknown="indicator"
    # and rejects it when the pipeline is fitted. "ignore" encodes categories
    # unseen during fit as all zeros, matching the boolean pipeline's encoder.
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
])

categorical_one_hot_transformers = [("onehot", one_hot_pipeline, ["mnth", "season", "weathersit", "weekday"])]

In [None]:
from sklearn.compose import ColumnTransformer

# Combine the per-type transformer lists into one specification.
# NOTE(review): `date_transformers` is not defined in the cells shown here;
# it must be provided by an earlier cell -- confirm before running.
transformers = [
    *date_transformers,
    *bool_transformers,
    *numerical_transformers,
    *categorical_one_hot_transformers,
]

# Columns not claimed by any transformer are passed through unchanged;
# sparse_threshold=0 requests a dense output.
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
    sparse_threshold=0,
)

In [None]:
# Partition the loaded frame using the split label stored in `col_771f`.
split_train_df = df_loaded.loc[df_loaded["col_771f"] == "train"]
split_val_df = df_loaded.loc[df_loaded["col_771f"] == "val"]
split_test_df = df_loaded.loc[df_loaded["col_771f"] == "test"]

# Targets: the `target_col` column of each partition.
y_train = split_train_df[target_col]
y_val = split_val_df[target_col]
y_test = split_test_df[target_col]

# Features: everything except the target and the split-label column.
X_train = split_train_df.drop(columns=[target_col, "col_771f"])
X_val = split_val_df.drop(columns=[target_col, "col_771f"])
X_test = split_test_df.drop(columns=[target_col, "col_771f"])

## Train Regression Model

In [None]:
from sklearn.linear_model import SGDRegressor

# Print the built-in documentation of SGDRegressor (signature and parameter
# descriptions) as a reference for the hyperparameters configured below.
help(SGDRegressor)

Help on class SGDRegressor in module sklearn.linear_model._stochastic_gradient:



class SGDRegressor(BaseSGDRegressor)

 |  SGDRegressor(loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)

 |  

 |  Linear model fitted by minimizing a regularized empirical loss with SGD.

 |  

 |  SGD stands for Stochastic Gradient Descent: the gradient of the loss is

 |  estimated each sample at a time and the model is updated along the way with

 |  a decreasing strength schedule (aka learning rate).

 |  

 |  The regularizer is a penalty added to the loss function that shrinks model

 |  parameters towards the zero vector using either the squared euclidean norm

 |  L2 or the absolute norm L1 or a combination of both (Elastic N

### Define the Objective Function
The objective function below is used to find optimal hyperparameters. By default, this notebook only runs
this function once (`max_evals=1` in the `hyperopt.fmin` invocation) with fixed hyperparameters, but
hyperparameters can be tuned by modifying `space`, defined below. `hyperopt.fmin` will then use this
function's return value to search the space to minimize the loss.

In [None]:
import mlflow
from mlflow.models import Model, infer_signature, ModelSignature
from mlflow.pyfunc import PyFuncModel
from mlflow import pyfunc
import sklearn
from sklearn import set_config
from sklearn.pipeline import Pipeline
from hyperopt import hp, tpe, fmin, STATUS_OK, Trials


def objective(params):
  """Train and evaluate one SGDRegressor pipeline for a single hyperopt trial.

  Relies on notebook-level names defined in other cells: `col_selector`,
  `preprocessor`, `target_col`, and the `X_*` / `y_*` train, validation and
  test splits.

  Args:
    params: Keyword arguments forwarded unchanged to `SGDRegressor`.

  Returns:
    A dict with the keys hyperopt expects (`loss`, `status`) plus
    `val_metrics`, `test_metrics`, the fitted `model` pipeline and the MLflow
    `run`. `loss` is the *negated* validation R^2 score, because
    `hyperopt.fmin` minimizes the loss while a higher R^2 is better.
  """
  with mlflow.start_run(experiment_id="xxxxxx") as mlflow_run:
    sgdr_regressor = SGDRegressor(**params)

    model = Pipeline([
        ("column_selector", col_selector),
        ("preprocessor", preprocessor),
        ("regressor", sgdr_regressor),
    ])

    # Enable automatic logging of input samples, metrics, parameters, and models
    mlflow.sklearn.autolog(
        log_input_examples=True,
        silent=True,
    )

    model.fit(X_train, y_train)

    # Wrap the fitted pipeline as a pyfunc model so mlflow.evaluate can score it
    mlflow_model = Model()
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
    pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

    # Log metrics for the training set
    training_eval_result = mlflow.evaluate(
        model=pyfunc_model,
        data=X_train.assign(**{str(target_col): y_train}),
        targets=target_col,
        model_type="regressor",
        evaluator_config={"log_model_explainability": False,
                          "metric_prefix": "training_"}
    )
    # Log metrics for the validation set
    val_eval_result = mlflow.evaluate(
        model=pyfunc_model,
        data=X_val.assign(**{str(target_col): y_val}),
        targets=target_col,
        model_type="regressor",
        evaluator_config={"log_model_explainability": False,
                          "metric_prefix": "val_"}
    )
    sgdr_val_metrics = val_eval_result.metrics
    # Log metrics for the test set
    test_eval_result = mlflow.evaluate(
        model=pyfunc_model,
        data=X_test.assign(**{str(target_col): y_test}),
        targets=target_col,
        model_type="regressor",
        evaluator_config={"log_model_explainability": False,
                          "metric_prefix": "test_"}
    )
    sgdr_test_metrics = test_eval_result.metrics

    # hyperopt minimizes the loss, but a larger R^2 means a better fit, so the
    # score is negated. Returning the raw R^2 would make the search (and
    # `trials.best_trial`) prefer the worst model once max_evals > 1.
    loss = -sgdr_val_metrics["val_r2_score"]

    # Truncate metric key names so they can be displayed together
    sgdr_val_metrics = {k.replace("val_", ""): v for k, v in sgdr_val_metrics.items()}
    sgdr_test_metrics = {k.replace("test_", ""): v for k, v in sgdr_test_metrics.items()}

    return {
      "loss": loss,
      "status": STATUS_OK,
      "val_metrics": sgdr_val_metrics,
      "test_metrics": sgdr_test_metrics,
      "model": model,
      "run": mlflow_run,
    }

### Configure the Hyperparameter Search Space
Configure the search space of parameters. Parameters below are all constant expressions but can be
modified to widen the search space. For example, when training a decision tree regressor, to allow
the maximum tree depth to be either 2 or 3, set the key of 'max_depth' to
`hp.choice('max_depth', [2, 3])`. Be sure to also increase `max_evals` in the `fmin` call below.

In [None]:
# Single-point search space: every entry is a constant, so one evaluation
# covers it. Replace a value with a hyperopt expression (e.g. `hp.choice`)
# to widen the search.
space = dict(
    alpha=3.715669458143688e-05,
    average=False,
    early_stopping=False,
    fit_intercept=True,
    eta0=0.03146676931013463,
    learning_rate="adaptive",
    epsilon=0.02227292098581603,
    loss="huber",
    n_iter_no_change=5,
    penalty="l2",
    tol=0.0006063728847338268,
    validation_fraction=0.1,
    random_state=719041280,
)

### Run Trials
When widening the search space and training multiple models, switch to `SparkTrials` to parallelize
training on Spark:
```
from hyperopt import SparkTrials
trials = SparkTrials()
```

In [None]:
# Run the search; per-trial results are collected in `trials`.
trials = Trials()
fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1,  # Increase this when widening the hyperparameter search space.
    trials=trials,
)

# Pull the fitted pipeline and its MLflow run out of the best trial's result.
best_result = trials.best_trial["result"]
model, mlflow_run = best_result["model"], best_result["run"]

# Show the validation and test metrics side by side.
display(
    pd.DataFrame(
        [best_result[key] for key in ("val_metrics", "test_metrics")],
        index=["validation", "test"],
    )
)

# Render the fitted pipeline as a diagram in the notebook output.
set_config(display="diagram")
model


  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

2023/07/14 17:05:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


2023/07/14 17:05:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


2023/07/14 17:05:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.




100%|██████████| 1/1 [00:07<00:00,  7.68s/trial, best loss: 0.996134956935069]


Unnamed: 0,score,example_count,mean_absolute_error,mean_squared_error,root_mean_squared_error,sum_on_target,mean_on_target,r2_score,max_error,mean_absolute_percentage_error
validation,0.996135,133,0.009128,0.000135,0.011619,65.682161,0.493851,0.996135,0.034676,0.022972
test,0.994034,161,0.010478,0.000185,0.013604,79.140128,0.491554,0.994034,0.062461,0.023403


In [None]:
# Print the model URI ("runs:/<run_id>/model") for the MLflow run taken from
# the best trial's result.
print(f"runs:/{ mlflow_run.info.run_id }/model")