In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import ParameterGrid

#import hyperopt from the ML runtime. This notebook needs to be run on the Databricks ML Runtime.
#The hyperopt package installed on the ML runtime differs from the open-source version.
from hyperopt import fmin, hp, tpe
from hyperopt import SparkTrials, STATUS_OK, Trials

import lightgbm as lgb

In [None]:
def resampleFixEnds(pdf, frequency):
    """
    Resample ``pdf`` to ``frequency`` by summation and trim incomplete end periods.

    The first and last periods of a resampled series are often only partially
    covered by the raw data, so their sums come out too low. As a simple fix,
    the first (last) row is removed when, in any column, its value is below
    80% of the value of its neighbouring row.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Frame whose index can be resampled by ``DataFrame.resample``.
    frequency : str
        Pandas offset alias for the target sampling frequency, e.g. "D", "W", "M".

    Returns
    -------
    pandas.DataFrame
        The resampled frame with at most one row removed from each end.
    """

    # min_count=1 keeps periods without any observation as NaN instead of 0
    pdf = pdf.resample(frequency).sum(min_count=1)

    # With fewer than two periods there is no neighbour to compare against
    if len(pdf) < 2:
        return pdf

    # Decide on both ends using the untouched frame. Dropping rows while still
    # iterating over the columns would compare later columns against shifted
    # rows and could trim more than the one incomplete period per end.
    dropFirst = bool((pdf.iloc[0] < 0.8 * pdf.iloc[1]).any())
    dropLast = bool((pdf.iloc[-1] < 0.8 * pdf.iloc[-2]).any())

    start = 1 if dropFirst else 0
    stop = len(pdf) - 1 if dropLast else len(pdf)

    # copy() so callers can add columns without chained-assignment warnings
    return pdf.iloc[start:stop].copy()

In [None]:
# Load the raw measurements (CSV with a "Timestamp" column and the value
# columns "Value_NE5" and "Value_NE7" used below).
url = "https://data.stadt-zuerich.ch/dataset/ewz_stromabgabe_netzebenen_stadt_zuerich/download/ewz_stromabgabe_netzebenen_stadt_zuerich.csv"
dataPdf = pd.read_csv(url, index_col=None)

dataPdf["Timestamp"] = pd.to_datetime(dataPdf["Timestamp"], utc=True)

# Set the timestamp as index to do a daily aggregation. Passing the column name
# (instead of the Series) removes "Timestamp" from the columns, so the summation
# inside resampleFixEnds is not applied to a datetime column.
dataPdf = dataPdf.set_index("Timestamp")
dataPdf = resampleFixEnds(dataPdf, "D")

# Drop the timezone to avoid warnings
dataPdf.index = dataPdf.index.tz_localize(None)

# Rename the columns into y and ds, as needed by Prophet
dataPdf["ds"] = dataPdf.index
# Rescale the data to GWh; it is good practice not to work with huge numbers
dataPdf["y"] = (dataPdf["Value_NE5"] + dataPdf["Value_NE7"]) / 1e6
dataPdf = dataPdf.drop(columns=["Value_NE5", "Value_NE7"])

# Put aside the last 365 rows (daily data, i.e. one year) for evaluation
split = len(dataPdf) - 365
trainPdf, testPdf = dataPdf.iloc[:split], dataPdf.iloc[split:]
trainPdf

In [None]:
def train(params):
    """
    Objective function for Hyperopt: fit a LightGBM regressor and score it.

    The model is trained on calendar features derived from ``trainPdf["ds"]``
    with ``trainPdf["y"]`` as target, and evaluated on ``testPdf``.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``lgb.LGBMRegressor``, as sampled by Hyperopt.

    Returns
    -------
    dict
        ``loss`` (RMSE on ``testPdf``), ``status`` (``STATUS_OK``) and the
        fitted estimator under ``Trained_Model``.
    """

    # hp.quniform samples floats (e.g. 31.0); these LightGBM parameters must be integers
    modelParams = dict(params)
    for name in ("num_leaves", "bagging_freq", "min_child_samples"):
        if name in modelParams:
            modelParams[name] = int(modelParams[name])

    model = lgb.LGBMRegressor(**modelParams)

    # LightGBM needs a numeric feature matrix X and a target y; the frames only
    # hold the timestamp "ds" and the target "y", so features are built from "ds".
    model.fit(_calendarFeatures(trainPdf), trainPdf["y"])
    predictedValues = model.predict(_calendarFeatures(testPdf))  # plain ndarray

    # Root of the MSE instead of squared=False, which newer scikit-learn versions removed
    rmse = float(np.sqrt(mean_squared_error(y_true=testPdf["y"].values, y_pred=predictedValues)))

    return {"loss": rmse, "status": STATUS_OK, "Trained_Model": model}


def _calendarFeatures(pdf):
    """Build numeric calendar features from the ``ds`` timestamp column of ``pdf``."""
    ds = pd.to_datetime(pdf["ds"])
    return pd.DataFrame(
        {
            "year": ds.dt.year,
            "month": ds.dt.month,
            "dayofmonth": ds.dt.day,
            "dayofweek": ds.dt.dayofweek,
            "dayofyear": ds.dt.dayofyear,
        }
    )

# Search space for lgb.LGBMRegressor
search_space = {
        # The target y is a continuous quantity, so this is a regression task
        'objective': 'regression',
        'metric': 'rmse',
        # hp.loguniform expects its bounds in log space: it returns exp(uniform(low, high))
        'lambda_l1': hp.loguniform('lambda_l1', np.log(1e-8), np.log(10.0)),
        'lambda_l2': hp.loguniform('lambda_l2', np.log(1e-8), np.log(10.0)),
        'num_leaves': hp.quniform('num_leaves', 2, 256, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': hp.quniform('bagging_freq', 1, 7, 1),
        'min_child_samples': hp.quniform('min_child_samples', 5, 100, 1),
        }

# Select a search algorithm for Hyperopt to use.
algorithm = tpe.suggest  # Tree of Parzen Estimators, a Bayesian method

# fmin needs a Trials *instance* to record the results of every evaluation.
# To distribute tuning across a Spark cluster, SparkTrials(parallelism=4)
# could be used here instead.
trials = Trials()

hyperparameters = fmin(
    fn=train,
    space=search_space,
    algo=algorithm,
    trials=trials,
    max_evals=30,
    timeout=5*60     #seconds
    )

# Only successfully finished trials carry a "loss" and a "Trained_Model"
finishedResults = [r for r in trials.results if r.get("status") == STATUS_OK]
bestModel = min(finishedResults, key=lambda r: r["loss"])["Trained_Model"]

print(hyperparameters)

In [None]:

def train(params):
  """
  This is our main training function which we pass to Hyperopt.
  It takes in hyperparameter settings, fits a Prophet model based on those
  settings on ``trainPdf``, evaluates it on ``testPdf``, and returns the loss.

  Note: this reuses the name ``train`` of the LightGBM objective defined in an
  earlier cell and replaces it from here on.

  Parameters
  ----------
  params : dict
      Must contain the keys "seasonality_mode", "changepoint_prior_scale",
      "seasonality_prior_scale", "holidays_prior_scale", "changepoint_range"
      and "holidays" (a country name or None).

  Returns
  -------
  dict
      ``loss`` (RMSE on ``testPdf``), ``status`` (``STATUS_OK``) and the
      fitted forecaster under ``Trained_Model``.
  """

  # NOTE(review): `mlflow` and `Prophet` are not imported in the notebook's
  # import cell as visible here - confirm they are in scope before this runs.
  with mlflow.start_run(run_name='inner_run', nested=True):

    forecaster = Prophet(
        seasonality_mode=        params["seasonality_mode"],
        changepoint_prior_scale= params["changepoint_prior_scale"],
        seasonality_prior_scale= params["seasonality_prior_scale"],
        holidays_prior_scale=    params["holidays_prior_scale"],
        changepoint_range=       params["changepoint_range"],
    )

    # Holidays are optional; None means "fit without holiday effects"
    if params["holidays"] is not None:
        forecaster.add_country_holidays(country_name=params["holidays"])

    forecaster.fit(trainPdf)
    predictedValues = forecaster.predict(testPdf)

    # Root of the MSE instead of squared=False, which newer scikit-learn versions removed
    rmse = float(np.sqrt(mean_squared_error(y_true=testPdf.y.values, y_pred=predictedValues.yhat.values)))

  return {"loss": rmse, "status": STATUS_OK, "Trained_Model": forecaster}

# Define the search space for Prophet
# https://facebook.github.io/prophet/docs/diagnostics.html#hyperparameter-tuning

search_space = {
  "seasonality_mode":        hp.choice("seasonality_mode",["multiplicative", "additive"]),
  "holidays":                hp.choice("holidays",[None,"Switzerland"]),
  # hp.loguniform takes its bounds in log space, hence the negative numbers
  "changepoint_prior_scale": hp.loguniform("changepoint_prior_scale", -6.9, -0.69),  # according to recom. same as [0.001,0.5]
  "seasonality_prior_scale": hp.loguniform("seasonality_prior_scale", -6.9, 2.3),    # according to recom. same as [0.001, 10]
  "holidays_prior_scale":    hp.loguniform("holidays_prior_scale", -6.9, 2.3),       # according to recom. same as [0.001, 10]
  "changepoint_range":       hp.uniform("changepoint_range", 0.8, 0.95)              # optional according to docs, default = 0.8
}

# Give a name to the run; this name will be used to group the search results.
with mlflow.start_run(run_name='outer_run_prophet'):

  # Select a search algorithm for Hyperopt to use.
  algorithm = tpe.suggest  # Tree of Parzen Estimators, a Bayesian method

  # Distribute tuning across our Spark cluster
  sparkTrials = SparkTrials(parallelism=4)

  hyperparameters = fmin(
      fn=train,
      space=search_space,
      algo=algorithm,
      trials=sparkTrials,
      max_evals=30,
      timeout=5*60     #seconds
      )

  # Trials that did not finish (e.g. cancelled by the timeout) have no "loss"
  # entry, so only successfully finished trials are considered.
  finishedResults = [r for r in sparkTrials.results if r.get("status") == STATUS_OK]
  bestModel = min(finishedResults, key=lambda r: r["loss"])["Trained_Model"]

  print(hyperparameters)
