## Section 1: Databricks Machine Learning

In [0]:
%run "./Includes/Classroom-Setup"

### Databricks ML

In [0]:
# Identify when a standard cluster is preferred over a single-node cluster and vice versa

# シングルノードのクラスタでもDriverノードを持つので、Sparkを動かすことは可能。
# 大規模で並列分散処理が必要な場合はマルチノード、小規模・またはSparkを使わない(sklearnなど)でモデルを開発する場合はシングルノードが望ましい

# https://learn.microsoft.com/en-us/azure/databricks/compute/cluster-config-best-practices#--training-machine-learning-models
# https://learn.microsoft.com/en-us/azure/databricks/compute/configure#--single-node-or-multi-node-compute
# https://community.databricks.com/t5/data-engineering/when-should-i-use-single-node-clusters-vs-standard/td-p/24301

In [0]:
# Connect a repo from an external Git provider to Databricks repos.

# ワークスペース>「作成」>「リポジトリ」でリポジトリURL、プロバイダーを入力してclone可能

# https://learn.microsoft.com/en-us/azure/databricks/repos/get-access-tokens-from-git-provider
# https://github.com/lorenzo1285/-scalable-machine-learning-with-apache-spark-english

In [0]:
# Commit changes from a Databricks Repo to an external Git provider.

# Commit/Pushはコンソール上から可能

In [0]:
# Create a new branch and commit changes to an external Git provider.
# 集成了Github的仓库，可以在Databricks进行git各种操作
# ブランチ作成はコンソール上から可能

In [0]:
# Pull changes from an external Git provider back to a Databricks workspace.

# プルもコンソール上から可能

In [0]:
# Orchestrate multi-task ML workflows using Databricks jobs.
# workflow功能可以自定义task和他们的上下依存关系，很像airflow
# 使用的是Job cluster，runtime包括Standard（适合ETL）和ML
# ←の「ワークフロー」>「ジョブを作成」後、タスクに依存関係を持たせることで可能

### Databricks Runtime for Machine Learning

In [0]:
# Create a cluster with the Databricks Runtime for Machine Learning.

# コンソール上から確認

In [0]:
# Install a Python library to be available to all notebooks that run on a cluster.

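# To make a library available to every notebook on the cluster, install it as a cluster library (cluster configuration > Libraries).
# `%pip install` inside a notebook is notebook-scoped: the library is only visible to that notebook's session.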

# https://docs.databricks.com/ja/libraries/index.html
# 分为Notebook级别和Cluster级别，也就是安装的库可以使用的范围不同，Cluster的安装就可给该集群所有的笔记本使用

### AutoML

In [0]:
# Identify the steps of the machine learning workflow completed by AutoML.
# 是左边栏的实验功能

# 1. 欠損値の補完
# 2. チューニング
# 3. 学習
# 4. 評価
# 5. EDA(自動作成だが、本来はモデル作成の前にやるべき)

# モデルのデプロイはModelsのコンポーネント
# 生成最好的模型后，需要右上角手动登录到register，然后再deploy

In [0]:
# Identify how to locate the source code for the best model produced by AutoML.
# NOTE(review): the returned summary presumably exposes the best trial and its generated
# notebook (the "source code" this heading refers to) — confirm the attribute names in the AutoML API docs.
from databricks import automl

# Load the Airbnb listings Delta table from the course datasets path.
file_path = f"{DA.paths.datasets}/airbnb/sf-listings/sf-listings-2019-03-06-clean.delta/"
airbnb_df = spark.read.format("delta").load(file_path)
# 80/20 random split with a fixed seed; train_df and test_df are reused by the manual MLflow logging cell.
train_df, test_df = airbnb_df.randomSplit([.8, .2], seed=42)

# Run an AutoML regression experiment on "price" optimising RMSE; `summary` is reused later
# to look up the experiment's runs.
# NOTE(review): `max_trials` may be deprecated or unsupported on newer ML runtimes — confirm for the runtime in use.
summary = automl.regress(train_df, target_col="price", primary_metric="rmse", timeout_minutes=5, max_trials=10)

In [0]:
# Identify which evaluation metrics AutoML can use for regression problems.

# Print the docstring of automl.regress.
# NOTE(review): presumably the docstring lists the accepted `primary_metric` values — confirm in the output.
help(automl.regress)

# https://docs.databricks.com/ja/machine-learning/automl/train-ml-model-automl-api.html

In [0]:
# Identify the key attributes of the data set using the AutoML data exploration notebook.

# UI参照

### Feature Store

In [0]:
# Describe the benefits of using Feature Store to store and access features for machine learning pipelines.

# It enables feature sharing and discovery across your organization and also ensures that the same feature computation code is used for model training and inference.

In [0]:
# Create a feature store table.
import pyspark.sql.functions as F
import uuid
from databricks import feature_store
from pyspark.sql.types import StringType, DoubleType
from databricks.feature_store import feature_table, FeatureLookup
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the Airbnb listings and add a generated "index" column, which is used below as the
# feature table's primary key.
# NOTE(review): presumably coalesce(1) is there so the generated ids come from a single
# partition and are consecutive — confirm.
file_path = f"{DA.paths.datasets}/airbnb/sf-listings/sf-listings-2019-03-06-clean.delta/"
airbnb_df = spark.read.format("delta").load(file_path).coalesce(1).withColumn("index", F.monotonically_increasing_id())

# Per-user database plus a random 6-character suffix, so each execution of this cell
# targets a new table name.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DA.cleaned_username}")
table_name = f"{DA.cleaned_username}.airbnb_" + str(uuid.uuid4())[:6]
print(table_name)

# Client used by the following cells to create, read, log against and score with the feature table.
fs = feature_store.FeatureStoreClient()

## select numeric features and exclude target column "price"
# Only DoubleType columns are kept; "index" is added back as the key column.
numeric_cols = [x.name for x in airbnb_df.schema.fields if (x.dataType == DoubleType()) and (x.name != "price")]
numeric_features_df = airbnb_df.select(["index"] + numeric_cols)

# create fs table and insert records
fs.create_table(
    name=table_name,
    primary_keys=["index"],
    df=numeric_features_df,
    schema=numeric_features_df.schema,
    description="Numeric features of airbnb data"
)

# Alternative kept for reference: create the table from the schema only, then insert the
# records in a separate write_table call.
# # create fs table
# fs.create_table(
#     name=table_name,
#     # a primary key is required
#     primary_keys=["index"],
#     schema=numeric_features_df.schema,
#     description="Original Airbnb data"
# )

# # insert records later
# fs.write_table(
#     name=table_name,
#     df=numeric_features_df,
#     mode="overwrite"
# )

In [0]:
# Write data to a feature store table.

# overwrite

# Example update frame: the rows with index < 100, plus a constant extra column.
df_new_feature = numeric_features_df\
  .filter(F.col('index')< 100)\
  .withColumn('new_feature', F.lit(999))

# Both write examples below are commented out, so this cell does not modify the table;
# they show the two write modes ("overwrite" and "merge").
# fs.write_table(
#     name=table_name,
#     df=df_new_feature,
#     mode="overwrite"
# )

# fs.write_table(
#     name=table_name,
#     df=df_new_feature,
#     mode="merge"
# )

# Keep in mind the difference between get_table() and read_table()

# Read the feature table back and show it.
feature_table_df = fs.read_table(table_name)
display(feature_table_df)

In [0]:
# Train a model with features from a feature store table.

# Labelled lookup frame: only the primary key and the target. Every model feature is
# joined in from the feature store table when the training set is created.
inference_data_df = airbnb_df.select("index", "price")

# No `feature_names` given, so the lookup brings in the table's feature columns,
# matched on the "index" key.
model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key="index")]

# The TrainingSet records which features were looked up; fs.log_model needs it below so
# that fs.score_batch can repeat the same lookups at inference time.
training_set = fs.create_training_set(
    df=inference_data_df,
    feature_lookups=model_feature_lookups,
    label="price",
    exclude_columns=["index"],  # the key is only used for the join, not as a feature
)
training_pd = training_set.load_df().toPandas()

# Hold out 20% for evaluation, with a fixed seed for reproducibility.
X = training_pd.drop("price", axis=1)
y = training_pd["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

with mlflow.start_run() as run:

    rf = RandomForestRegressor(max_depth=3, n_estimators=20, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    mlflow.log_metric("test_mse", mean_squared_error(y_test, y_pred))
    mlflow.log_metric("test_r2_score", r2_score(y_test, y_pred))

    # Log through the feature store client (not mlflow.sklearn.log_model) so the
    # feature lookups are packaged with the model.
    fs.log_model(
        model=rf,
        artifact_path="feature-store-model",
        flavor=mlflow.sklearn,
        training_set=training_set,
        registered_model_name=f"feature_store_airbnb_{DA.cleaned_username}",
        input_example=X_train[:5],
        signature=infer_signature(X_train, y_train)
    )

In [0]:
# Score a model using features from a feature store table.
# NOTE(review): `inference_data_df` must already exist in the session when this cell runs;
# presumably it is the labelled key frame the training set was built from — confirm it is
# defined before this cell.
# NOTE(review): the model URI pins registered version 1; re-running the training cell
# presumably registers newer versions that this cell would not pick up — confirm.
batch_input_df = inference_data_df.drop("price") # Exclude true label
predictions_df = fs.score_batch(f"models:/feature_store_airbnb_{DA.cleaned_username}/1", 
                                  batch_input_df, result_type="double")
display(predictions_df)

### Managed MLflow

In [0]:
# Identify the best run using the MLflow Client API.

# Approach: take every run of the experiment, sort by the metric (ascending here;
# for something like R^2 you would sort descending) and keep the first row.
# One experiment contains many runs.
automl_runs = mlflow.search_runs(
    summary.experiment.experiment_id,
    order_by=["metrics.val_rmse"],
)
run_id_best = automl_runs["run_id"][0]

model_uri = f"runs:/{run_id_best}/model"

# Load the model as a PyFuncModel
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [0]:
# Manually log metrics, artifacts, and models in an MLflow Run.
import mlflow
import mlflow.spark
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.sql.functions import col, log, exp
import matplotlib.pyplot as plt

# Evaluator on the original price scale: the model predicts log(price), and the
# prediction is exponentiated back into the "prediction" column before scoring.
regression_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction")

with mlflow.start_run(run_name="LR-Log-Price") as run:
    # Take log of price
    log_train_df = train_df.withColumn("log_price", log(col("price")))
    log_test_df = test_df.withColumn("log_price", log(col("price")))

    # Log parameter
    mlflow.log_param("label", "log_price")
    mlflow.log_param("features", "all_features")

    # Create pipeline
    # The formula uses every column except the raw price as a feature, so the label does not leak.
    r_formula = RFormula(
        formula="log_price ~ . - price",
        featuresCol="features",
        labelCol="log_price",
        handleInvalid="skip",
    )
    lr = LinearRegression(labelCol="log_price", predictionCol="log_prediction")
    pipeline = Pipeline(stages=[r_formula, lr])
    pipeline_model = pipeline.fit(log_train_df)

    # Log model
    mlflow.spark.log_model(
        pipeline_model, "log-model", input_example=log_train_df.limit(5).toPandas()
    )

    # Make predictions
    pred_df = pipeline_model.transform(log_test_df)
    exp_df = pred_df.withColumn("prediction", exp(col("log_prediction")))

    # Evaluate predictions
    rmse = regression_evaluator.setMetricName("rmse").evaluate(exp_df)
    r2 = regression_evaluator.setMetricName("r2").evaluate(exp_df)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log artifact
    plt.clf()

    # Histogram of the log-transformed training label, stored as a figure artifact of the run.
    log_train_df.toPandas().hist(column="log_price", bins=100)
    fig = plt.gcf()
    mlflow.log_figure(fig, f"{DA.username}_log_normal.png")
    plt.show()

In [0]:
# Create a nested Run for deeper Tracking organization.

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def train_model(df_pandas: pd.DataFrame) -> pd.DataFrame:
    """
    Fit and log one sklearn random forest for the group of rows in ``df_pandas``.

    The group's device id and the parent MLflow run id are read from the first
    row. The model is fitted on ``feature_1``..``feature_3`` against ``label``
    and scored on the same rows (no train/test split). Model, metric and tag
    are logged to a nested run under the parent run.

    Returns a one-row DataFrame with columns
    ``device_id``, ``n_used``, ``model_path`` and ``mse``.
    """
    # Group metadata; the parent run id travels with the data so a nested run can be opened
    parent_run_id = df_pandas["run_id"].iloc[0]
    device = df_pandas["device_id"].iloc[0]
    row_count = len(df_pandas)
    device_label = str(device)

    # Fit on the three feature columns
    features = df_pandas[["feature_1", "feature_2", "feature_3"]]
    target = df_pandas["label"]
    forest = RandomForestRegressor()
    forest.fit(features, target)

    # Error on the training rows themselves
    train_mse = mean_squared_error(target, forest.predict(features))

    # Re-open the top-level run, then hang a per-device child run under it
    with mlflow.start_run(run_id=parent_run_id) as parent_run:
        # Small hack for running as a job: pass the parent's experiment id explicitly
        parent_experiment_id = parent_run.info.experiment_id
        print(f"Current experiment_id = {parent_experiment_id}")

        with mlflow.start_run(
            run_name=device_label, nested=True, experiment_id=parent_experiment_id
        ) as child_run:
            mlflow.sklearn.log_model(forest, device_label)
            mlflow.log_metric("mse", train_mse)
            mlflow.set_tag("device", device_label)

            model_path = f"runs:/{child_run.info.run_id}/{device}"

    # Single summary row for this device
    result_columns = ["device_id", "n_used", "model_path", "mse"]
    return pd.DataFrame([[device, row_count, model_path, train_mse]], columns=result_columns)

In [0]:
# Locate the time a run was executed in the MLflow UI.

# See the GUI (the experiment's runs table shows when each run started).

# From a notebook
import mlflow

# Use the AutoML experiment created earlier rather than an empty placeholder, so the
# cell runs without manual editing. Replace with any other experiment ID to inspect it.
exp_id = summary.experiment.experiment_id

# pandas view: one row per run, including its start and end timestamps.
runs = mlflow.search_runs([exp_id])
display(runs[["run_id", "start_time", "end_time"]])

# Spark view of the same experiment through the mlflow-experiment data source.
df_runs = spark.read.format("mlflow-experiment").load(exp_id)
display(df_runs)

In [0]:
# Locate the code that was executed with a run in the MLflow UI

# GUI参照

In [0]:
# Register a model using the MLflow Client API.
from mlflow.tracking.client import MlflowClient
# `client` is reused by the stage-transition cells that follow.
client = MlflowClient()

# Register the model artifact of the best run (`run_id_best`, selected in the best-run cell)
# under a per-user model name.
model_name = f"{DA.cleaned_username}_review"
model_uri = f"runs:/{run_id_best}/model"

# `model_details` carries the registered name and the new version number used below.
model_details = mlflow.register_model(model_uri=model_uri, name=model_name)

# optional
# Description on the registered model as a whole.
client.update_registered_model(
    name=model_details.name,
    description="This model forecasts Airbnb housing list prices based on various listing inputs."
)

# Description on this specific version.
# NOTE(review): the text says OLS linear regression with sklearn, while `model_uri` points at
# the best run of the AutoML experiment — confirm the description matches the registered model.
client.update_model_version(
    name=model_details.name,
    version=model_details.version,
    description="This model version was built using OLS linear regression with sklearn."
)

In [0]:
# Transition a model’s stage using the Model Registry UI page.

# See the UI for the transition itself.
# Read the stage of the exact version registered above. Indexing the first search result
# could return a different version once the model has more than one.
client.get_model_version(name=model_details.name, version=model_details.version).current_stage

In [0]:
# Transition a model’s stage using the MLflow Client API.
client.transition_model_version_stage(
    name=model_details.name,
    version=model_details.version,
    stage="Production"
)

# Read back the stage of the version that was just transitioned. Indexing the first search
# result could return a different version once the model has more than one.
client.get_model_version(name=model_details.name, version=model_details.version).current_stage

In [0]:
# Request to transition a model’s stage using the ML Registry UI page.

# UI参照