# Tracking Models with MLflow

MLflow is pre-installed on the Databricks Runtime for ML. If you are not using the ML Runtime, you will need to install mlflow.

In [2]:
import os

# Launcher arguments for PySpark; set before findspark/pyspark are touched.
# NOTE(review): this requests local[2] while the builder below requests
# local[*] -- confirm which master setting is actually intended.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--master local[2] pyspark-shell'

import findspark

findspark.init()

# Imported only after findspark.init() has run, as in the original cell order.
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session when one exists.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Load the dataset and make a seeded (reproducible) 80/20 train/test split.
filePath = "data/sf-airbnb-clean.parquet"
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

# Every string-typed column is indexed into a "<column>Index" column.
categoricalCols = [field for (field, dataType) in trainDF.dtypes
                   if dataType == "string"]
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="skip")

# Every double-typed column except the label ("price") is a numeric feature.
# Plain booleans are combined with `and` (short-circuiting) rather than the
# bitwise `&` operator.
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field != "price"]
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                               outputCol="features")

rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5,
                           numTrees=100, seed=42)

# Stage order matters: index strings -> assemble feature vector -> regressor.
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

#### Make sure you have Git installed before running the MLflow code
<https://git-scm.com/download/win>

## MLflow

1) Open the Anaconda Prompt
2) Navigate to the DSE6220/Week 8/Examples folder
3) Run `mlflow ui`
4) Open any browser and go to http://localhost:5000/

In [7]:
import os
import tempfile

import mlflow
import mlflow.spark
import pandas as pd

# Everything logged inside this block is attached to one MLflow run; `run`
# stays available afterwards for looking the run up again.
with mlflow.start_run(run_name="random-forest") as run:
  # Log params: Num Trees and Max Depth
  mlflow.log_param("num_trees", rf.getNumTrees())
  mlflow.log_param("max_depth", rf.getMaxDepth())

  # Log model
  pipelineModel = pipeline.fit(trainDF)
  mlflow.spark.log_model(pipelineModel, "model")

  # Log metrics: RMSE and R2
  predDF = pipelineModel.transform(testDF)
  regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                            labelCol="price")
  rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
  r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
  mlflow.log_metrics({"rmse": rmse, "r2": r2})

  # Log artifact: Feature Importance Scores
  # The fitted forest is the last pipeline stage; its importances are paired
  # positionally with the assembler's input columns.
  rfModel = pipelineModel.stages[-1]
  pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                    rfModel.featureImportances)),
                          columns=["feature", "importance"])
              .sort_values(by="importance", ascending=False))
  # First write to the local filesystem, then tell MLflow where to find the
  # file. A temporary directory is used instead of a hard-coded "/tmp" so the
  # cell also runs where that path does not exist (e.g. Windows) and leaves no
  # stray file behind; the artifact keeps the name "feature-importance.csv".
  with tempfile.TemporaryDirectory() as tmpDir:
    csvPath = os.path.join(tmpDir, "feature-importance.csv")
    pandasDF.to_csv(csvPath, index=False)
    mlflow.log_artifact(csvPath)

## MLflowClient

In [8]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Ask for a single run from the experiment of the run logged above, ordered by
# start time descending.
runs = client.search_runs(
    run.info.experiment_id,
    order_by=["attributes.start_time desc"],
    max_results=1,
)

latestRun = runs[0]
run_id = latestRun.info.run_id

# Last expression of the cell: shows the logged metrics.
latestRun.data.metrics

{'r2': 0.22794251914574226, 'rmse': 211.5096898777315}

## Generate Batch Predictions

Let's load the model back in to generate batch predictions

In [11]:
# Reload the pipeline model that was logged under the run found above
modelUri = f"runs:/{run_id}/model"
pipelineModel = mlflow.spark.load_model(modelUri)

# Score the full dataset in one batch
inputPath = "data/sf-airbnb-clean.parquet"
inputDF = spark.read.parquet(inputPath)
predDF = pipelineModel.transform(inputDF)

2024/04/13 13:11:00 INFO mlflow.spark: 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model' resolved as 'file:///C:/Users/Lowhorn/Documents/DSE6220/Week%208/Examples/mlruns/0/11fbfafa15e54a1a940fe4b0aaf5ec74/artifacts/model'
2024/04/13 13:11:00 INFO mlflow.spark: URI 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model/sparkml' does not point to the current DFS.
2024/04/13 13:11:00 INFO mlflow.spark: File 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model/sparkml' not found on DFS. Will attempt to upload the file.
2024/04/13 13:11:00 INFO mlflow.spark: Copied SparkML model to /tmp/mlflow/aa308d23-cffb-4d58-8a9b-410eb4d757a1


## Generate Streaming Predictions

We can do the same thing to generate streaming predictions.

In [12]:
# Load saved model with MLflow
pipelineModel = mlflow.spark.load_model(f"runs:/{run_id}/model")

# Set up simulated streaming data
repartitionedPath = "data/sf-airbnb-clean-100p.parquet"
# File-based streaming sources need the schema up front, so take it from a
# one-off batch read of the same files.
schema = spark.read.parquet(repartitionedPath).schema

streamingData = (spark
                 .readStream
                 .schema(schema) # Can set the schema this way
                 .option("maxFilesPerTrigger", 1)
                 .parquet(repartitionedPath))

# Generate Predictions
streamPred = pipelineModel.transform(streamingData)

# Nothing is executed until a streaming query is started on `streamPred`.
# On Databricks, `display(streamPred)` does that for you. In a local notebook,
# uncomment the lines below to start a query and peek at the predictions:
# query = (streamPred
#          .writeStream
#          .format("memory")
#          .queryName("stream_pred")
#          .outputMode("append")
#          .start())
# spark.sql("SELECT * FROM stream_pred").show(5)

# Just remember to stop your stream at the end! The stop() method lives on the
# StreamingQuery returned by start(), not on the DataFrame:
# query.stop()

2024/04/13 13:11:11 INFO mlflow.spark: 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model' resolved as 'file:///C:/Users/Lowhorn/Documents/DSE6220/Week%208/Examples/mlruns/0/11fbfafa15e54a1a940fe4b0aaf5ec74/artifacts/model'
2024/04/13 13:11:11 INFO mlflow.spark: URI 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model/sparkml' does not point to the current DFS.
2024/04/13 13:11:11 INFO mlflow.spark: File 'runs:/11fbfafa15e54a1a940fe4b0aaf5ec74/model/sparkml' not found on DFS. Will attempt to upload the file.
2024/04/13 13:11:11 INFO mlflow.spark: Copied SparkML model to /tmp/mlflow/fe0db27d-4bbf-4518-bf91-cc8a333f13e7
