# Module 5: Perform batch scoring and save predictions to lakehouse

We start by mounting the default lakehouse, as in modules 2-4, and setting configurations to optimize performance, as in module 1.

In [None]:
# Session-level Spark settings applied before any data is written by this notebook.
# NOTE: Spark accepts unknown configuration keys without raising, so a misspelled key
# (previously "sprk.sql...") silently leaves the feature disabled.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")  # Enable VOrder write
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")  # Enable automatic delta optimized write

#### Read a random sample of cleansed data from lakehouse for the year 2016 and month 3

In [None]:
# Fixed random seed so the sample below is reproducible across runs
SEED = 1234

# Load the Delta table at Tables/nyctaxi_prep, keep only rows for year 2016 / month 3,
# then sample with replacement at fraction 0.01 to reduce execution time for this tutorial.
input_df = (
    spark.read.format("delta")
    .load("Tables/nyctaxi_prep")
    .filter("puYear = 2016 AND puMonth = 3")
    .sample(withReplacement=True, fraction=0.01, seed=SEED)
)

#### Get the trained and registered model to generate predictions

In [None]:
import mlflow
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from synapse.ml.core.platform import *
from synapse.ml.lightgbm import LightGBMRegressor

# URI of the MLflow run that holds the trained model.
# The placeholder must be replaced with the run_uri produced in module 04 before running.
run_uri = "<Enter the run_uri from module 04 here>"

# Load the Spark model from MLflow, using Files/tmp/mlflow as the DFS temp directory.
loaded_model = mlflow.spark.load_model(
    model_uri=run_uri,
    dfs_tmpdir="Files/tmp/mlflow",
)

#### Run model transform on the input dataframe to generate predictions and remove unnecessary vector features created for model training

In [None]:
# Columns to strip from the scored dataframe: the index / encoded / assembled-vector
# feature columns that were created for model training and are not needed in the output.
cols_toremove = [
    'storeAndFwdFlagIdx',
    'timeBinsIdx',
    'vendorIDIdx',
    'paymentTypeIdx',
    'vendorIDEnc',
    'rateCodeIdEnc',
    'paymentTypeEnc',
    'weekDayEnc',
    'pickupHourEnc',
    'storeAndFwdFlagEnc',
    'timeBinsEnc',
    'features',
    'weekDayNameIdx',
    'pickupHourIdx',
    'rateCodeIdIdx',
    'weekDayNameEnc',
]

# Score the input dataframe with the loaded model
predictions = loaded_model.transform(input_df)

# Drop the helper columns and expose the model output under a descriptive name.
# (Drop and rename are independent: "prediction" is not in cols_toremove.)
output_df = (
    predictions
    .drop(*cols_toremove)
    .withColumnRenamed("prediction", "predictedtripDuration")
)

#### Save predictions to lakehouse delta table

In [None]:
# Name of the lakehouse table that receives the predictions
table_name = "nyctaxi_pred"

# Write the scored dataframe in Delta format under Tables/, replacing any existing data
(
    output_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"Tables/{table_name}")
)
print(f"Output Predictions saved to delta table: {table_name}")

#### Preview predicted dataframe

In [None]:
%%sql
-- Preview up to 20 rows from the nyctaxi_pred table
SELECT * FROM nyctaxi_pred LIMIT 20