### MLflow introduction.

#### Step 0. Setup

In [0]:
# Upgrade scikit-learn to the newest available release (-U) in this notebook's environment.
# NOTE(review): the version is not pinned, so a later re-run may pull a different release — consider pinning.
%pip install -U scikit-learn

In [0]:
# Restart the Python process so the scikit-learn upgrade installed above is the version that gets imported.
%restart_python

#### Step 1. Importing the required libraries and defining a few constants.

In [0]:
from databricks.feature_store import FeatureStoreClient
from databricks.feature_store import FeatureLookup
import typing

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import mlflow
import pandas as pd


In [0]:
# --- Workspace identity ---
# Email used to authenticate in the Databricks workspace; it determines the
# user folder under which the experiment is listed.
USER_EMAIL = "samuel.hmariam@shewit.co.uk"

# --- MLflow experiment ---
# Short experiment name under which every training run is tracked ...
EXPERIMENT_NAME = "Bank_Customer_Churn_Analysis"
# ... expanded into the full workspace path where the MLflow experiment is listed.
EXPERIMENT_NAME = f"/Users/{USER_EMAIL}/{EXPERIMENT_NAME}"

# --- Model naming ---
# Artifact name of the logged model inside a run.
MODEL_NAME = "random_forest_classifier"
# Name of the entry in the model registry.
MODEL_REGISTRY_NAME = "Bank_Customer_Churn"

# --- Data source ---
# All features are already baked into a Delta table, so it is read directly.
FEATURE_TABLE = "bank_churn_analysis.bank_customer_features"

#### Step 2. Build a simplistic model that uses the feature store table as its source for training and validation.

In [0]:
# Select the experiment that this run is tracked under (MLflow creates it if it does not exist yet).
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run():
  TEST_SIZE = 0.20
  # One seed shared by the split and the forest so a re-run reproduces the logged metrics.
  RANDOM_SEED = 54

  # Read the data directly from the feature table.
  training_df = spark.table(FEATURE_TABLE)

  # Convert to pandas so that the sklearn RandomForestClassifier can be fitted on it.
  train_df = training_df.toPandas()

  # 'Exited' is the label; every other column of the feature table is used as a model input.
  # NOTE(review): if the table carries an identifier column it is trained on as well — confirm.
  X = train_df.drop(['Exited'], axis=1)
  y = train_df['Exited']
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
  )

  # No hyperparameter tuning here; the seed only makes the fitted forest deterministic.
  model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED).fit(X_train, y_train)
  signature = mlflow.models.signature.infer_signature(X_train, model.predict(X_train))

  predictions = model.predict(X_test)
  accuracy = metrics.accuracy_score(y_test, predictions)

  # ROC/AUC needs a continuous score per sample: hard 0/1 predictions collapse the
  # curve to a single operating point. Use the predicted probability of class 1,
  # looking the column up through classes_ instead of assuming its position.
  positive_index = list(model.classes_).index(1)
  positive_scores = model.predict_proba(X_test)[:, positive_index]
  fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores, pos_label=1)
  auc = metrics.auc(fpr, tpr)

  # Feature importances keyed by column name; cast to built-in float so the
  # JSON artifact holds no NumPy scalar types.
  importances = {
    name: float(score)
    for name, score in zip(model.feature_names_in_, model.feature_importances_)
  }
  # log artifact
  mlflow.log_dict(importances, "feature_importances.json")
  # log metrics
  mlflow.log_metric("auc", auc)
  mlflow.log_metric("accuracy", accuracy)
  # log parameters
  mlflow.log_param("split_size", TEST_SIZE)
  mlflow.log_params(model.get_params())
  # set tag
  mlflow.set_tag(MODEL_NAME, "mlflow demo")
  # log the model itself in the MLflow tracking server, with its signature and a small input example
  mlflow.sklearn.log_model(model, MODEL_NAME, signature=signature, input_example=X_train.iloc[:4, :])

In [0]:
from mlflow.tracking import MlflowClient
# initialize the mlflow client
client = MlflowClient()

# Resolve the experiment id; get_experiment_by_name returns None for an unknown name,
# so fail with a clear message instead of an AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
  raise ValueError(f"MLflow experiment not found: {EXPERIMENT_NAME}")
experiment_id = experiment.experiment_id

# Take the most recent run that actually finished: a failed or still-running run
# may have no logged model artifact, and registering it would fail.
finished_runs = mlflow.search_runs(
  [experiment_id],
  filter_string="attributes.status = 'FINISHED'",
  order_by=["start_time DESC"],
  max_results=1,
)
if finished_runs.empty:
  raise ValueError(f"No finished runs found in experiment: {EXPERIMENT_NAME}")
# The run id gives direct access to that run's metrics, attributes and artifacts.
run_id = finished_runs["run_id"].values[0]

# Register the model logged by that run in the model registry.
new_model_version = mlflow.register_model(f"runs:/{run_id}/{MODEL_NAME}", MODEL_REGISTRY_NAME)