## Import Libraries and Read Data

In [None]:
import pandas as pd
import numpy as np

## Reading Data From KQL DB

In [None]:
!pip install Kqlmagic --no-cache-dir  --upgrade

In [None]:
# Connection settings for the KQL source read in the next cell.
# Query URI of the Kusto cluster; also passed to getToken() to obtain the access token.
kusto_uri = "https://trd-qtekrqc7c4upzq11vm.z9.kusto.fabric.microsoft.com"
# Database that holds the table.
kql_database = "KQL_Demo_AIML_DB"
# Table the sample query selects from.
kql_table = "KQL_pred"

In [None]:
# KQL statement: the first 100 rows of the configured table.
kustoQuery = f"{kql_table} | take 100"

# Run the query through the Kusto Spark connector, authenticating with a token
# requested for the cluster URI, and load the result as a Spark DataFrame.
kusto_df = (
    spark.read
    .format("com.microsoft.kusto.spark.synapse.datasource")
    .option("accessToken", mssparkutils.credentials.getToken(kusto_uri))
    .option("kustoCluster", kusto_uri)
    .option("kustoDatabase", kql_database)
    .option("kustoQuery", kustoQuery)
    .load()
)

In [None]:
# Render the rows returned by the KQL sample query.
display(kusto_df)

## Reading Data from Lakehouse Table

In [None]:
# Load the Delta table stored at Tables/predictive_maintenance_processed into a Spark DataFrame.
input_df = spark.read.format("delta").load("Tables/predictive_maintenance_processed")

In [None]:
# Report how many rows were loaded, then print the column names and types.
record_count = input_df.count()
print(f"records read: {record_count}")
print("Schema: ")
input_df.printSchema()

In [None]:
# Name of the label column; every other column is treated as a feature further down.
TARGET_COL = 'failure'

In [None]:
import pyspark.sql.functions as F

# Column order with the label moved to the end: take the current column list,
# remove the label (raises ValueError if it is absent) and append it again.
df_columns = input_df.columns
df_columns.remove(TARGET_COL)
ordered_columns = df_columns + [TARGET_COL]

# Apply that order and cast the label to float.
df = (
    input_df
    .select(ordered_columns)
    .withColumn(TARGET_COL, F.col(TARGET_COL).cast("float"))
)

In [None]:
# Render the reordered DataFrame (label column last, cast to float).
display(df)

## Preparing Training and Testing Data

In [None]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split

# Collect the Spark DataFrame to the driver as pandas and cast every column to float.
df_pd = df.toPandas().astype(float)

# 80/20 split.
# - random_state: fixed seed (same value the logistic regression cell uses) so the
#   partition, and every metric computed from it, is identical on each re-run.
# - stratify: keep the label distribution the same in both partitions, so the test
#   split always contains a representative share of the rarer class.
train, test = train_test_split(
    df_pd,
    test_size = 0.20,
    random_state = 42,
    stratify = df_pd[TARGET_COL],
)

# Every column except the label is a model input.
feature_cols = [c for c in df_pd.columns.tolist() if c not in [TARGET_COL]]

df_pd.dtypes

## Track Machine Learning experiments and models

In [None]:
import mlflow

# Set given experiment as the active experiment. If an experiment with this name does not exist, a new experiment with this name is created.
EXPERIMENT_NAME = "ML_Predictive_Maintenance_Experiment"
mlflow.set_experiment(EXPERIMENT_NAME)
# Turn autologging off: the training cells below log models, metrics and params explicitly.
mlflow.autolog(disable=True)

In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# Fixed seed so the synthetic samples (and the models fitted on them) are reproducible.
smt = SMOTE(random_state = 42)
X_train_res, y_train_res = smt.fit_resample(train[feature_cols], train[TARGET_COL])

In [None]:
# Count of each label value in the resampled training labels.
y_train_res.value_counts()

## Random Forest Classifier

In [None]:
import mlflow.sklearn
import numpy as np
from mlflow.models.signature import infer_signature
from sklearn.metrics import f1_score, accuracy_score, recall_score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the resampled training data and track it as one MLflow run.
with mlflow.start_run() as run:
    rfc_id = run.info.run_id
    print(f"run_id {rfc_id}, status: {run.info.status}")

    # Fixed seed so the fitted forest, and therefore the logged metrics, are reproducible.
    rfc = RandomForestClassifier(max_depth = 5, n_estimators = 50, random_state = 42)
    rfc.fit(X_train_res, y_train_res)
    # Model signature inferred from the training features (inputs) and labels (outputs).
    signature = infer_signature(X_train_res, y_train_res)

    # Log the fitted model under the run and register it under the same name.
    mlflow.sklearn.log_model(
        rfc,
        "predictive_maintenance_rfc",
        signature = signature,
        registered_model_name = "predictive_maintenance_rfc"
    )

    # Score on the original (non-resampled) training split.
    y_pred_train = rfc.predict(train[feature_cols])
    # Calculate the classification metrics for train data
    f1_train = f1_score(train[TARGET_COL], y_pred_train, average = 'weighted')
    accuracy_train = accuracy_score(train[TARGET_COL], y_pred_train)
    recall_train = recall_score(train[TARGET_COL], y_pred_train, average='weighted')

    # Log the train metrics to MLflow
    mlflow.log_metric("f1_score_train", f1_train)
    mlflow.log_metric("accuracy_train", accuracy_train)
    mlflow.log_metric("recall_train", recall_train)

    # Print the train metrics
    print("F1 score_train:", f1_train)
    print("Accuracy_train:", accuracy_train)
    print("Recall_train:", recall_train)

    # Score on the held-out test split.
    y_pred_test = rfc.predict(test[feature_cols])
    # Calculate the classification metrics for test data
    f1_test = f1_score(test[TARGET_COL], y_pred_test, average='weighted')
    accuracy_test = accuracy_score(test[TARGET_COL], y_pred_test)
    recall_test = recall_score(test[TARGET_COL], y_pred_test, average='weighted')

    # Log the test metrics to MLflow
    mlflow.log_metric("f1_score_test", f1_test)
    mlflow.log_metric("accuracy_test", accuracy_test)
    mlflow.log_metric("recall_test", recall_test)

    # Print the test metrics
    print("F1 score_test:", f1_test)
    print("Accuracy_test:", accuracy_test)
    print("Recall_test:", recall_test)

    # Record the artifact name as a run param; the model-loading cell reads it back
    # from `params.model_name` to build the `runs:/<run_id>/<name>` URI.
    mlflow.log_param("model_name", "predictive_maintenance_rfc")
    print("All done")

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression on the resampled training data and track it as one MLflow run.
with mlflow.start_run() as run:
    lr_id = run.info.run_id
    print(f"run_id {lr_id}, status: {run.info.status}")
    
    lr = LogisticRegression(random_state = 42)
    lr.fit(X_train_res, y_train_res)
    # Model signature inferred from the training features (inputs) and labels (outputs).
    signature = infer_signature(X_train_res, y_train_res)

    # Log the fitted model under the run and register it under the same name.
    mlflow.sklearn.log_model(
        lr,
        "predictive_maintenance_lr",
        signature = signature,
        registered_model_name = "predictive_maintenance_lr"
    ) 

    # Score on the original (non-resampled) training split.
    y_pred_train = lr.predict(train[feature_cols])
    # Calculate the classification metrics for train data
    f1_train = f1_score(train[TARGET_COL], y_pred_train, average = 'weighted')
    accuracy_train = accuracy_score(train[TARGET_COL], y_pred_train)
    recall_train = recall_score(train[TARGET_COL], y_pred_train, average='weighted')

    # Log the train metrics to MLflow
    mlflow.log_metric("f1_score_train", f1_train)
    mlflow.log_metric("accuracy_train", accuracy_train)
    mlflow.log_metric("recall_train", recall_train)

    # Print the train metrics
    print("F1 score_train:", f1_train)
    print("Accuracy_train:", accuracy_train)
    print("Recall_train:", recall_train)  

    # Score on the held-out test split.
    y_pred_test = lr.predict(test[feature_cols])
    # Calculate the classification metrics for test data
    f1_test = f1_score(test[TARGET_COL], y_pred_test, average='weighted')
    accuracy_test = accuracy_score(test[TARGET_COL], y_pred_test)
    recall_test = recall_score(test[TARGET_COL], y_pred_test, average='weighted')

    # Log the test metrics to MLflow
    mlflow.log_metric("f1_score_test", f1_test)
    mlflow.log_metric("accuracy_test", accuracy_test)
    mlflow.log_metric("recall_test", recall_test)

    # Print the test metrics
    print("F1 score_test:", f1_test)
    print("Accuracy_test:", accuracy_test)
    print("Recall_test:", recall_test)

    # Record the artifact name as a run param; the model-loading cell reads it back
    # from `params.model_name` to build the `runs:/<run_id>/<name>` URI.
    mlflow.log_param("model_name", "predictive_maintenance_lr")
    print("All done")

## XGBoost Classifier

In [None]:
import mlflow.xgboost  # import the flavor explicitly, as is done for mlflow.sklearn
from xgboost import XGBClassifier

# Train a gradient-boosted classifier on the resampled training data and track it as one MLflow run.
with mlflow.start_run() as run:
    xgb_id = run.info.run_id
    print(f"run_id {xgb_id}, status: {run.info.status}")

    # Explicit seed, matching the other estimators in this notebook.
    xgb = XGBClassifier(random_state = 42)
    xgb.fit(X_train_res, y_train_res)
    # Model signature inferred from the training features (inputs) and labels (outputs).
    signature = infer_signature(X_train_res, y_train_res)

    # Log the fitted model under the run and register it under the same name.
    mlflow.xgboost.log_model(
        xgb,
        "predictive_maintenance_xgb",
        signature = signature,
        registered_model_name = "predictive_maintenance_xgb"
    )

    # Score on the original (non-resampled) training split.
    y_pred_train = xgb.predict(train[feature_cols])
    # Calculate the classification metrics for train data
    f1_train = f1_score(train[TARGET_COL], y_pred_train, average = 'weighted')
    accuracy_train = accuracy_score(train[TARGET_COL], y_pred_train)
    recall_train = recall_score(train[TARGET_COL], y_pred_train, average='weighted')

    # Log the train metrics to MLflow
    mlflow.log_metric("f1_score_train", f1_train)
    mlflow.log_metric("accuracy_train", accuracy_train)
    mlflow.log_metric("recall_train", recall_train)

    # Print the train metrics
    print("F1 score_train:", f1_train)
    print("Accuracy_train:", accuracy_train)
    print("Recall_train:", recall_train)

    # Score on the held-out test split.
    y_pred_test = xgb.predict(test[feature_cols])
    # Calculate the classification metrics for test data
    f1_test = f1_score(test[TARGET_COL], y_pred_test, average='weighted')
    accuracy_test = accuracy_score(test[TARGET_COL], y_pred_test)
    recall_test = recall_score(test[TARGET_COL], y_pred_test, average='weighted')

    # Log the test metrics to MLflow
    mlflow.log_metric("f1_score_test", f1_test)
    mlflow.log_metric("accuracy_test", accuracy_test)
    mlflow.log_metric("recall_test", recall_test)

    # Print the test metrics
    print("F1 score_test:", f1_test)
    print("Accuracy_test:", accuracy_test)
    print("Recall_test:", recall_test)

    # Record the artifact name as a run param; the model-loading cell reads it back
    # from `params.model_name` to build the `runs:/<run_id>/<name>` URI.
    mlflow.log_param("model_name", "predictive_maintenance_xgb")
    print("All done")

## Best Runs

In [None]:
# Collect the active experiment's runs into a DataFrame.
# 'metrics.score' / 'params.alpha' only exist as columns when some run in the experiment
# logged them; errors="ignore" keeps this cell from raising KeyError when none did.
# NOTE(review): dropna() discards a run if ANY of its columns is missing; confirm that
# is intended rather than requiring only the metric/param columns used below.
ml_runs = (
    mlflow.search_runs()
    .drop(columns = ['metrics.score', 'params.alpha'], errors = 'ignore')
    .dropna()
)
# Keep only runs that completed.
ml_runs = ml_runs[ml_runs["status"] == "FINISHED"]
ml_runs

In [None]:
# Rank the finished runs by held-out (test) accuracy and take the top one. Ranking by
# training accuracy would favour whichever model fits the training split most closely,
# not the one that generalises best.
best_run = ml_runs.sort_values(by = 'metrics.accuracy_test', ascending = False).iloc[0]
best_run['run_id']

In [None]:
# Show every recorded field (metrics, params, tags, ...) of the selected run.
best_run

In [None]:
# Load the model logged by the best run as a generic MLflow pyfunc model.
# The URI is runs:/<run_id>/<artifact path>; the artifact path is read from the run's
# `model_name` param, which each training cell sets to the name it logged the model under.
new_model_version = mlflow.pyfunc.load_model(f"runs:/{best_run['run_id']}/{best_run['params.model_name']}")

In [None]:
# Run the loaded model on the feature columns of the test split.
new_model_version.predict(test[feature_cols])

In [None]:
# Print the metadata object attached to the loaded model.
print(new_model_version.metadata)