In [1]:
# %pip install mlflow

In [2]:
# %pip install boto3

In [3]:
# %pip install "dvc[s3]"

In [4]:
# %pip install dvc-s3

In [5]:
# %pip install catboost

In [6]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from dotenv import load_dotenv
from yaml import load, Loader
import yaml
from pathlib import Path
import dvc.api
import mlflow
import json
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

from catboost import CatBoostRegressor

# import shap
# shap.initjs()

# import matplotlib.pyplot as plt


# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not set. Failing here, before any
            data is loaded or a model is trained, avoids discovering a
            missing output path only when the results are being written.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set "
            "(define it in the environment or in the .env file)"
        )
    return value


# File that receives the MLflow run id once the run has finished logging.
SAVED_PARAMS = _require_env('MODEL_1_SAVED_PARAMS')

# Directory holding the transformed train/test parquet files.
TRANSFORM_DATA_PATH = _require_env('MODEL_1_TRANSFORM_DATA_PATH')

# File names of the individual splits, relative to TRANSFORM_DATA_PATH.
X_TRAIN_FILE = _require_env('MODEL_1_TRANSFORM_DATA_FILE_TRAIN_X')
Y_TRAIN_FILE = _require_env('MODEL_1_TRANSFORM_DATA_FILE_TRAIN_Y')
X_TEST_FILE = _require_env('MODEL_1_TRANSFORM_DATA_FILE_TEST_X')
Y_TEST_FILE = _require_env('MODEL_1_TRANSFORM_DATA_FILE_TEST_Y')

# Left optional on purpose: the value is handed to mlflow.set_tracking_uri as-is.
# NOTE(review): when unset this is None -- confirm that falling back to
# MLflow's default tracking location is acceptable for this pipeline.
MLFLOW_ENDPOINT_URL = os.environ.get("MLFLOW_ENDPOINT_URL")

# YAML file with the "train_config" section, and the JSON file for the metrics.
MODEL_1_TRAIN_CONFIG_PATH = _require_env("MODEL_1_TRAIN_CONFIG_PATH")
MODEL_1_MODEL_METRICS = _require_env("MODEL_1_MODEL_METRICS")

# All four splits live in the same directory; resolve it once.
_data_dir = Path(TRANSFORM_DATA_PATH)


def _load_frame(location):
    """Read the parquet file at `location` and convert it to a pandas DataFrame."""
    return pq.read_table(location).to_pandas()


# Training split: features and target.
X_TRAIN_DATA = str(_data_dir / X_TRAIN_FILE)
X_train = _load_frame(X_TRAIN_DATA)

Y_TRAIN_DATA = str(_data_dir / Y_TRAIN_FILE)
y_train = _load_frame(Y_TRAIN_DATA)

# Test split: features and target.
X_TEST_DATA = str(_data_dir / X_TEST_FILE)
X_test = _load_frame(X_TEST_DATA)

Y_TEST_DATA = str(_data_dir / Y_TEST_FILE)
y_test = _load_frame(Y_TEST_DATA)

# Train a CatBoost regressor inside a single MLflow run and record everything
# needed to trace the result: hyperparameters, metrics, the model artifact,
# the DVC URLs of the datasets, and the run id for downstream steps.
mlflow.set_tracking_uri(MLFLOW_ENDPOINT_URL)

with mlflow.start_run() as run:

    run_id = run.info.run_id

    print(f"Active run_id: {run_id}")

    # NOTE(review): the full PyYAML `Loader` can construct arbitrary Python
    # objects. That is fine for a trusted, repo-local config; switch to
    # yaml.safe_load if this file can ever come from an untrusted source.
    with open(MODEL_1_TRAIN_CONFIG_PATH, "r") as conf:
        train_config = load(conf, Loader=Loader)["train_config"]

    # Only these four keys are passed to the model; a missing key raises KeyError.
    hyperparams = {
        name: train_config[name]
        for name in ("iterations", "learning_rate", "depth", "verbose")
    }

    # Record the hyperparameters on the run (before fitting, so they are kept
    # even if training fails) -- without them the run cannot be reproduced
    # from MLflow alone.
    mlflow.log_params(hyperparams)

    model = CatBoostRegressor(**hyperparams)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)

    # Also persist the metrics to a local JSON file.
    with open(MODEL_1_MODEL_METRICS, 'w') as pfile:
        json.dump({'mae': mae, 'mse': mse}, pfile)

    mlflow.catboost.log_model(model, "model")

    # Log, for each split, the URL that DVC resolves for its file so the run
    # points at the data it was trained and evaluated on.
    for param_name, file_name in (
        ("X_train", X_TRAIN_FILE),
        ("y_train", Y_TRAIN_FILE),
        ("X_test", X_TEST_FILE),
        ("y_test", Y_TEST_FILE),
    ):
        dataset_path = str(Path(TRANSFORM_DATA_PATH) / file_name)
        mlflow.log_param(param_name, str(dvc.api.get_url(path=dataset_path)))

    # Write the run id to a JSON file so it can be read back outside this cell.
    with open(SAVED_PARAMS, 'w') as pfile:
        json.dump({'run_id': run_id}, pfile)



Active run_id: 6bea5c1070414efb8c89c558bf9ea6e0






2024/08/22 17:24:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run abundant-grouse-743 at: http://127.0.0.1:5000/#/experiments/0/runs/6bea5c1070414efb8c89c558bf9ea6e0.


2024/08/22 17:24:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


In [7]:
# explainer = shap.TreeExplainer(model)
# shap_values = explainer(X_train)
# plt.show()
# shap.plots.beeswarm(shap_values)
# plt.close()
# plt.savefig('shap_values.png')

# mlflow.log_artifact('shap_values.png')