In [1]:
import os
from pathlib import Path

from hydra import initialize_config_dir, compose

with initialize_config_dir(version_base=None, config_dir=str(Path(os.get_exec_path()[0]).parent.parent / 'config')):

    cfg=compose(overrides= ["+models=model_1"])
    cfg = cfg['models']

    LOAD_DATA_PATH = cfg['load']['LOAD_DATA_PATH']
    LOAD_DATA_FILE = cfg['load']['LOAD_DATA_FILE']

    TRANSFORM_DATA_PATH = cfg['transform']['TRANSFORM_DATA_PATH']

    X_TRAIN_FILE = cfg['transform']['TRANSFORM_DATA_FILE_TRAIN_X']
    Y_TRAIN_FILE =  cfg['transform']['TRANSFORM_DATA_FILE_TRAIN_Y']
    X_TEST_FILE = cfg['transform']['TRANSFORM_DATA_FILE_TEST_X']
    Y_TEST_FILE =  cfg['transform']['TRANSFORM_DATA_FILE_TEST_Y']

    TRAIN_CONFIG_PATH=cfg['train']["TRAIN_CONFIG_PATH"]
    MODEL_METRICS = cfg['train']["MODEL_METRICS"]
    SAVED_PARAMS  =  cfg['train']['SAVED_PARAMS']

    ITERATIONS = cfg['train']['train_config']['iterations']
    LEARNING_RATE = cfg['train']['train_config']['learning_rate']
    DEPTH = cfg['train']['train_config']['depth']
    VERBOSE = cfg['train']['train_config']['verbose']


    
    os.environ['AWS_ACCESS_KEY_ID'] = cfg['s3']['AWS_ACCESS_KEY_ID']
    os.environ['AWS_SECRET_ACCESS_KEY'] = cfg['s3']['AWS_SECRET_ACCESS_KEY']
    os.environ['MLFLOW_S3_ENDPOINT_URL'] = cfg['mlflow']['MLFLOW_S3_ENDPOINT_URL']
    os.environ['MLFLOW_TRACKING_URI'] = cfg['mlflow']['MLFLOW_TRACKING_URI']
    

In [2]:
print('MLFLOW_TRACKING_URI:', os.environ.get('MLFLOW_TRACKING_URI'))
print('AWS_ACCESS_KEY_ID:', os.environ.get('AWS_ACCESS_KEY_ID'))
print('AWS_SECRET_ACCESS_KEY:', os.environ.get('AWS_SECRET_ACCESS_KEY'))
print('MLFLOW_S3_ENDPOINT_URL:', os.environ.get('MLFLOW_S3_ENDPOINT_URL'))

MLFLOW_TRACKING_URI: http://127.0.0.1:5000
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
MLFLOW_S3_ENDPOINT_URL: http://127.0.0.1:9000


In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from dotenv import load_dotenv
from yaml import load, Loader
import yaml
from pathlib import Path
import dvc.api
import mlflow
import json
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

from catboost import CatBoostRegressor


X_TRAIN_DATA = str(Path(TRANSFORM_DATA_PATH) / X_TRAIN_FILE)
X_train = pq.read_table(X_TRAIN_DATA).to_pandas()

Y_TRAIN_DATA = str(Path(TRANSFORM_DATA_PATH) / Y_TRAIN_FILE)
y_train = pq.read_table(Y_TRAIN_DATA).to_pandas()


X_TEST_DATA = str(Path(TRANSFORM_DATA_PATH) / X_TEST_FILE)
X_test = pq.read_table(X_TEST_DATA).to_pandas()

Y_TEST_DATA = str(Path(TRANSFORM_DATA_PATH) / Y_TEST_FILE)
y_test = pq.read_table(Y_TEST_DATA).to_pandas()

with mlflow.start_run():

    run = mlflow.active_run()
    run_id = run.info.run_id

    print(f"Active run_id: {run_id}")
    
    model = CatBoostRegressor(iterations=ITERATIONS,
                               learning_rate=LEARNING_RATE,
                                depth=DEPTH,
                                verbose=VERBOSE)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    mlflow.log_metric("mae", mae)
    mlflow.log_metric("mse", mse)


    with open(MODEL_METRICS, 'w') as pfile:
        json.dump({'mae':mae, 'mse':mse}, pfile)
        
    mlflow.catboost.log_model(model, "model")
    
    mlflow.log_param("X_train", str(dvc.api.get_url(path=str(Path(TRANSFORM_DATA_PATH) / X_TRAIN_FILE))))
    mlflow.log_param("y_train", str(dvc.api.get_url(path=str(Path(TRANSFORM_DATA_PATH) / Y_TRAIN_FILE) )))
    mlflow.log_param("X_test", str(dvc.api.get_url(path=str(Path(TRANSFORM_DATA_PATH) / X_TEST_FILE))))
    mlflow.log_param("y_test", str(dvc.api.get_url(path=str(Path(TRANSFORM_DATA_PATH) / Y_TEST_FILE))))
 
    
    with open(SAVED_PARAMS, 'w') as pfile:
        json.dump({'run_id': run_id}, pfile)



Active run_id: 25865324959240fca3826c43182c6f78






2024/08/27 16:02:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-deer-129 at: http://127.0.0.1:5000/#/experiments/0/runs/25865324959240fca3826c43182c6f78.


2024/08/27 16:02:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


In [4]:
# import shap
# shap.initjs()

# import matplotlib.pyplot as plt
# explainer = shap.TreeExplainer(model)
# shap_values = explainer(X_train)
# plt.show()
# shap.plots.beeswarm(shap_values)
# plt.close()
# plt.savefig('shap_values.png')

# mlflow.log_artifact('shap_values.png')