In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [None]:

# Load dataset
df = pd.read_csv("../data/diabetes_dataset.csv")

# Separate the feature columns from the "target" column the model predicts.
# `columns=` alone is enough; pairing it with `axis=0` was contradictory
# (pandas ignores `axis` when `columns` is given).
X = df.drop(columns=["target"])
y = df["target"]

# Hold out a test set using scikit-learn's default split proportions.
# The fixed random_state makes the split identical on every
# Restart & Run All, so logged metrics are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:

# Hyperparameters live in a single dict so the model is constructed from
# exactly the values that are logged to MLflow (no drift between the two).
rf_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "max_features": 3,
    "random_state": 42,  # fixed seed so the fitted forest is reproducible
}
rf = RandomForestRegressor(**rf_params)

# Local scratch directory for files that get uploaded as run artifacts.
# It is created up front because savefig / to_csv / savetxt do not create
# missing parent directories and would otherwise fail on a fresh checkout.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Start MLflow run
with mlflow.start_run() as run:
    # Log parameters
    mlflow.log_params(rf_params)

    # Train model
    rf.fit(X_train, y_train)

    # Predict and evaluate on the held-out split
    predictions = rf.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)

    # Log model
    mlflow.sklearn.log_model(rf, "model-1")

    # Log additional artifacts: a scatter plot of predicted vs true values
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, predictions)
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.title("True vs Predictions")
    plot_path = artifacts_dir / "plot.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(str(plot_path))

    # Log feature importance, most important feature first.
    # Column names are taken from the frame the model was fitted on.
    importance_df = pd.DataFrame({
        "feature": X_train.columns,
        "importance": rf.feature_importances_,
    }).sort_values(by="importance", ascending=False)
    importance_csv_path = artifacts_dir / "feature_importance.csv"
    importance_df.to_csv(importance_csv_path, index=False)
    mlflow.log_artifact(str(importance_csv_path))

    # Optionally, log the training and test datasets.
    # np.savetxt is kept so the files stay in the same plain numeric CSV
    # format as before; one loop replaces four copy-pasted save/log pairs.
    dataset_splits = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    for split_name, split_data in dataset_splits.items():
        split_path = artifacts_dir / f"{split_name}.csv"
        np.savetxt(split_path, split_data, delimiter=",")
        mlflow.log_artifact(str(split_path))

    # You can also log the run ID for future reference
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
