**Import essential libraries**

In [5]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

**Setup MLflow Tracking URI & Experiment from Root Directory**

In [6]:
if os.getcwd().endswith("notebooks"):
    os.chdir("..") 
# Ensure clean base folder
os.makedirs("mlruns/.trash", exist_ok=True)  # ✅ Avoid 'Invalid parent directory' error

mlflow.set_tracking_uri("file:mlruns")  # ✅ Correct relative URI from root
mlflow.set_experiment("MentalHealthExperiment")  # ✅ Auto-creates if missing

2025/06/14 18:55:09 INFO mlflow.tracking.fluent: Experiment with name 'MentalHealthExperiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:C:/Users/DELL/mental-health-ml/mlruns/169078089252648415', creation_time=1749909308999, experiment_id='169078089252648415', last_update_time=1749909308999, lifecycle_stage='active', name='MentalHealthExperiment', tags={}>

**5.1 Load Cleaned Dataset and Split Features and Target**

In [7]:
df = pd.read_csv('data/cleaned/cleaned_mental_health.csv')
X = df.drop('treatment', axis=1)
y = df['treatment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Disable MLflow logging warnings (specific to MLflow)
import logging
logging.getLogger("mlflow").setLevel(logging.ERROR)

**5.2 Define and Train Models with MLflow Logging**

In [9]:

models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=4, min_samples_leaf=2, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=3000, C=0.05, solver='liblinear'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=8, learning_rate=0.05, subsample=0.8)
}

results = []

# Top-level run
with mlflow.start_run(run_name="All_Model_Comparisons"):
    for name, model in models.items():
        with mlflow.start_run(run_name=name, nested=True):  # ✅ nested run
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            acc = accuracy_score(y_test, preds)

            mlflow.sklearn.log_model(model, name, input_example=X_train[:5])
            mlflow.log_metric("accuracy", acc)
            mlflow.log_params(model.get_params())

            results.append((name, acc, model))
            print(f"{name} Accuracy: {acc:.4f}")

RandomForest Accuracy: 0.7928
LogisticRegression Accuracy: 0.8088
XGBoost Accuracy: 0.7849


**5.3 Automatically Select the Best Model**

In [10]:
# Select and log the best model
best_model_name, best_accuracy, best_model = max(results, key=lambda x: x[1])

with mlflow.start_run(run_name=f"Best_{best_model_name}", nested=True):
    mlflow.sklearn.log_model(best_model, "best_model", input_example=X_train[:5])
    mlflow.log_metric("best_accuracy", best_accuracy)
    print(f"\n✅ Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")


✅ Best Model: LogisticRegression with Accuracy: 0.8088


**5.4 Save Best Model to Disk**

In [11]:
os.makedirs("../models", exist_ok=True)
joblib.dump(best_model, f"../models/{best_model_name}_model.pkl")
print(f"💾 Best model saved to '../models/{best_model_name}_model.pkl'")

💾 Best model saved to '../models/LogisticRegression_model.pkl'
