### Implementing and Logging an ML Experiment with MLflow
**Description**: Train an ML model for an e-commerce recommendation engine, using MLflow to track models and experiments.

**Steps**:
1. MLflow Integration Setup
2. Training the Model
3. Logging the Experiment
4. Accessing the MLflow UI

In [None]:
# write your code from here

In [2]:
def validate_data(df):
    """Validate the columns and value ranges of *df*.

    The columns ``price``, ``user_rating``, ``time_on_site`` and ``clicked``
    are examined one at a time, in that order.  For each column the checks
    are: present in ``df.columns``, free of nulls, free of negative values.
    After all columns pass, ``clicked`` must contain only 0 or 1.

    Returns:
        ``True`` when every check passes.

    Raises:
        ValueError: On the first check that fails; the message names the
            offending column.
    """
    for name in ('price', 'user_rating', 'time_on_site', 'clicked'):
        if name not in df.columns:
            raise ValueError(f"Missing required column: {name}")

        series = df[name]
        if series.isna().any():
            raise ValueError(f"Column '{name}' contains null values.")
        if series.lt(0).any():
            raise ValueError(f"Column '{name}' contains negative values.")

    # The target is only checked for membership once it is known to be
    # present, non-null and non-negative.
    click_flags = df['clicked']
    binary_mask = click_flags.isin([0, 1])
    if binary_mask.all():
        return True
    raise ValueError("'clicked' column must contain only 0 or 1.")


def run_experiment(*, n_estimators=100, max_depth=4, learning_rate=0.1):
    """Train a gradient-boosting classifier and record the run in MLflow.

    Obtains a dataset from ``generate_sample_data()``, checks it with
    ``validate_data()``, splits it with ``prepare_data()``, fits a
    ``GradientBoostingClassifier`` and logs the hyperparameters, the
    accuracy and ROC-AUC measured on the ``X_test``/``y_test`` split, and the
    fitted model to a single MLflow run named ``ecommerce_recommendation``.

    Args:
        n_estimators: Passed to ``GradientBoostingClassifier``.
        max_depth: Passed to ``GradientBoostingClassifier``.
        learning_rate: Passed to ``GradientBoostingClassifier``.

    Returns:
        None. Any exception raised along the way is logged (with its
        traceback) and swallowed, so the caller is never interrupted.
    """
    # One dict feeds both the estimator and MLflow, so the logged
    # parameters cannot drift from the ones actually used for training.
    hyperparams = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "learning_rate": learning_rate,
    }

    try:
        data = generate_sample_data()
        validate_data(data)
        X_train, X_test, y_train, y_test = prepare_data(data)

        with mlflow.start_run(run_name="ecommerce_recommendation"):
            # Logged before training so a run that fails mid-fit still
            # records the configuration it was started with.
            mlflow.log_params(hyperparams)

            model = GradientBoostingClassifier(**hyperparams)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            # Column 1 is the probability of the second entry in
            # model.classes_ (presumably the "clicked == 1" class).
            y_proba = model.predict_proba(X_test)[:, 1]

            acc = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_proba)

            mlflow.log_metrics({"accuracy": acc, "roc_auc": auc})
            mlflow.sklearn.log_model(model, "ecommerce_model")
            logging.info("✅ Accuracy: %.3f, AUC: %.3f", acc, auc)

    except Exception as e:
        # Best-effort by design: report the failure instead of raising, but
        # keep the traceback so the cause can be diagnosed from the log.
        logging.exception("❌ Experiment failed: %s", e)


In [None]:
import unittest

class TestMLPipeline(unittest.TestCase):
    """Smoke tests for the sample-data, validation and split helpers."""

    def test_generate_sample_data(self):
        """Generated data has no nulls, non-negative features, 0/1 target."""
        df = generate_sample_data()
        self.assertFalse(df.isnull().any().any())
        self.assertTrue((df[['price', 'user_rating', 'time_on_site']] >= 0).all().all())
        self.assertTrue(set(df['clicked'].unique()).issubset({0, 1}))

    def test_validate_data(self):
        """validate_data accepts good data and rejects specific defects."""
        df = generate_sample_data()
        self.assertTrue(validate_data(df))

        # Each broken frame is built *outside* the assertRaises block so that
        # only validate_data itself can satisfy the expectation; the regex
        # pins which validation branch fired.
        df_missing = df.drop(columns=['price'])
        with self.assertRaisesRegex(ValueError, "Missing required column: price"):
            validate_data(df_missing)

        df_invalid = df.copy()
        df_invalid.loc[0, 'clicked'] = 2
        with self.assertRaisesRegex(ValueError, "must contain only 0 or 1"):
            validate_data(df_invalid)

    def test_prepare_data(self):
        """The train and test feature sets together cover every input row."""
        df = generate_sample_data()
        X_train, X_test, y_train, y_test = prepare_data(df)
        self.assertEqual(len(X_train) + len(X_test), len(df))

# Notebook-friendly test entry point:
#   * argv is supplied explicitly so unittest does not parse the host
#     process's own command line; its first element stands in for the
#     program name and is otherwise ignored.
#   * exit=False stops unittest.main from calling sys.exit() after the run.
if __name__ == "__main__":
    unittest.main(exit=False, argv=['first-arg-is-ignored'])


...
----------------------------------------------------------------------
Ran 3 tests in 0.063s

OK
