## Implementing ML Model Monitoring Pipelines

### Model Performance Drift:
**Description**: Setup a monitoring pipeline to track key performance metrics (e.g., accuracy, precision) of an ML model over time using a monitoring tool or dashboard.

In [1]:
# write your code from here

### Feature Distribution Drift:
**Description**: Monitor the distribution of your input features in deployed models to detect any significant shifts from training data distributions.

In [2]:
# write your code from here

### Anomaly Detection in Predictions:
**DEscription**: Implement an anomaly detection mechanism to flag unusual model
predictions. Simulate anomalies by altering input data.

In [3]:
import great_expectations as gx
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score
import numpy as np
from scipy.stats import ks_2samp
from sklearn.ensemble import IsolationForest

# Initialize Great Expectations Data Context
context = gx.DataContext()

# --- Model Performance Drift ---
print("\n--- Model Performance Drift ---")
# Simulate historical model performance data (replace with your actual monitoring data)
historical_performance = [
    {"date": "2025-05-01", "accuracy": 0.85, "precision": 0.78},
    {"date": "2025-05-08", "accuracy": 0.86, "precision": 0.79},
    {"date": "2025-05-15", "accuracy": 0.82, "precision": 0.75},
]
performance_df = pd.DataFrame(historical_performance)
performance_df['date'] = pd.to_datetime(performance_df['date'])

# Add Pandas DataFrame Data Source and Data Asset
datasource_name = "model_performance_source"
datasource = context.sources.add_pandas(name=datasource_name)
data_asset_name = "model_performance"
data_asset = datasource.add_dataframe_asset(name=data_asset_name)
batch_request = data_asset.build_batch_request(dataframe=performance_df)

# Get Validator
validator_performance = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="model_performance_drift_suite",
)

print(f"Using Expectation Suite: {validator_performance.expectation_suite.name}")

# Define expectations for performance metrics (example: accuracy should not drop significantly)
validator_performance.expect_column_mean_to_be_between(
    column="accuracy",
    min_value=0.80,
    max_value=1.0,
    mostly=0.9,
)
validator_performance.expect_column_mean_to_be_between(
    column="precision",
    min_value=0.75,
    max_value=1.0,
    mostly=0.9,
)

validator_performance.save_expectation_suite()

checkpoint_performance = context.run_checkpoint(
    checkpoint_name="model_performance_drift_checkpoint",
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": "model_performance_drift_suite",
        }
    ],
)
print("Model Performance Drift Validation Results:")
print(checkpoint_performance.list_validation_results())

# --- Feature Distribution Drift ---
print("\n--- Feature Distribution Drift ---")
# Simulate training and deployed data (replace with your actual data)
train_data = pd.DataFrame({'feature_a': np.random.normal(0, 1, 1000)})
deployed_data = pd.DataFrame({'feature_a': np.random.normal(0.5, 1.2, 1000)})

# Add Data Sources and Assets
datasource_name = "feature_drift_source"
datasource = context.sources.add_pandas(name=datasource_name)

train_asset = datasource.add_dataframe_asset(name="train_data")
train_batch_request = train_asset.build_batch_request(dataframe=train_data)

deployed_asset = datasource.add_dataframe_asset(name="deployed_data")
deployed_batch_request = deployed_asset.build_batch_request(dataframe=deployed_data)

# Get Validators
validator_train = context.get_validator(
    batch_request=train_batch_request,
    expectation_suite_name="feature_drift_train_suite",
)
validator_deployed = context.get_validator(
    batch_request=deployed_batch_request,
    expectation_suite_name="feature_drift_deployed_suite",
)

print(f"Using Expectation Suite (Train): {validator_train.expectation_suite.name}")
print(f"Using Expectation Suite (Deployed): {validator_deployed.expectation_suite.name}")

# Define expectation to compare distributions using Kolmogorov-Smirnov test (custom expectation)
def expect_kolmogorov_smirnov_p_value_greater_than(self, column, other_batch_request, threshold=0.05):
    batch = self.get_batch(batch_request=self._active_batch_request)
    other_batch = self.get_batch(batch_request=other_batch_request)
    data1 = batch.data[column].dropna()
    data2 = other_batch.data[column].dropna()
    if len(data1) < 2 or len(data2) < 2:
        return self.expectation_failed(
            details={"reason": "Insufficient data for KS test"}
        )
    ks_statistic, p_value = ks_2samp(data1, data2)
    success = p_value > threshold
    return self.expectation_met(
        success=success,
        result={"statistic": ks_statistic, "pvalue": p_value, "threshold": threshold},
    )

gx.validator.ExpectationSuite.expect_kolmogorov_smirnov_p_value_greater_than = expect_kolmogorov_smirnov_p_value_greater_than

validator_deployed.expect_kolmogorov_smirnov_p_value_greater_than(
    column="feature_a",
    other_batch_request=train_batch_request,
    threshold=0.05,
)

validator_deployed.save_expectation_suite()

checkpoint_feature_drift = context.run_checkpoint(
    checkpoint_name="feature_drift_checkpoint",
    validations=[
        {
            "batch_request": deployed_batch_request,
            "expectation_suite_name": "feature_drift_deployed_suite",
        }
    ],
)
print("Feature Distribution Drift Validation Results:")
print(checkpoint_feature_drift.list_validation_results())

# --- Anomaly Detection in Predictions ---
print("\n--- Anomaly Detection in Predictions ---")
# Simulate model predictions (replace with your actual predictions)
predictions = pd.DataFrame({'prediction': np.concatenate([np.random.normal(5, 1, 950), np.random.normal(10, 2, 50)])})

# Add Data Source and Data Asset
datasource_name = "prediction_anomaly_source"
datasource = context.sources.add_pandas(name=datasource_name)
data_asset_name = "model_predictions"
data_asset = datasource.add_dataframe_asset(name=data_asset_name)
batch_request = data_asset.build_batch_request(dataframe=predictions)

# Get Validator
validator_anomaly = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="prediction_anomaly_suite",
)

print(f"Using Expectation Suite: {validator_anomaly.expectation_suite.name}")

# Use Isolation Forest to detect anomalies (outside of Great Expectations for detection)
model = IsolationForest(contamination=0.05)
outliers = model.fit_predict(predictions[['prediction']])
anomalous_predictions = predictions[outliers == -1]
non_anomalous_predictions = predictions[outliers != -1]

print(f"Number of anomalous predictions: {len(anomalous_predictions)}")
print(f"Example anomalous predictions:\n{anomalous_predictions.head()}")

# Define expectations on the range of predictions (to catch gross anomalies)
validator_anomaly.expect_column_min_to_be_greater_than_or_equal_to(column="prediction", min_value=0)
validator_anomaly.expect_column_max_to_be_less_than_or_equal_to(column="prediction", max_value=15)
validator_anomaly.expect_column_values_to_be_in_type_list(column="prediction", type_list=["float", "int"])

# You could also check the percentage of predictions falling outside a typical range
lower_bound = non_anomalous_predictions['prediction'].quantile(0.01)
upper_bound = non_anomalous_predictions['prediction'].quantile(0.99)
validator_anomaly.expect_column_values_to_be_between(
    column="prediction",
    min_value=lower_bound,
    max_value=upper_bound,
    mostly=0.99,
    meta={"notes": "Checking if 99% of predictions fall within the 1st and 99th percentile of non-anomalous data"},
)

validator_anomaly.save_expectation_suite()

checkpoint_anomaly = context.run_checkpoint(
    checkpoint_name="prediction_anomaly_checkpoint",
    validations=[
        {
            "batch_request": batch_request,
            "expectation_suite_name": "prediction_anomaly_suite",
        }
    ],
)
print("Anomaly Detection in Predictions Validation Results:")
print(checkpoint_anomaly.list_validation_results())

print("\nTo view detailed validation reports, run 'great_expectations docs build' in your project directory and open the generated index.html.")

AttributeError: module 'great_expectations' has no attribute 'DataContext'