## Using AI for Anomalies Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [2]:
pip install scikit-learn great-expectations pandas numpy

Note: you may need to restart the kernel to use updated packages.


SyntaxError: invalid syntax (614722773.py, line 1)

In [4]:
# Write your code from here
import numpy as np
from sklearn.ensemble import IsolationForest
import great_expectations as gx

# 1. Use an Anomaly Detection Algorithm: Isolation Forest
data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

# Handle missing values by replacing None with NaN for Isolation Forest
processed_data = np.array([[row[0], row[1] if row[1] is not None else np.nan] for row in data])

# Identify rows with NaN values for Great Expectations context later
rows_with_missing = [i for i, row in enumerate(data) if None in row]

# Remove rows with NaN for Isolation Forest training
valid_data = processed_data[~np.isnan(processed_data).any(axis=1)]

# Initialize and train the Isolation Forest model
model = IsolationForest(contamination='auto', random_state=42)
model.fit(valid_data)

# Predict anomalies on the original data (excluding NaN rows for prediction)
predictions = model.predict(valid_data)

# Anomaly scores (lower score means more anomalous)
anomaly_scores = model.decision_function(valid_data)

# Identify anomalies (1 for inlier, -1 for outlier)
anomalous_indices = np.where(predictions == -1)[0]

print("Anomaly Scores:", anomaly_scores)
print("Anomalous Indices (based on Isolation Forest):", anomalous_indices)

# 2. Integrate with Great Expectations: Generate alerts if anomalies are detected

# Create a Great Expectations Data Context (replace with your actual context setup)
context = gx.DataContext()

# Create a Pandas DataFrame from the original data
import pandas as pd
df = pd.DataFrame(data, columns=['feature1', 'feature2'])

# Create a Great Expectations Datasource and DataConnector if you haven't already
# For simplicity, we'll directly create a Batch from the DataFrame
batch = context.get_batch_from_df(df=df, batch_kwargs={"datasource": "in_memory"})

# Create an Expectation Suite
expectation_suite_name = "anomaly_detection_suite"
try:
    suite = context.get_expectation_suite(expectation_suite_name)
    print(f"Loaded existing Expectation Suite: {expectation_suite_name}")
except gx.exceptions.ExpectationSuiteNotFoundError:
    suite = context.create_expectation_suite(expectation_suite_name)
    print(f"Created a new Expectation Suite: {expectation_suite_name}")

# Add an expectation to check for anomalies
anomalous_row_indices_original_data = [i for idx in anomalous_indices for i, row in enumerate(data) if list(valid_data[idx]) == [r for r in row if r is not None]]

if anomalous_row_indices_original_data or rows_with_missing:
    failed_expectation = {
        "expectation_type": "custom_anomaly_check",
        "kwargs": {
            "anomalous_rows": anomalous_row_indices_original_data,
            "missing_value_rows": rows_with_missing
        },
        "meta": {
            "notes": {
                "format": "markdown",
                "content": f"""
### Data Quality Alert - Potential Anomalies Detected!

The following rows are flagged as potential anomalies by the Isolation Forest model:
- **Indices:** {anomalous_row_indices_original_data if anomalous_row_indices_original_data else 'None'}
- **Data:** {[list(data[i]) for i in anomalous_row_indices_original_data] if anomalous_row_indices_original_data else 'None'}

The following rows contain missing values:
- **Indices:** {rows_with_missing if rows_with_missing else 'None'}
- **Data:** {[list(data[i]) for i in rows_with_missing] if rows_with_missing else 'None'}

Consider investigating these data points for potential quality issues.
"""
            }
        }
    }
    suite.add_expectation(failed_expectation)
else:
    success_expectation = {
        "expectation_type": "custom_anomaly_check",
        "kwargs": {},
        "meta": {
            "notes": {
                "format": "markdown",
                "content": "### Data Quality Check Passed - No Anomalies Detected by Isolation Forest."
            }
        }
    }
    suite.add_expectation(success_expectation)

# Save the Expectation Suite
context.save_expectation_suite(suite)

# Create a Checkpoint to run the validation
checkpoint_config = {
    "name": "anomaly_detection_checkpoint",
    "config_version": 1.0,
    "class_name": "SimpleCheckpoint",
    "run_name_template": "%Y%m%d-%H%M%S-anomaly-check",
    "expectation_suite_name": expectation_suite_name,
    "batch_request": {
        "datasource_name": "in_memory",
        "data_connector_name": "default_pandas_in_memory_data_connector",
        "data_asset_name": "from_df",
    },
    "action_list": [
        {
            "name": "store_validation_result",
            "action": {
                "class_name": "StoreValidationResultAction",
            },
        },
        {
            "name": "store_checkpoint_result",
            "action": {
                "class_name": "StoreCheckpointResultAction",
            },
        },
        {
            "name": "update_data_docs",
            "action": {
                "class_name": "UpdateDataDocsAction",
            },
        },
        # Add notification actions (e.g., send email, Slack message) here if needed
        # {
        #     "name": "send_slack_notification_on_validation_result",
        #     "action": {
        #         "class_name": "SlackNotificationAction",
        #         "slack_webhook": "YOUR_SLACK_WEBHOOK_URL",
        #         "notify_on": "failure",
        #         "message": "Great Expectations - Anomaly Detection Alert!",
        #         "verbosity_level": "info",
        #     },
        # },
    ],
}

checkpoint = context.create_checkpoint(**checkpoint_config)

# Run the Checkpoint
checkpoint_result = checkpoint.run()

# Print the validation result summary
print("\nGreat Expectations Validation Result:")
print(checkpoint_result.success)

if not checkpoint_result.success:
    print("Data quality anomalies or missing values detected!")
    # You can trigger further actions here, like sending alerts.
else:
    print("No data quality anomalies detected by Isolation Forest.")

Anomaly Scores: [ 0.02166693  0.12637841  0.10182917 -0.03920498]
Anomalous Indices (based on Isolation Forest): [3]


AttributeError: module 'great_expectations' has no attribute 'DataContext'