In [8]:
# ==========================================================
# ✅ FINAL VERSION – Technical Implementations for Risk Management
# ==========================================================
# Author: Sathyadharini
# Project: DDI Risk Strategy – aggregated_data
# ----------------------------------------------------------

import pandas as pd
import numpy as np

# Load dataset from a CSV file; the path is relative, so it is resolved
# against the current working directory of the running process.
df = pd.read_csv("aggregated_data.csv")  # update if different

# ==========================================================
# TECHNICAL IMPLEMENTATION #1: DATA VALIDATION & SCHEMA CHECK
# ==========================================================

def validate_data(df):
    """
    Summarise basic data-quality indicators for a DataFrame.

    Missing values and duplicate rows are counted over every column of
    ``df``. Infinite values and outliers are counted over the numeric
    columns only, so text fields (e.g. SMILES strings or DB identifiers)
    never enter the numeric checks.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. It is not modified.

    Returns
    -------
    report : dict
        Integer counts under the keys ``"total_columns"``,
        ``"numeric_columns_checked"``, ``"missing_values"``,
        ``"duplicate_rows"``, ``"infinite_values"`` and ``"outlier_count"``
        (numeric cells whose absolute z-score exceeds 3).
    numeric_df : pandas.DataFrame
        The numeric columns of ``df`` exactly as selected; NaNs and
        infinities are left in place.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    report = {
        "total_columns": len(df.columns),
        "numeric_columns_checked": len(numeric_df.columns),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicate_rows": int(df.duplicated().sum()),
        "infinite_values": int(np.isinf(numeric_df).sum().sum())
    }

    # Calculate outliers only for numeric data
    if not numeric_df.empty:
        # A single +/-inf makes the column mean infinite and its std NaN,
        # which turns every z-score of that column into NaN and hides all
        # of its outliers. Infinities are already reported separately, so
        # treat them as missing for the z-score statistics.
        finite_df = numeric_df.replace([np.inf, -np.inf], np.nan)
        # The small epsilon keeps constant columns (std == 0) from dividing
        # by zero.
        z_scores = (finite_df - finite_df.mean()) / (finite_df.std() + 1e-6)
        report["outlier_count"] = int((z_scores.abs() > 3).sum().sum())
    else:
        report["outlier_count"] = 0

    return report, numeric_df


def schema_check(df, expected_features):
    """
    Compare the columns of ``df`` against a list of expected column names.

    Returns a dict with two lists:
    ``"missing_columns"`` holds expected names that ``df`` lacks (in the
    order given by ``expected_features``), and ``"unexpected_columns"``
    holds columns of ``df`` that are not expected (in ``df`` column order).
    """
    actual_columns = df.columns

    absent = []
    for name in expected_features:
        if name not in actual_columns:
            absent.append(name)

    surplus = []
    for name in actual_columns:
        if name not in expected_features:
            surplus.append(name)

    return {"missing_columns": absent, "unexpected_columns": surplus}


# Column names that schema_check treats as the expected schema.
expected_features = [
    'MolWt_1', 'MolWt_2', 'LogP_1', 'LogP_2', 'HBD_1', 'HBD_2',
    'HBA_1', 'HBA_2', 'TPSA_1', 'TPSA_2', 'Fingerprint_Similarity', 'Y'
]

# Run validation on the raw frame, before any cleaning is applied
validation_report, numeric_df = validate_data(df)
schema_report = schema_check(df, expected_features)

# NOTE: the emoji in the messages below were stored as mojibake (UTF-8 bytes
# decoded with a legacy single-byte codec); they are restored here.
print("🔍 DATA VALIDATION REPORT:")
for k, v in validation_report.items():
    print(f"  {k}: {v}")

print("\n📂 SCHEMA CHECK REPORT:")
print(schema_report)

# Clean data (safe numeric fill): NaNs in numeric columns are replaced by
# the column median; non-numeric columns are left untouched. Exact duplicate
# rows are dropped afterwards.
df[numeric_df.columns] = numeric_df.fillna(numeric_df.median())
df = df.drop_duplicates()
print("\n✅ Cleaned dataset shape:", df.shape)


# ==========================================================
# TECHNICAL IMPLEMENTATION #2: DATA DRIFT DETECTION
# ==========================================================

def detect_drift(old_df, new_df, column):
    """
    Score how far a column's mean moved between two datasets.

    The score is ``|mean(old) - mean(new)| / (std(old) + 1e-6)``: the shift
    of the mean expressed in units of the reference (old) standard
    deviation, rounded to three decimals.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Reference data.
    new_df : pandas.DataFrame
        Data to compare against the reference.
    column : str
        Name of the column to compare.

    Returns
    -------
    float
        The drift score, or NaN when the column is absent from either
        frame, is not numeric in either frame, or the score cannot be
        computed (for example, no usable values).
    """
    if column not in old_df.columns or column not in new_df.columns:
        return np.nan

    old_values, new_values = old_df[column], new_df[column]
    for values in (old_values, new_values):
        # Both sides must be numeric, otherwise .mean() raises on the
        # non-numeric one. is_numeric_dtype also understands pandas
        # extension dtypes (e.g. nullable Int64), which np.issubdtype
        # rejects with a TypeError. Booleans stay excluded.
        if (not pd.api.types.is_numeric_dtype(values)
                or pd.api.types.is_bool_dtype(values)):
            return np.nan

    # The epsilon avoids division by zero for constant reference columns.
    drift_value = abs(old_values.mean() - new_values.mean()) / (old_values.std() + 1e-6)
    if pd.isna(drift_value):
        return np.nan
    return round(drift_value, 3)


# Split data (simulate old vs new): the first 70% of rows act as the
# reference set and the remaining 30% as the "new" set.
# NOTE(review): the split is positional, so if the file is ordered by some
# column, that column will show up as drifted purely because of the
# ordering - confirm the row order before trusting the flags.
split = int(len(df) * 0.7)
old_data, new_data = df.iloc[:split], df.iloc[split:]

numeric_cols = df.select_dtypes(include=[np.number]).columns
drift_report = {col: detect_drift(old_data, new_data, col) for col in numeric_cols}

# NOTE: the emoji in the messages below were stored as mojibake (UTF-8 bytes
# decoded with a legacy single-byte codec); they are restored here.
print("\n📊 DRIFT DETECTION REPORT (Mean/Std Ratio):")
for feature, drift in drift_report.items():
    print(f"  {feature}: {drift}")

# Flag potential drifts (NaN scores compare False and are never flagged)
high_drift = [col for col, value in drift_report.items() if value > 0.5]
if high_drift:
    print("\n⚠️ Potential drift detected in:", high_drift)
else:
    print("\n✅ No significant drift detected. Dataset stable.")

# ==========================================================
# Summary
# ==========================================================
print("""✔️ Technical Implementation Summary
------------------------------------
1. Data Validation: Checks numeric consistency, schema, and outliers safely.
2. Drift Detection: Monitors feature distribution changes without errors.
These strengthen Data Collection, Deployment, and Monitoring stages.
""")


🔍 DATA VALIDATION REPORT:
  total_columns: 20
  numeric_columns_checked: 15
  missing_values: 0
  duplicate_rows: 0
  infinite_values: 0
  outlier_count: 40160

📂 SCHEMA CHECK REPORT:
{'missing_columns': [], 'unexpected_columns': ['ID1', 'ID2', 'Map', 'X1', 'X2', 'Map1', 'RotatableBonds_1', 'RotatableBonds_2']}

✅ Cleaned dataset shape: (191808, 20)

📊 DRIFT DETECTION REPORT (Mean/Std Ratio):
  Y: 2.056
  Map1: 0.197
  MolWt_1: 0.243
  MolWt_2: 0.294
  LogP_1: 0.052
  LogP_2: 0.036
  HBD_1: 0.262
  HBD_2: 0.2
  HBA_1: 0.273
  HBA_2: 0.242
  TPSA_1: 0.27
  TPSA_2: 0.276
  RotatableBonds_1: 0.105
  RotatableBonds_2: 0.173
  Fingerprint_Similarity: 0.111

⚠️ Potential drift detected in: ['Y']
✔️ Technical Implementation Summary
------------------------------------
1. Data Validation: Checks numeric consistency, schema, and outliers safely.
2. Drift Detection: Monitors feature distribution changes without errors.
These strengthen Data Collection, Deployment, and Monitori