# 📊 Custom Data Drift Detection: Baseline vs Uploaded Data
This notebook detects drift by comparing:
1. **Baseline**: Training data (`data/processed/daily_demand.csv`)
2. **Current**: User uploaded data (from `data/uploads/`)

## Statistical Tests Used:
- **Numerical Columns**: Kolmogorov-Smirnov (K-S) Test
- **Categorical Columns**: Chi-Square Test
- **Drift Threshold**: p-value < 0.05 indicates drift

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
from scipy import stats
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so warnings raised by pandas/scipy in later cells are not shown.
warnings.filterwarnings('ignore')

# Simple confirmation that the imports cell ran to completion.
print("‚úÖ Imports successful")

## 1️⃣ Load Baseline (Training Data)

In [None]:
BASELINE_PATH = '../data/processed/daily_demand.csv'

# Use the real training data when it is on disk; otherwise synthesize a small
# demo frame so the remaining cells of the notebook can still run.
if not os.path.exists(BASELINE_PATH):
    print(f"‚ùå Baseline file not found: {BASELINE_PATH}")
    # Demo fallback: 1000 products with normally distributed demand and a
    # uniformly sampled three-level category.
    n_mock_rows = 1000
    baseline_df = pd.DataFrame({
        'product_id': [f'P{i}' for i in range(n_mock_rows)],
        'demand_quantity': np.random.normal(loc=100, scale=20, size=n_mock_rows),
        'category': np.random.choice(['A', 'B', 'C'], size=n_mock_rows),
    })
    print("‚ö†Ô∏è Using mock baseline data")
else:
    baseline_df = pd.read_csv(BASELINE_PATH)
    print(f"‚úÖ Loaded Baseline: {len(baseline_df)} rows")
    print(f"Columns: {list(baseline_df.columns)}")

print(f"\nBaseline shape: {baseline_df.shape}")
# Last expression of the cell: the notebook renders the preview.
baseline_df.head()

## 2️⃣ Load Current (Uploaded Data)

In [None]:
UPLOADS_DIR = '../data/uploads'

# Make sure the uploads folder is present before globbing it or writing the
# mock file into it.
os.makedirs(UPLOADS_DIR, exist_ok=True)

# Every CSV currently sitting in the uploads folder.
list_of_files = glob.glob(f'{UPLOADS_DIR}/*.csv')

if not list_of_files:
    print("‚ö†Ô∏è No uploaded files found in 'data/uploads'.")
    print("üé≤ Creating MOCK uploaded file with simulated drift...")

    # Start from a copy of the baseline and distort it column by column.
    mock_df = baseline_df.copy()

    # Numeric drift: scaling by 1.5 widens the spread, adding 50 moves the mean.
    numeric_cols = mock_df.select_dtypes(include=[np.number]).columns
    for num_col in numeric_cols:
        mock_df[num_col] = mock_df[num_col] * 1.5 + 50

    # Categorical drift: resample each column so that the first observed
    # category takes 70% of the rows and the rest share the remaining 30%.
    categorical_cols = mock_df.select_dtypes(include=['object']).columns
    for cat_col in categorical_cols:
        if cat_col == 'product_id':
            # Identifiers are left untouched.
            continue
        unique_vals = mock_df[cat_col].unique()
        n_other = len(unique_vals) - 1
        if n_other < 1:
            # A single-valued column has no distribution to skew.
            continue
        mock_df[cat_col] = np.random.choice(
            unique_vals,
            size=len(mock_df),
            p=[0.7] + [0.3 / n_other] * n_other,
        )

    # Persist the mock upload under a timestamped name.
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    mock_path = f"{UPLOADS_DIR}/mock_upload_{timestamp}.csv"
    mock_df.to_csv(mock_path, index=False)

    current_df = mock_df
    print(f"‚úÖ Created and Loaded Mock Data: {mock_path}")
else:
    # Pick the file with the greatest ctime as the latest upload.
    latest_file = max(list_of_files, key=os.path.getctime)
    print(f"üìÇ Found latest upload: {latest_file}")
    current_df = pd.read_csv(latest_file)
    print(f"‚úÖ Loaded Current: {len(current_df)} rows")

print(f"\nCurrent shape: {current_df.shape}")
# Last expression of the cell: the notebook renders the preview.
current_df.head()

## 3️⃣ Custom Drift Detection Functions

In [None]:
def detect_numerical_drift(baseline_col, current_col, column_name, alpha=0.05):
    """
    Detect drift in a numerical column using the two-sample Kolmogorov-Smirnov test.

    Args:
        baseline_col: Reference data column (pandas Series)
        current_col: Current data column (pandas Series)
        column_name: Name of the column, copied into the result
        alpha: Significance level (default 0.05); drift is flagged when p < alpha

    Returns:
        Dictionary with the keys 'column', 'type', 'drift_detected', 'test',
        'statistic', 'p_value', 'baseline_mean', 'current_mean',
        'baseline_std', 'current_std' and 'mean_shift_pct'.
        If either column has no non-NaN values, no test is run and the result
        reports statistic 0.0, p-value 1.0 and no drift.
    """
    # Missing values are excluded from both the test and the summary statistics.
    baseline_clean = baseline_col.dropna()
    current_clean = current_col.dropna()

    if baseline_clean.empty or current_clean.empty:
        # ks_2samp raises ValueError on an empty sample; report "no evidence
        # of drift" instead of aborting the whole run on one empty column.
        statistic, p_value = 0.0, 1.0
    else:
        statistic, p_value = stats.ks_2samp(baseline_clean, current_clean)

    # Statistical summary
    baseline_mean = baseline_clean.mean()
    current_mean = current_clean.mean()
    baseline_std = baseline_clean.std()
    current_std = current_clean.std()

    # Percentage shift relative to the magnitude of the baseline mean, so the
    # sign always follows the direction of the change (also for negative means).
    if baseline_mean != 0:
        mean_shift = (current_mean - baseline_mean) / abs(baseline_mean) * 100
    else:
        mean_shift = 0

    drift_detected = bool(p_value < alpha)

    return {
        'column': column_name,
        'type': 'numerical',
        'drift_detected': drift_detected,
        'test': 'Kolmogorov-Smirnov',
        'statistic': statistic,
        'p_value': p_value,
        'baseline_mean': baseline_mean,
        'current_mean': current_mean,
        'baseline_std': baseline_std,
        'current_std': current_std,
        'mean_shift_pct': mean_shift
    }

def detect_categorical_drift(baseline_col, current_col, column_name, alpha=0.05):
    """
    Detect drift in a categorical column using a Chi-Square test of homogeneity.

    The baseline and current category counts form a 2 x K contingency table
    (K = number of distinct categories seen in either sample). Both columns
    are treated as samples, so they may have different sizes and the current
    data may contain categories that never occur in the baseline.

    Args:
        baseline_col: Reference data column (pandas Series)
        current_col: Current data column (pandas Series)
        column_name: Name of the column, copied into the result
        alpha: Significance level (default 0.05); drift is flagged when p < alpha

    Returns:
        Dictionary with the keys 'column', 'type', 'drift_detected', 'test',
        'statistic', 'p_value', 'baseline_categories', 'current_categories',
        'baseline_top_category' and 'current_top_category'.
        If either column has no non-NaN values, no test is run and the result
        reports statistic 0.0, p-value 1.0 and no drift.
    """
    # Missing values are not treated as a category.
    baseline_clean = baseline_col.dropna()
    current_clean = current_col.dropna()

    # value_counts() sorts by frequency, so index[0] is the most common value.
    baseline_counts = baseline_clean.value_counts()
    current_counts = current_clean.value_counts()

    # Align both samples on the union of categories; a category missing from
    # one side counts as 0 there. key=str keeps the sort from failing when the
    # two columns hold values of different types.
    all_categories = sorted(
        set(baseline_counts.index) | set(current_counts.index), key=str
    )
    baseline_freq = [int(baseline_counts.get(cat, 0)) for cat in all_categories]
    current_freq = [int(current_counts.get(cat, 0)) for cat in all_categories]

    if baseline_clean.empty or current_clean.empty:
        # Nothing to compare: report "no evidence of drift".
        statistic, p_value = 0.0, 1.0
    else:
        try:
            # correction=False gives the plain Pearson statistic (no Yates
            # adjustment for 2 x 2 tables).
            statistic, p_value, _dof, _expected = stats.chi2_contingency(
                [baseline_freq, current_freq], correction=False
            )
        except ValueError:
            # Safety net for a degenerate table scipy refuses to test; keep
            # the run going and report no drift for this column.
            statistic, p_value = 0.0, 1.0

    drift_detected = bool(p_value < alpha)

    return {
        'column': column_name,
        'type': 'categorical',
        'drift_detected': drift_detected,
        'test': 'Chi-Square',
        'statistic': statistic,
        'p_value': p_value,
        'baseline_categories': len(baseline_counts),
        'current_categories': len(current_counts),
        'baseline_top_category': baseline_counts.index[0] if len(baseline_counts) > 0 else None,
        'current_top_category': current_counts.index[0] if len(current_counts) > 0 else None
    }

print("‚úÖ Drift detection functions defined")

## 4️⃣ Run Drift Detection on All Columns

In [None]:
# Find common columns, keeping the baseline's column order. A set intersection
# would iterate in an order that can change between interpreter runs, which
# would shuffle the rows of the report.
common_cols = [name for name in baseline_df.columns if name in current_df.columns]
print(f"üìä Analyzing {len(common_cols)} common columns")
print(f"Columns: {common_cols}\n")

# Detect drift for each column
drift_results = []

for col in common_cols:
    # The K-S test needs numeric data on BOTH sides. If the uploaded column was
    # parsed as non-numeric, compare the two columns as categories instead of
    # passing non-numeric values to the numerical test.
    both_numeric = (
        pd.api.types.is_numeric_dtype(baseline_df[col])
        and pd.api.types.is_numeric_dtype(current_df[col])
    )
    detector = detect_numerical_drift if both_numeric else detect_categorical_drift
    drift_results.append(detector(baseline_df[col], current_df[col], col))

# Convert to DataFrame for easy viewing (one row per analysed column)
drift_df = pd.DataFrame(drift_results)
print(f"‚úÖ Drift detection complete for {len(drift_results)} columns")
# Last expression of the cell: the notebook renders the table.
drift_df

## 5️⃣ Drift Detection Summary

In [None]:
total_columns = len(drift_results)
# With no analysed columns drift_df has no 'drift_detected' column at all, so
# only touch it when there is at least one result.
drifted_columns = int(drift_df['drift_detected'].sum()) if total_columns > 0 else 0
drift_share = drifted_columns / total_columns if total_columns > 0 else 0

print("\n" + "="*60)
print("üìä DATA DRIFT DETECTION SUMMARY")
print("="*60)

# Overall status: more than half of the columns drifted -> significant,
# at least one -> moderate, none -> no drift.
if drift_share > 0.5:
    print("üö® STATUS: SIGNIFICANT DRIFT DETECTED!")
elif drift_share > 0:
    print("‚ö†Ô∏è  STATUS: MODERATE DRIFT DETECTED")
else:
    print("‚úÖ STATUS: NO DRIFT DETECTED")

print(f"\nüìà Total Columns Analyzed: {total_columns}")
print(f"üö® Drifted Columns: {drifted_columns}")
print(f"üìä Drift Share: {drift_share:.2%}")
print("="*60)

# Show drifted columns detail
if drifted_columns > 0:
    print("\nüîç DRIFTED COLUMNS DETAILS:")
    print("-"*60)

    drifted_cols = drift_df[drift_df['drift_detected'].astype(bool)]
    for idx, row in drifted_cols.iterrows():
        print(f"\nüö® {row['column']} ({row['type'].upper()})")
        print(f"   Test: {row['test']}")
        print(f"   Statistic: {row['statistic']:.4f}")
        print(f"   P-value: {row['p_value']:.4f}")

        if row['type'] == 'numerical':
            print(f"   Mean Shift: {row['mean_shift_pct']:.2f}%")
            print(f"   Baseline Mean: {row['baseline_mean']:.2f} (¬±{row['baseline_std']:.2f})")
            print(f"   Current Mean: {row['current_mean']:.2f} (¬±{row['current_std']:.2f})")
        else:
            # When the frame mixes numerical and categorical rows these count
            # columns become float (NaN on numerical rows); cast back to int
            # so they print as "3" rather than "3.0".
            print(f"   Baseline Categories: {int(row['baseline_categories'])}")
            print(f"   Current Categories: {int(row['current_categories'])}")
            print(f"   Baseline Top: {row['baseline_top_category']}")
            print(f"   Current Top: {row['current_top_category']}")

# Show non-drifted columns
non_drifted = total_columns - drifted_columns
if non_drifted > 0:
    print("\n\n‚úÖ NON-DRIFTED COLUMNS:")
    print("-"*60)
    non_drifted_cols = drift_df[~drift_df['drift_detected'].astype(bool)]
    for idx, row in non_drifted_cols.iterrows():
        print(f"   ‚úÖ {row['column']} ({row['type']}) - p-value: {row['p_value']:.4f}")

## 6️⃣ Save Drift Report

In [None]:
# Create reports directory
os.makedirs('../reports', exist_ok=True)

# Save detailed CSV report (one row per analysed column)
report_csv_path = '../reports/drift_report_detailed.csv'
drift_df.to_csv(report_csv_path, index=False)
print(f"‚úÖ Detailed drift report saved to: {report_csv_path}")

# Save summary report. The encoding is pinned to UTF-8 because column names
# come from user-supplied CSV files and may contain characters that the
# platform's default encoding cannot represent.
summary_path = '../reports/drift_summary.txt'
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("="*60 + "\n")
    f.write("DATA DRIFT DETECTION SUMMARY\n")
    f.write("="*60 + "\n\n")
    f.write(f"Total Columns Analyzed: {total_columns}\n")
    f.write(f"Drifted Columns: {drifted_columns}\n")
    f.write(f"Drift Share: {drift_share:.2%}\n\n")

    # Per-column details are written only for columns flagged as drifted.
    if drifted_columns > 0:
        f.write("DRIFTED COLUMNS:\n")
        f.write("-"*60 + "\n")
        drifted_cols = drift_df[drift_df['drift_detected'].astype(bool)]
        for idx, row in drifted_cols.iterrows():
            f.write(f"\n{row['column']} ({row['type']})\n")
            f.write(f"  Test: {row['test']}\n")
            f.write(f"  P-value: {row['p_value']:.4f}\n")

print(f"‚úÖ Summary report saved to: {summary_path}")
print("\nüìÅ Reports generated successfully!")

## 7️⃣ Visualization (Optional)

In [None]:
# Plotting libraries are imported here rather than in the first cell, so the
# rest of the notebook does not need them.
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
# One figure holding two side-by-side panels; the pyplot calls below act on
# the "current" figure/axes, so their order matters.
plt.figure(figsize=(12, 6))

# Plot 1: Drift Overview (share of columns with / without detected drift)
plt.subplot(1, 2, 1)
drift_counts = drift_df['drift_detected'].value_counts()
# Green for "No Drift", red for "Drift Detected" (same order as `labels`)
colors = ['#2ecc71', '#e74c3c']
labels = ['No Drift', 'Drift Detected']
plt.pie(
    # .get(..., 0) covers the case where one of the two outcomes never occurs
    [drift_counts.get(False, 0), drift_counts.get(True, 0)],
    labels=labels,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90
)
plt.title('Overall Drift Detection', fontsize=14, fontweight='bold')

# Plot 2: P-values by column (one horizontal bar per analysed column)
plt.subplot(1, 2, 2)
# Red bar when drift was detected for the column, green otherwise
colors_bar = ['#e74c3c' if x else '#2ecc71' for x in drift_df['drift_detected']]
plt.barh(drift_df['column'], drift_df['p_value'], color=colors_bar, alpha=0.7)
# Reference line at 0.05, the default alpha of the detection functions
plt.axvline(x=0.05, color='red', linestyle='--', label='Significance Level (Œ±=0.05)')
plt.xlabel('P-value', fontsize=12)
plt.ylabel('Column', fontsize=12)
plt.title('P-values by Column', fontsize=14, fontweight='bold')
plt.legend()
plt.tight_layout()

# Save visualization
# NOTE: this cell does not create '../reports' itself; the directory must
# already exist when savefig runs.
viz_path = '../reports/drift_visualization.png'
plt.savefig(viz_path, dpi=300, bbox_inches='tight')
print(f"‚úÖ Visualization saved to: {viz_path}")
plt.show()