In [1]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp

def detect_data_drift_numeric(df1, df2, column_name, threshold=0.05):
    """
    Check a numeric column for distribution drift between two time periods.

    Runs a two-sample Kolmogorov-Smirnov (KS) test on the non-null values of
    ``column_name`` in both DataFrames, prints a short report, and plots the
    two distributions as overlaid density histograms.

    Args:
        df1: Pandas DataFrame representing the first time period.
        df2: Pandas DataFrame representing the second time period.
        column_name: The name of the numeric column to compare.
        threshold: The significance level (alpha) for the KS test.
                   If the p-value is below this threshold, drift is suspected.

    Returns:
        A dictionary ``{'ks_statistic': float, 'p_value': float}``, or ``None``
        (after printing an error message) when the column is missing, is not
        numeric, or has no non-null values in one or both DataFrames.
    """

    if column_name not in df1.columns or column_name not in df2.columns:
        print(f"Error: Column '{column_name}' not found in one or both DataFrames.")
        return None

    if not pd.api.types.is_numeric_dtype(df1[column_name]) or not pd.api.types.is_numeric_dtype(df2[column_name]):
        print(f"Error: Column '{column_name}' is not numeric in one or both DataFrames.")
        return None

    # Drop missing values once and reuse the cleaned samples for both the
    # test and the plot, so the figure shows exactly the data that was tested.
    sample1 = df1[column_name].dropna()
    sample2 = df2[column_name].dropna()

    # ks_2samp raises ValueError on an empty sample; report it the same way
    # as the other validation failures instead of crashing.
    if sample1.empty or sample2.empty:
        print(f"Error: Column '{column_name}' has no non-null values in one or both DataFrames.")
        return None

    # Perform the Kolmogorov-Smirnov (KS) test
    ks_statistic, p_value = ks_2samp(sample1, sample2)
    ks_statistic, p_value = float(ks_statistic), float(p_value)

    print(f"--- Data Drift Analysis for Column '{column_name}' ---")
    print(f"KS Statistic: {ks_statistic:.4f}")
    # General format keeps very small p-values readable (e.g. 1.2e-09)
    # instead of rounding them to 0.0000.
    print(f"P-value: {p_value:.4g}")

    if p_value < threshold:
        print(f"Potential data drift detected (p-value < {threshold}). The distributions of '{column_name}' in the two time periods are significantly different.")
    else:
        print(f"No significant data drift detected (p-value >= {threshold}). The distributions of '{column_name}' in the two time periods are not significantly different based on the KS test.")

    # Visualize the distributions. Both histograms use the same value range so
    # their bin edges coincide and the bars are directly comparable.
    shared_range = (
        float(min(sample1.min(), sample2.min())),
        float(max(sample1.max(), sample2.max())),
    )
    fig, ax = plt.subplots(figsize=(10, 6))
    sample1.hist(ax=ax, alpha=0.6, label='Time Period 1', density=True, range=shared_range)
    sample2.hist(ax=ax, alpha=0.6, label='Time Period 2', density=True, range=shared_range)
    ax.set_title(f"Distribution of '{column_name}' Over Time")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Density")
    ax.legend()
    ax.grid(True)
    plt.show()

    return {'ks_statistic': ks_statistic, 'p_value': p_value}

if __name__ == '__main__':
    # Demo driver: load one CSV snapshot per time period and run the drift
    # check on a single numeric column. Swap in your own files and column.
    try:
        df_period1 = pd.read_csv('customer_data_period1.csv')
        df_period2 = pd.read_csv('customer_data_period2.csv')
        numeric_column_to_check = 'TotalSpend'  # numeric column under test

        drift_results = detect_data_drift_numeric(
            df_period1,
            df_period2,
            numeric_column_to_check,
        )
        # The check returns None when validation fails; only echo real results.
        if drift_results:
            print("\nDrift Detection Results:", drift_results)

    except FileNotFoundError:
        # Either snapshot may be the missing one, so the message names both.
        print(
            "Error: One or both of the CSV files "
            "('customer_data_period1.csv', 'customer_data_period2.csv') "
            "were not found."
        )
    except Exception as exc:
        # Catch-all so the demo reports the problem rather than raising.
        print(f"An error occurred: {exc}")

Error: One or both of the CSV files ('customer_data_period1.csv', 'customer_data_period2.csv') were not found.
