### Implementing Adversarial Validation for Data Drift
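Description: Train a classifier to distinguish rows of a baseline dataset (e.g. the training data) from rows of a current dataset (e.g. the test or production data), and use its performance to infer data drift: an AUC close to 0.5 means the two datasets are indistinguishable, while an AUC well above 0.5 means their feature distributions differ.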

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Build the labelled dataset for adversarial validation.
#
# In real use, 'baseline_df' and 'current_df' hold your reference data and your
# new data. The synthetic frames below are a stand-in for demonstration: the
# "current" frame is drawn with shifted means/scales so that drift is present.
np.random.seed(42)  # fixed seed so the sample data is reproducible
n_samples = 1000

# Reference distribution: two standard normals and one uniform on [0, 1).
baseline_df = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, n_samples),
    'feature_2': np.random.normal(0, 1, n_samples),
    'feature_3': np.random.rand(n_samples),
})

# Drifted distribution: shifted/rescaled normals and a uniform offset by 0.1.
current_df = pd.DataFrame({
    'feature_1': np.random.normal(0.5, 1.2, n_samples),
    'feature_2': np.random.normal(-0.2, 0.8, n_samples),
    'feature_3': np.random.rand(n_samples) + 0.1,
})

# Label the origin of each row (0 = baseline, 1 = current) and stack the frames.
# `assign` returns a labelled copy, so the source frames are NOT modified: the
# synthetic 'is_current' column never leaks back into 'baseline_df' or
# 'current_df' if they are reused later in the notebook.
combined_df = pd.concat(
    [baseline_df.assign(is_current=0), current_df.assign(is_current=1)],
    ignore_index=True,
)

# Separate features (X) from the adversarial target (y).
X = combined_df.drop(columns='is_current')
y = combined_df['is_current']

# Hold out 30% of the combined rows to score the adversarial classifier.
# Stratifying on y keeps the baseline/current ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a gradient-boosted classifier whose only job is to tell "current" rows
# apart from "baseline" rows (fit returns the fitted estimator itself).
model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Hard labels feed the accuracy; the probability of class 1 ("current"),
# taken from column 1 of predict_proba, feeds the AUC.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Score the classifier on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"Adversarial Validation AUC: {auc_score:.4f}")
print(f"Adversarial Validation Accuracy: {accuracy:.4f}")

# Turn the AUC into a verdict. An AUC near 0.5 means the classifier cannot
# separate the two datasets; the cut-off below is a placeholder that should be
# tuned for your own data and tolerance.
drift_threshold = 0.7

drift_message = (
    f"High AUC ({auc_score:.4f}) suggests significant data drift."
    if auc_score > drift_threshold
    else f"Lower AUC ({auc_score:.4f}) suggests less significant data drift."
)
print(drift_message)

Adversarial Validation AUC: 0.7178
Adversarial Validation Accuracy: 0.6233
High AUC (0.7178) suggests significant data drift.
