# Deep Analysis: Diversity Check Impact on Performance

This notebook investigates:
1. How diversity checking affects synthetic sample quality
2. Performance vs Quality trade-offs
3. Alternative optimization strategies

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from imblearn.over_sampling import SMOTE

import kagglehub
import os

# Import DistAwareAug
import sys
sys.path.insert(0, '..')
from distawareaug import DistAwareAugmentor

print("✓ Imports successful")

✓ Imports successful


  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Real-World Dataset

In [2]:
# Load stroke dataset
path = kagglehub.dataset_download("shashwatwork/cerebral-stroke-predictionimbalaced-dataset")
data = pd.read_csv(os.path.join(path, "dataset.csv"))

# Preprocess
data_clean = data.drop(['id', 'smoking_status'], axis=1)
data_clean['bmi'] = data_clean['bmi'].fillna(data_clean['bmi'].median())
data_clean = pd.get_dummies(data_clean, columns=['gender', 'ever_married', 'work_type', 'Residence_type'])

X = data_clean.drop('stroke', axis=1).values
y = data_clean['stroke'].values

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
print(f"Training: {X_train.shape[0]}, Test: {X_test.shape[0]}")
print(f"Imbalance ratio: {sum(y_train==0)/sum(y_train==1):.2f}:1")

Dataset: 43400 samples, 17 features
Training: 34720, Test: 8680
Imbalance ratio: 54.46:1


## 2. Test Different Diversity Check Strategies

We'll modify DistAwareAugmentor temporarily to test different approaches

In [3]:
def test_diversity_strategy(X_train, y_train, X_test, y_test, strategy_name, max_check_size=None):
    """
    Test a diversity checking strategy.
    
    strategy_name options:
    - 'none': Skip diversity checks entirely
    - 'sample_50': Check against 50 random samples
    - 'sample_100': Check against 100 random samples  
    - 'sample_200': Check against 200 random samples (current default)
    - 'sample_500': Check against 500 random samples
    - 'all': Check against all samples (v0.1.0 behavior)
    - 'adaptive': Check against sqrt(n) samples
    """
    
    # NOTE: This requires temporarily modifying augmentor.py
    # For now, we'll test with the current implementation
    
    results = {}
    
    # Time augmentation
    aug = DistAwareAugmentor(sampling_strategy='auto', random_state=42)
    
    t0 = time.time()
    X_aug, y_aug = aug.fit_resample(X_train, y_train)
    aug_time = time.time() - t0
    
    results['augmentation_time'] = aug_time
    results['samples_generated'] = len(X_aug) - len(X_train)
    
    # Train model
    scaler = StandardScaler()
    X_aug_scaled = scaler.fit_transform(X_aug)
    X_test_scaled = scaler.transform(X_test)
    
    clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    
    t0 = time.time()
    clf.fit(X_aug_scaled, y_aug)
    train_time = time.time() - t0
    
    results['training_time'] = train_time
    
    # Evaluate
    y_pred = clf.predict(X_test_scaled)
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]
    
    results['precision'] = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    results['recall'] = recall_score(y_test, y_pred, pos_label=1)
    results['f1_score'] = f1_score(y_test, y_pred, pos_label=1)
    results['roc_auc'] = roc_auc_score(y_test, y_proba)
    
    # Diversity analysis of synthetic samples
    synthetic_samples = X_aug[len(X_train):]
    if len(synthetic_samples) > 1:
        from sklearn.metrics import pairwise_distances
        dists = pairwise_distances(synthetic_samples[:min(500, len(synthetic_samples))])
        np.fill_diagonal(dists, np.inf)  # Ignore self-distances
        results['min_distance'] = dists.min()
        results['mean_distance'] = dists[dists < np.inf].mean()
        results['std_distance'] = dists[dists < np.inf].std()
    
    return results

print("✓ Test function defined")

✓ Test function defined


## 3. Run Baseline Tests

In [4]:
print("="*80)
print("BASELINE TESTS")
print("="*80)

baselines = {}

# 1. No augmentation
print("\n1. No Augmentation...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
t0 = time.time()
clf.fit(X_train_scaled, y_train)
train_time = time.time() - t0

y_pred = clf.predict(X_test_scaled)
y_proba = clf.predict_proba(X_test_scaled)[:, 1]

baselines['No Augmentation'] = {
    'augmentation_time': 0,
    'training_time': train_time,
    'samples_generated': 0,
    'precision': precision_score(y_test, y_pred, pos_label=1, zero_division=0),
    'recall': recall_score(y_test, y_pred, pos_label=1),
    'f1_score': f1_score(y_test, y_pred, pos_label=1),
    'roc_auc': roc_auc_score(y_test, y_proba)
}

print(f"   F1-Score: {baselines['No Augmentation']['f1_score']:.4f}")

# 2. SMOTE
print("\n2. SMOTE...")
smote = SMOTE(sampling_strategy='auto', random_state=42)
t0 = time.time()
X_smote, y_smote = smote.fit_resample(X_train, y_train)
aug_time = time.time() - t0

X_smote_scaled = scaler.fit_transform(X_smote)
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
t0 = time.time()
clf.fit(X_smote_scaled, y_smote)
train_time = time.time() - t0

y_pred = clf.predict(X_test_scaled)
y_proba = clf.predict_proba(X_test_scaled)[:, 1]

baselines['SMOTE'] = {
    'augmentation_time': aug_time,
    'training_time': train_time,
    'samples_generated': len(X_smote) - len(X_train),
    'precision': precision_score(y_test, y_pred, pos_label=1, zero_division=0),
    'recall': recall_score(y_test, y_pred, pos_label=1),
    'f1_score': f1_score(y_test, y_pred, pos_label=1),
    'roc_auc': roc_auc_score(y_test, y_proba)
}

print(f"   Time: {aug_time:.3f}s")
print(f"   F1-Score: {baselines['SMOTE']['f1_score']:.4f}")

# 3. DistAwareAug (current v0.2.0 with 200-sample check)
print("\n3. DistAwareAug (v0.2.0 - 200 sample check)...")
baselines['DistAwareAug_v0.2'] = test_diversity_strategy(X_train, y_train, X_test, y_test, 'sample_200')
print(f"   Time: {baselines['DistAwareAug_v0.2']['augmentation_time']:.3f}s")
print(f"   F1-Score: {baselines['DistAwareAug_v0.2']['f1_score']:.4f}")
print(f"   Min distance between synthetic samples: {baselines['DistAwareAug_v0.2'].get('min_distance', 'N/A')}")

print("\n" + "="*80)
print("BASELINE RESULTS")
print("="*80)

baseline_df = pd.DataFrame(baselines).T
print(baseline_df[['augmentation_time', 'f1_score', 'roc_auc', 'min_distance']].to_string())

BASELINE TESTS

1. No Augmentation...
   F1-Score: 0.0000

2. SMOTE...
   Time: 0.065s
   F1-Score: 0.0355

3. DistAwareAug (v0.2.0 - 200 sample check)...
   Time: 0.884s
   F1-Score: 0.0000
   Min distance between synthetic samples: 7.998365915779784e-05

BASELINE RESULTS
                   augmentation_time  f1_score   roc_auc  min_distance
No Augmentation             0.000000  0.000000  0.711558           NaN
SMOTE                       0.064637  0.035532  0.558983           NaN
DistAwareAug_v0.2           0.884123  0.000000  0.708935       0.00008


## 4. Analyze Diversity Check Impact

**Key Questions:**
1. Is the 200-sample check actually hurting quality?
2. What's the optimal balance between speed and quality?
3. Are there alternative optimization strategies?

In [None]:
# Check actual diversity in current synthetic samples
print("="*80)
print("DIVERSITY ANALYSIS")
print("="*80)

aug = DistAwareAugmentor(sampling_strategy='auto', random_state=42)
X_aug, y_aug = aug.fit_resample(X_train, y_train)
synthetic_samples = X_aug[len(X_train):]

print(f"\nGenerated {len(synthetic_samples)} synthetic samples")
print(f"Diversity threshold used: {aug.diversity_threshold}")

# Compute pairwise distances
from sklearn.metrics import pairwise_distances
dists = pairwise_distances(synthetic_samples)
np.fill_diagonal(dists, np.inf)

print(f"\nDistance Statistics:")
print(f"  Min distance: {dists.min():.6f}")
print(f"  Mean distance: {dists[dists < np.inf].mean():.6f}")
print(f"  Median distance: {np.median(dists[dists < np.inf]):.6f}")
print(f"  Std distance: {dists[dists < np.inf].std():.6f}")

# Count violations
violations = np.sum(dists < aug.diversity_threshold)
total_pairs = len(synthetic_samples) * (len(synthetic_samples) - 1)
print(f"\nDiversity threshold violations: {violations}/{total_pairs} ({violations/total_pairs*100:.2f}%)")

if violations > 0:
    print("\n⚠️  WARNING: Some synthetic samples are closer than diversity_threshold!")
    print("   This suggests the 200-sample random check is missing some duplicates.")
else:
    print("\n✓ All synthetic samples respect diversity_threshold")

DIVERSITY ANALYSIS

Generated 33468 synthetic samples
Diversity threshold used: 0.1


## 5. Alternative Optimization Strategies

Instead of random sampling, we could:
1. **Spatial indexing** - Use KD-Tree or Ball-Tree for O(log n) lookups
2. **Clustering** - Check against cluster centroids
3. **Progressive relaxation** - Start strict, relax threshold as samples grow
4. **Batch acceptance** - Accept batches if avg distance is good
5. **No diversity checks** - Rely on distribution sampling alone

In [1]:
print("="*80)
print("PROPOSED OPTIMIZATIONS")
print("="*80)

print("\n1. KD-Tree/Ball-Tree Approach:")
print("   - Use sklearn's NearestNeighbors with ball_tree algorithm")
print("   - O(log n) lookups instead of O(n)")
print("   - Already partially implemented for original samples")
print("   - Could extend to synthetic samples")

print("\n2. No Diversity Checks (Pure Distribution Sampling):")
print("   - Trust KDE/Gaussian to generate diverse samples")
print("   - Maximum speed")
print("   - Relies on statistical properties of distributions")

print("\n3. Adaptive Threshold:")
print("   - Start with strict threshold (0.1)")
print("   - Gradually relax as more samples generated")
print("   - Balance quality vs speed dynamically")

print("\n4. Batch-Level Diversity:")
print("   - Check diversity of entire batch, not individual samples")
print("   - Accept/reject batches based on average metrics")
print("   - Fewer diversity computations")

print("\n5. Hybrid Approach:")
print("   - Use KD-Tree for original samples (always check)")
print("   - Use random sampling for synthetic samples (approximate)")
print("   - Best of both worlds")

PROPOSED OPTIMIZATIONS

1. KD-Tree/Ball-Tree Approach:
   - Use sklearn's NearestNeighbors with ball_tree algorithm
   - O(log n) lookups instead of O(n)
   - Already partially implemented for original samples
   - Could extend to synthetic samples

2. No Diversity Checks (Pure Distribution Sampling):
   - Trust KDE/Gaussian to generate diverse samples
   - Maximum speed
   - Relies on statistical properties of distributions

3. Adaptive Threshold:
   - Start with strict threshold (0.1)
   - Gradually relax as more samples generated
   - Balance quality vs speed dynamically

4. Batch-Level Diversity:
   - Check diversity of entire batch, not individual samples
   - Accept/reject batches based on average metrics
   - Fewer diversity computations

5. Hybrid Approach:
   - Use KD-Tree for original samples (always check)
   - Use random sampling for synthetic samples (approximate)
   - Best of both worlds


## 6. Recommendations

Based on the analysis above, we'll determine:
1. Is the 200-sample check causing quality degradation?
2. What optimization should be implemented next?
3. Should we add a parameter for users to control this?

In [3]:
print("="*80)
print("SUMMARY AND RECOMMENDATIONS")
print("="*80)

print("\nPerformance Comparison:")
print(f"  SMOTE:        {baselines['SMOTE']['augmentation_time']:.3f}s, F1={baselines['SMOTE']['f1_score']:.4f}")
print(f"  DistAwareAug: {baselines['DistAwareAug_v0.2']['augmentation_time']:.3f}s, F1={baselines['DistAwareAug_v0.2']['f1_score']:.4f}")
print(f"  Slowdown:     {baselines['DistAwareAug_v0.2']['augmentation_time'] / baselines['SMOTE']['augmentation_time']:.1f}x")

# Determine if quality is degraded
f1_diff = baselines['DistAwareAug_v0.2']['f1_score'] - baselines['SMOTE']['f1_score']

print("\nQuality Assessment:")
if f1_diff > 0.01:
    print(f"  ✓ DistAwareAug outperforms SMOTE by {f1_diff:.4f} F1-score")
elif f1_diff < -0.01:
    print(f"  ⚠️  DistAwareAug underperforms SMOTE by {abs(f1_diff):.4f} F1-score")
    print("  → The 200-sample check MAY be hurting quality")
else:
    print(f"  → Similar performance (diff: {f1_diff:.4f})")

print("\nNext Steps:")
print("  1. Test KD-Tree approach for synthetic samples")
print("  2. Experiment with adaptive thresholds")
print("  3. Consider making diversity check strategy a parameter")
print("  4. Profile to find remaining bottlenecks")

SUMMARY AND RECOMMENDATIONS

Performance Comparison:


NameError: name 'baselines' is not defined