# Top-N Bias Analysis

This notebook analyzes how bias injection changes the **exact count** (reported as the average number of books per user) of Adventure and Mystery books
in Top-15, Top-25, and Top-35 recommendation lists.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning category so library chatter stays out of the cell outputs.
warnings.filterwarnings(action='ignore')

# Notebook-wide plotting defaults. The style is applied first so that the
# figure-size override below is set after it.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (15, 10)})
sns.set_palette(palette="husl")

ImportError: cannot import name 'AxisInt' from 'pandas._typing' (/usr/local/anaconda3/lib/python3.9/site-packages/pandas/_typing.py)

## 1. Load Top-N Results

In [None]:
# Load experimental results
results_df = pd.read_csv('../results/biased/topN_bias_injection_results.csv')

print(f"📊 Loaded {len(results_df)} experimental results")
print(f"🧪 Experiments: {results_df['genre'].value_counts().to_dict()}")

# Display structure: list only the per-list-size metric columns (names containing "top_")
print(f"\n📋 Available columns:")
for col in sorted(c for c in results_df.columns if 'top_' in c):
    print(f"   {col}")

# The first row tagged genre == 'baseline' supplies the reference values that
# every later cell reads through `baseline[...]`. Fail with an explicit message
# instead of an opaque IndexError when the results file has no such row.
baseline_rows = results_df[results_df['genre'] == 'baseline']
if baseline_rows.empty:
    raise ValueError("No row with genre == 'baseline' found in the results file; "
                     "the baseline reference values cannot be determined.")
baseline = baseline_rows.iloc[0]

# Show baseline values for each recommendation list size
print(f"\n📊 BASELINE VALUES (avg books per user):")
for top_n in (15, 25, 35):
    base_adventure = baseline[f'baseline_top_{top_n}_adventure']
    base_mystery = baseline[f'baseline_top_{top_n}_mystery']
    print(f"   Top-{top_n}: Adventure={base_adventure:.2f}, Mystery={base_mystery:.2f}")

## 2. Adventure Bias Impact Analysis

In [None]:
# Focus on Adventure bias experiments (ordered by injected-user count so lines plot left to right)
adventure_results = results_df[results_df['genre'] == 'adventure'].sort_values('num_biased_users')

print("🗺️  ADVENTURE BIAS IMPACT")
print("=" * 40)

# Create visualization: one column per list size;
# top row = absolute counts, bottom row = change from baseline.
N_values = [15, 25, 35]
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
biased_users = adventure_results['num_biased_users']

for i, N in enumerate(N_values):
    # --- Top row: absolute counts ---
    ax = axes[0, i]

    # Target genre (Adventure) and the other tracked genre (Mystery)
    ax.plot(biased_users, adventure_results[f'top_{N}_adventure'],
            marker='o', linewidth=3, label='Adventure books', color='green', markersize=8)
    ax.plot(biased_users, adventure_results[f'top_{N}_mystery'],
            marker='s', linewidth=2, label='Mystery books', color='red', markersize=6)

    # Dashed horizontal references at the unbiased baseline values
    ax.axhline(y=baseline[f'baseline_top_{N}_adventure'], color='green',
               linestyle='--', alpha=0.7, label='Baseline Adventure')
    ax.axhline(y=baseline[f'baseline_top_{N}_mystery'], color='red',
               linestyle='--', alpha=0.7, label='Baseline Mystery')

    ax.set_title(f'Top-{N} Recommendations: Adventure Bias', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Adventure-Biased Users')
    ax.set_ylabel(f'Avg Books per User (out of {N})')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, None)  # anchor the y-axis at zero, leave the top automatic

    # --- Bottom row: changes from baseline ---
    ax = axes[1, i]
    ax.plot(biased_users, adventure_results[f'top_{N}_adventure_change'],
            marker='o', linewidth=3, label='Adventure change', color='green', markersize=8)
    ax.plot(biased_users, adventure_results[f'top_{N}_mystery_change'],
            marker='s', linewidth=2, label='Mystery change', color='red', markersize=6)

    ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)  # zero line = no change
    ax.set_title(f'Change from Baseline: Top-{N}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Adventure-Biased Users')
    ax.set_ylabel('Change in Avg Books per User')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Adventure Bias Impact on Recommendations', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Print numerical summary for the row with the largest number of biased users
max_bias_row = adventure_results[adventure_results['num_biased_users'] == adventure_results['num_biased_users'].max()].iloc[0]
print(f"\n📈 MAXIMUM ADVENTURE BIAS IMPACT ({int(max_bias_row['num_biased_users']):,} biased users):")
for N in N_values:
    baseline_adv = baseline[f'baseline_top_{N}_adventure']
    biased_adv = max_bias_row[f'top_{N}_adventure']
    change = max_bias_row[f'top_{N}_adventure_change']
    # A non-positive baseline has no meaningful relative change; report 0 instead of dividing.
    percent_change = (change / baseline_adv * 100) if baseline_adv > 0 else 0

    # Use the "+" format flag rather than a literal "+" so a decrease prints as
    # "-0.12" instead of the malformed "+-0.12".
    print(f"   Top-{N}: {baseline_adv:.2f} → {biased_adv:.2f} ({change:+.2f}, {percent_change:+.1f}%)")

## 3. Mystery Bias Impact Analysis

In [None]:
# Focus on Mystery bias experiments (ordered by injected-user count so lines plot left to right)
mystery_results = results_df[results_df['genre'] == 'mystery'].sort_values('num_biased_users')

print("🔍 MYSTERY BIAS IMPACT")
print("=" * 35)

# Create visualization: one column per list size;
# top row = absolute counts, bottom row = change from baseline.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
biased_users = mystery_results['num_biased_users']

for i, N in enumerate(N_values):
    # --- Top row: absolute counts ---
    ax = axes[0, i]

    # Target genre (Mystery) and the other tracked genre (Adventure)
    ax.plot(biased_users, mystery_results[f'top_{N}_mystery'],
            marker='o', linewidth=3, label='Mystery books', color='purple', markersize=8)
    ax.plot(biased_users, mystery_results[f'top_{N}_adventure'],
            marker='s', linewidth=2, label='Adventure books', color='orange', markersize=6)

    # Dashed horizontal references at the unbiased baseline values
    ax.axhline(y=baseline[f'baseline_top_{N}_mystery'], color='purple',
               linestyle='--', alpha=0.7, label='Baseline Mystery')
    ax.axhline(y=baseline[f'baseline_top_{N}_adventure'], color='orange',
               linestyle='--', alpha=0.7, label='Baseline Adventure')

    ax.set_title(f'Top-{N} Recommendations: Mystery Bias', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Mystery-Biased Users')
    ax.set_ylabel(f'Avg Books per User (out of {N})')
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, None)  # anchor the y-axis at zero, leave the top automatic

    # --- Bottom row: changes from baseline ---
    ax = axes[1, i]
    ax.plot(biased_users, mystery_results[f'top_{N}_mystery_change'],
            marker='o', linewidth=3, label='Mystery change', color='purple', markersize=8)
    ax.plot(biased_users, mystery_results[f'top_{N}_adventure_change'],
            marker='s', linewidth=2, label='Adventure change', color='orange', markersize=6)

    ax.axhline(y=0, color='black', linestyle='-', alpha=0.5)  # zero line = no change
    ax.set_title(f'Change from Baseline: Top-{N}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Number of Mystery-Biased Users')
    ax.set_ylabel('Change in Avg Books per User')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Mystery Bias Impact on Recommendations', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Print numerical summary for the row with the largest number of biased users
max_bias_row = mystery_results[mystery_results['num_biased_users'] == mystery_results['num_biased_users'].max()].iloc[0]
print(f"\n📈 MAXIMUM MYSTERY BIAS IMPACT ({int(max_bias_row['num_biased_users']):,} biased users):")
for N in N_values:
    baseline_mys = baseline[f'baseline_top_{N}_mystery']
    biased_mys = max_bias_row[f'top_{N}_mystery']
    change = max_bias_row[f'top_{N}_mystery_change']
    # A non-positive baseline has no meaningful relative change; report 0 instead of dividing.
    percent_change = (change / baseline_mys * 100) if baseline_mys > 0 else 0

    # Use the "+" format flag rather than a literal "+" so a decrease prints as
    # "-0.12" instead of the malformed "+-0.12".
    print(f"   Top-{N}: {baseline_mys:.2f} → {biased_mys:.2f} ({change:+.2f}, {percent_change:+.1f}%)")

## 4. Comparative Analysis: All Top-N Values

In [None]:
# Build one long-format table: a record per (experiment row, list size)
print("📋 COMPREHENSIVE COMPARISON TABLE")
print("=" * 50)

# Baseline records come first: zero biased users and, by definition, zero change.
comparison_data = [
    {
        'Experiment': 'Baseline',
        'Users': 0,
        'Top_N': N,
        'Adventure': baseline[f'baseline_top_{N}_adventure'],
        'Mystery': baseline[f'baseline_top_{N}_mystery'],
        'Adventure_Change': 0.0,
        'Mystery_Change': 0.0,
    }
    for N in N_values
]

# Then every non-baseline experiment row, expanded across the list sizes.
experiment_rows = results_df[results_df['genre'] != 'baseline']
comparison_data.extend(
    {
        'Experiment': f"{row['genre'].title()} Bias",
        'Users': int(row['num_biased_users']),
        'Top_N': N,
        'Adventure': row[f'top_{N}_adventure'],
        'Mystery': row[f'top_{N}_mystery'],
        'Adventure_Change': row[f'top_{N}_adventure_change'],
        'Mystery_Change': row[f'top_{N}_mystery_change'],
    }
    for _, row in experiment_rows.iterrows()
    for N in N_values
)

comparison_df = pd.DataFrame(comparison_data)

# Report, per list size, the record with the largest change for each genre
print("\n🏆 TOP EFFECTS BY RECOMMENDATION LIST SIZE:")
for N in N_values:
    print(f"\n📊 TOP-{N} RECOMMENDATIONS:")
    n_data = comparison_df[comparison_df['Top_N'] == N]

    # idxmax yields the label of the first record holding the maximum change
    max_adv_row = n_data.loc[n_data['Adventure_Change'].idxmax()]
    max_mys_row = n_data.loc[n_data['Mystery_Change'].idxmax()]
    max_adv_change = n_data['Adventure_Change'].max()
    max_mys_change = n_data['Mystery_Change'].max()

    print(f"   🗺️  Max Adventure increase: +{max_adv_change:.2f} books/user ({max_adv_row['Experiment']}, {max_adv_row['Users']} users)")
    print(f"   🔍 Max Mystery increase: +{max_mys_change:.2f} books/user ({max_mys_row['Experiment']}, {max_mys_row['Users']} users)")

    # Express the same maxima relative to the baseline (0 when the baseline is not positive)
    baseline_adv = baseline[f'baseline_top_{N}_adventure']
    baseline_mys = baseline[f'baseline_top_{N}_mystery']

    if baseline_adv > 0:
        adv_percent = max_adv_change / baseline_adv * 100
    else:
        adv_percent = 0
    if baseline_mys > 0:
        mys_percent = max_mys_change / baseline_mys * 100
    else:
        mys_percent = 0

    print(f"   📈 Relative increases: Adventure {adv_percent:.1f}%, Mystery {mys_percent:.1f}%")

## 5. Threshold Analysis for Top-N

In [None]:
# Find minimum users needed for +1 book increase in recommendations
def find_threshold_for_book_increase(data, genre, target_increase=1.0, n_values=None):
    """
    Find minimum biased users needed for target book increase

    For each Top-N size, rows are examined in ascending order of
    'num_biased_users' and the first row whose 'top_{N}_{genre}_change'
    value is at least ``target_increase`` is recorded.

    Parameters
    ----------
    data : pandas.DataFrame
        Experiment rows; must contain 'num_biased_users' and one
        'top_{N}_{genre}_change' column per requested N.
    genre : str
        Genre token used to build the change column name (e.g. 'adventure').
    target_increase : float, default 1.0
        Minimum change that counts as reaching the threshold.
    n_values : iterable of int, optional
        Top-N sizes to evaluate. When omitted, the notebook-level ``N_values``
        list is used, which keeps existing three-argument calls working.

    Returns
    -------
    dict
        Maps 'top_{N}' to {'users': int, 'change': value}. Sizes for which no
        row reaches the target are absent from the result.
    """
    if n_values is None:
        # Backward-compatible default: fall back to the notebook global.
        n_values = N_values

    # The ordering does not depend on N, so sort once instead of once per list size.
    ordered = data.sort_values('num_biased_users')

    thresholds = {}
    for N in n_values:
        change_col = f'top_{N}_{genre}_change'
        # Rows that meet the target, still in ascending user-count order.
        reached = ordered[ordered[change_col] >= target_increase]
        if reached.empty:
            continue  # threshold never reached for this list size

        first_hit = reached.iloc[0]
        thresholds[f'top_{N}'] = {
            'users': int(first_hit['num_biased_users']),
            'change': first_hit[change_col]
        }

    return thresholds

print("🎯 THRESHOLD ANALYSIS: +1 Book Increase")
print("=" * 45)

# Adventure bias thresholds
adv_thresholds = find_threshold_for_book_increase(adventure_results, 'adventure', 1.0)
print(f"\n🗺️  ADVENTURE BIAS (for +1 Adventure book/user):")
for N in N_values:
    topn = f'top_{N}'  # key format produced by find_threshold_for_book_increase
    hit = adv_thresholds.get(topn)
    if hit is None:
        # Sizes that never reach the target are absent from the dict; say so
        # explicitly instead of silently printing nothing for them.
        print(f"   {topn.upper()}: target not reached at any tested bias level")
    else:
        print(f"   {topn.upper()}: {hit['users']:,} users → {hit['change']:+.2f} books/user")

# Mystery bias thresholds
mys_thresholds = find_threshold_for_book_increase(mystery_results, 'mystery', 1.0)
print(f"\n🔍 MYSTERY BIAS (for +1 Mystery book/user):")
for N in N_values:
    topn = f'top_{N}'
    hit = mys_thresholds.get(topn)
    if hit is None:
        print(f"   {topn.upper()}: target not reached at any tested bias level")
    else:
        print(f"   {topn.upper()}: {hit['users']:,} users → {hit['change']:+.2f} books/user")

# Cross-genre contamination analysis: effect of biasing one genre on the other
print(f"\n🔄 CROSS-GENRE CONTAMINATION ANALYSIS:")
# Both frames are sorted ascending by num_biased_users, so the last row is the highest bias level.
max_adv_bias = adventure_results.iloc[-1]
max_mys_bias = mystery_results.iloc[-1]

print(f"\n🗺️  Adventure bias ({int(max_adv_bias['num_biased_users']):,} users) effects:")
for N in N_values:
    adv_change = max_adv_bias[f'top_{N}_adventure_change']
    mys_change = max_adv_bias[f'top_{N}_mystery_change']
    # "+" format flag on both values: a literal "+" would render a decrease as "+-0.12".
    print(f"   Top-{N}: Adventure {adv_change:+.2f}, Mystery {mys_change:+.2f} (contamination)")

print(f"\n🔍 Mystery bias ({int(max_mys_bias['num_biased_users']):,} users) effects:")
for N in N_values:
    mys_change = max_mys_bias[f'top_{N}_mystery_change']
    adv_change = max_mys_bias[f'top_{N}_adventure_change']
    print(f"   Top-{N}: Mystery {mys_change:+.2f}, Adventure {adv_change:+.2f} (contamination)")

## 6. Final Summary and Conclusions

In [None]:
# Generate final summary
print("📋 FINAL SUMMARY: TOP-N BIAS ANALYSIS")
print("=" * 50)

# For each list size, record the largest target-genre change observed in each
# experiment family and the biased-user count at which it occurred.
max_effects = {}
for N in N_values:
    adv_col = f'top_{N}_adventure_change'
    mys_col = f'top_{N}_mystery_change'

    # Adventure bias max effect
    max_adv_effect = adventure_results[adv_col].max()
    max_adv_users = adventure_results.loc[adventure_results[adv_col].idxmax(), 'num_biased_users']

    # Mystery bias max effect
    max_mys_effect = mystery_results[mys_col].max()
    max_mys_users = mystery_results.loc[mystery_results[mys_col].idxmax(), 'num_biased_users']

    max_effects[f'top_{N}'] = {
        'adventure_max_change': max_adv_effect,
        'adventure_max_users': int(max_adv_users),
        'mystery_max_change': max_mys_effect,
        'mystery_max_users': int(max_mys_users)
    }

print(f"\n🎯 MAXIMUM OBSERVED EFFECTS:")
for N in N_values:
    data = max_effects[f'top_{N}']
    baseline_adv = baseline[f'baseline_top_{N}_adventure']
    baseline_mys = baseline[f'baseline_top_{N}_mystery']

    # Same guard as the per-genre cells: a non-positive baseline has no
    # meaningful relative change, so report 0 instead of dividing by it.
    adv_percent = (data['adventure_max_change'] / baseline_adv * 100) if baseline_adv > 0 else 0
    mys_percent = (data['mystery_max_change'] / baseline_mys * 100) if baseline_mys > 0 else 0

    # The "+" format flag keeps the sign correct even if the best observed change is a decrease.
    print(f"\n📊 Top-{N} recommendations:")
    print(f"   🗺️  Adventure: {data['adventure_max_change']:+.2f} books/user ({data['adventure_max_users']:,} biased users)")
    print(f"      From {baseline_adv:.2f} → {baseline_adv + data['adventure_max_change']:.2f} "
          f"({adv_percent:+.1f}%)")

    print(f"   🔍 Mystery: {data['mystery_max_change']:+.2f} books/user ({data['mystery_max_users']:,} biased users)")
    print(f"      From {baseline_mys:.2f} → {baseline_mys + data['mystery_max_change']:.2f} "
          f"({mys_percent:+.1f}%)")

# Attack feasibility assessment
TOTAL_ORIGINAL_USERS = 53424  # From original dataset
total_original_users = TOTAL_ORIGINAL_USERS  # original name kept for any later cells

# NOTE(review): this is the smallest biased-user count at which any per-list
# *maximum* effect was observed, not a threshold for "significant" impact.
# Confirm that this is the intended definition before relying on the
# feasibility label printed below.
min_effective_users = min([data['adventure_max_users'] for data in max_effects.values()] +
                          [data['mystery_max_users'] for data in max_effects.values()])

attack_percentage = (min_effective_users / total_original_users) * 100

# Feasibility buckets: under 1% of users → HIGH, under 5% → MODERATE, otherwise LOW
if attack_percentage < 1:
    feasibility = 'HIGH'
elif attack_percentage < 5:
    feasibility = 'MODERATE'
else:
    feasibility = 'LOW'

print(f"\n⚠️  ATTACK FEASIBILITY ASSESSMENT:")
print(f"   Minimum users for significant impact: {min_effective_users:,}")
print(f"   Percentage of total users: {attack_percentage:.2f}%")
print(f"   Feasibility: {feasibility}")

# NOTE(review): the findings below are fixed text, not derived from the data
# loaded above; verify them against the printed numbers before sharing.
print(f"\n🔍 KEY FINDINGS:")
print(f"   1. Bias injection successfully increases target genre recommendations")
print(f"   2. Effects scale with recommendation list size (Top-15 < Top-25 < Top-35)")
print(f"   3. Cross-genre contamination effects are observable")
print(f"   4. Relatively small numbers of biased users can create measurable impact")
print(f"   5. Adventure and Mystery biases show similar propagation patterns")

# Save summary as a single-row CSV: one set of columns per list size plus the feasibility figures
summary_data = {'analysis_type': 'top_n_bias_analysis'}
for N in N_values:
    data = max_effects[f'top_{N}']
    summary_data.update({
        f'top_{N}_max_adventure_change': data['adventure_max_change'],
        f'top_{N}_max_adventure_users': data['adventure_max_users'],
        f'top_{N}_max_mystery_change': data['mystery_max_change'],
        f'top_{N}_max_mystery_users': data['mystery_max_users']
    })

summary_data.update({
    'min_effective_attack_size': min_effective_users,
    'attack_feasibility_percentage': attack_percentage
})

pd.DataFrame([summary_data]).to_csv('../results/biased/topN_analysis_summary.csv', index=False)

print(f"\n✅ Analysis complete!")
print(f"📁 Detailed results: ../results/biased/topN_bias_injection_results.csv")
print(f"📁 Summary: ../results/biased/topN_analysis_summary.csv")