## Setup Environment

In [2]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from src.advanced_visualizer import AdvancedKnapsackVisualizer

# Plot styling shared by every figure in this notebook. The seaborn style is
# applied first so the explicit rcParams below take precedence over it.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.dpi': 100,
    'font.size': 10,
})

# Status marker was mojibake ("‚úÖ"); restored to the intended check-mark emoji.
print("✅ Environment setup complete")

ModuleNotFoundError: No module named 'numpy'

---
## Load Data Characteristics Results

In [None]:
# Load the experiment 3.1.3 results table. The path is relative to the
# notebook's working directory.
df_data = pd.read_csv('../results/chapter3/3_1_3_data_characteristics.csv')

print("📊 Data Characteristics Impact Analysis:\n")
display(df_data)

# List the test cases belonging to each characteristic. A single groupby
# replaces re-filtering the frame once per characteristic; sort=False keeps
# the characteristics in the order they first appear in the CSV.
print("\n📋 Test cases analyzed:")
for char_type, cases in df_data.groupby('characteristic', sort=False)['test_case']:
    # Cast to str so non-string labels (e.g. numeric ids) cannot break join().
    print(f"  • {char_type}: {', '.join(cases.astype(str))}")

---
## Comprehensive Visualization

In [None]:
# Build the combined data-characteristics figure from the loaded results.
# NOTE(review): plot_data_characteristics_impact lives in
# src.advanced_visualizer and is not visible here; it is presumed to render
# the figure and write it to `save_path` - confirm against that module.
visualizer = AdvancedKnapsackVisualizer()

fig = visualizer.plot_data_characteristics_impact(
    df_data,
    title="3.1.3: Data Characteristics Impact Analysis",
    save_path='../results/chapter3/3_1_3_data_visualization.png'
)

plt.show()

# The message reports the project-relative location of the file; the status
# marker was mojibake ("‚úÖ") and is restored to the intended emoji.
print("\n✅ Visualization saved to: results/chapter3/3_1_3_data_visualization.png")

---
## Statistical Analysis by Characteristic

In [None]:
# Per-test-case report: value, runtime and solution quality of each algorithm,
# grouped by data characteristic, followed by a quality-based winner.
RULE = "=" * 80  # horizontal separator reused by every section header

print("📊 DETAILED ANALYSIS BY DATA CHARACTERISTIC\n")
print(RULE)

# One groupby instead of re-filtering the frame per characteristic;
# sort=False keeps the order in which characteristics appear in the data.
for characteristic, subset in df_data.groupby('characteristic', sort=False):
    print(f"\n{RULE}")
    # str() guards against non-string labels, which have no .upper().
    print(f"Characteristic: {str(characteristic).upper()}")
    print(RULE)

    for _, row in subset.iterrows():
        gbfs_quality = row['gbfs_pct_optimal']
        bpso_quality = row['bpso_pct_optimal']

        print(f"\nTest Case: {row['test_case']}")
        print(f"  GBFS:  Value={row['gbfs_value']:.0f}  Time={row['gbfs_time']:.6f}s  Quality={gbfs_quality:.2f}%")
        print(f"  BPSO:  Value={row['bpso_value']:.0f}  Time={row['bpso_time']:.6f}s  Quality={bpso_quality:.2f}%")
        # DP quality is printed as a fixed 100.00% rather than read from the data.
        print(f"  DP:    Value={row['dp_value']:.0f}  Time={row['dp_time']:.6f}s  Quality=100.00%")

        # Winner is decided on solution quality only; runtime is not considered.
        if gbfs_quality > bpso_quality:
            print("  🏆 Winner: GBFS (better quality)")
        elif bpso_quality > gbfs_quality:
            print("  🏆 Winner: BPSO (better quality)")
        else:
            print("  🏆 Tie (same quality)")

print(f"\n{RULE}")

---
## Summary Statistics

In [None]:
# Mean solution quality and mean runtime per data characteristic.
# Columns are relabelled by source name via rename(); assigning
# `summary.columns` positionally would silently mislabel the table if the
# aggregation dict were ever reordered or extended.
summary = (
    df_data.groupby('characteristic')
    .agg({
        'gbfs_pct_optimal': 'mean',
        'bpso_pct_optimal': 'mean',
        'gbfs_time': 'mean',
        'bpso_time': 'mean',
        'dp_time': 'mean',
    })
    .round(4)
    .rename(columns={
        'gbfs_pct_optimal': 'GBFS Quality %',
        'bpso_pct_optimal': 'BPSO Quality %',
        'gbfs_time': 'GBFS Time (s)',
        'bpso_time': 'BPSO Time (s)',
        'dp_time': 'DP Time (s)',
    })
)

print("\n📊 SUMMARY BY CHARACTERISTIC TYPE\n")
display(summary)

# Shortfall of each heuristic from 100% quality, per characteristic.
# Rounded again because `100 - x` on already-rounded floats can reintroduce
# binary floating-point noise in the last digits.
print("\n📉 PERFORMANCE DEGRADATION FROM OPTIMAL:\n")
degradation = pd.DataFrame({
    'Characteristic': summary.index,
    'GBFS Gap (%)': (100 - summary['GBFS Quality %']).round(4).to_numpy(),
    'BPSO Gap (%)': (100 - summary['BPSO Quality %']).round(4).to_numpy(),
})
display(degradation)

---
## Insights and Conclusions

### 📊 Key Findings:

#### 1. **Correlation Impact:**
- **Low Correlation**: Items có weight/value độc lập
  - GBFS: ~99% optimal (heuristic works well)
  - BPSO: ~66% optimal (harder to optimize)
- **High Correlation**: Weight và value tương quan cao
  - GBFS: ~99% optimal (still effective)
  - BPSO: ~58% optimal (more challenging)

**Insight:** High correlation làm search space phức tạp hơn cho BPSO

---

#### 2. **Value Distribution:**
- **High Value Items**: Tập trung nhiều items có value cao
  - GBFS: ~99% optimal (greedy picks high value)
  - BPSO: ~77% optimal (improved with valuable items)

**Insight:** BPSO performs better khi có nhiều high-value items (more attractive for swarm)

---

#### 3. **Regional Diversity:**
- **Region 1**: Single region, homogeneous
  - GBFS: ~99% optimal
  - BPSO: ~60% optimal
- **Region 3**: Multiple regions, diverse
  - GBFS: ~100% optimal
  - BPSO: ~62% optimal

**Insight:** Regional diversity không ảnh hưởng nhiều đến performance

---

### 🎯 Algorithm Sensitivity Summary:

| Algorithm | Most Sensitive To | Least Sensitive To |
|-----------|------------------|--------------------|
| **GBFS** | None (stable ~99%) | All characteristics |
| **BPSO** | Correlation, Value distribution | Regional diversity |
| **DP** | None (always 100%) | All characteristics |

---

### 💡 Practical Implications:

1. **GBFS is Robust:**
   - Performance ổn định trên mọi data characteristics
   - Value/weight heuristic very effective
   - Recommended cho production use

2. **BPSO Needs Tuning:**
   - Sensitive to data characteristics
   - May need adaptive parameters
   - Better với high-value, low-correlation data

3. **Data Preprocessing Matters:**
   - Normalizing values có thể help BPSO
   - Feature engineering cho heuristics
   - Consider data characteristics khi chọn algorithm

---

### 🔍 Future Work:

1. **Adaptive BPSO:** Adjust parameters based on detected data characteristics
2. **Hybrid Approach:** Use GBFS to warm-start BPSO
3. **Feature Analysis:** Deep dive into which features affect each algorithm
4. **More Test Cases:** Expand to extreme cases (very high/low correlation)