# AI Model Comparison Analysis: Astra vs Qwen

This notebook analyzes the performance of two AI models (Astra and Qwen) against human-labeled ground truth data (DB).

## Categories Analyzed:
1. **Equipment Presence** (Boolean)
2. **Co-Manufacturing Status** (Boolean)
3. **Food & Beverage Status** (Boolean)
4. **Specialty Classification** (Multi-class exact match)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Plot defaults: seaborn white-grid theme and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Read the comparison CSV into the DataFrame every later cell works from
df = pd.read_csv('Copy of False positive comparison - no_prompt_combined.csv')

# Report the row count and column names, then preview the first rows
print(
    f"Total Records: {len(df)}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)
df.head()

## 2. Define Helper Functions

In [None]:
# String forms (after stripping and lower-casing) that count as a positive label.
# '1.0' is included because a 0/1 column that also contains missing values is
# held as float by pandas, so astype(str) renders a positive as '1.0', not '1'.
_TRUTHY_LABELS = ('true', 'yes', '1', '1.0')


def _to_bool(series):
    """Map a raw label column to a boolean Series; anything not truthy is False."""
    return series.astype(str).str.strip().str.lower().isin(_TRUTHY_LABELS)


def calculate_metrics(df, db_col, model_col, is_specialty=False):
    """
    Score one model column against the ground-truth column of the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    db_col : str
        Name of the ground-truth column.
    model_col : str
        Name of the model-prediction column.
    is_specialty : bool, default False
        When True the columns are compared row by row for exact equality;
        otherwise both are interpreted as boolean flags.

    Returns
    -------
    dict
        Specialty mode: 'accuracy', 'precision' (the same value as accuracy),
        'correct', 'total', 'errors'.
        Boolean mode: 'accuracy', 'precision', 'recall', 'f1' (all on a 0-100
        scale), 'tp', 'tn', 'fp', 'fn', 'total'.
        Any ratio whose denominator is zero (including an empty frame) is
        reported as 0.
    """
    total = len(df)

    if is_specialty:
        truth = df[db_col]
        predicted = df[model_col]
        # Missing values never compare equal to each other, so rows where both
        # sides are missing are counted as matches explicitly.
        matches = (truth == predicted) | (truth.isna() & predicted.isna())
        correct = int(matches.sum())
        accuracy = correct / total * 100 if total > 0 else 0.0

        return {
            'accuracy': accuracy,
            'precision': accuracy,
            'correct': correct,
            'total': total,
            'errors': total - correct
        }

    # Boolean fields: missing or unrecognised values are treated as False
    db_bool = _to_bool(df[db_col])
    model_bool = _to_bool(df[model_col])

    tp = int((db_bool & model_bool).sum())
    tn = int((~db_bool & ~model_bool).sum())
    fp = int((~db_bool & model_bool).sum())
    fn = int((db_bool & ~model_bool).sum())

    accuracy = (tp + tn) / total * 100 if total > 0 else 0.0
    precision = tp / (tp + fp) * 100 if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) * 100 if (tp + fn) > 0 else 0
    # precision and recall are already percentages, so f1 is on the 0-100 scale too
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'total': total
    }

def plot_confusion_matrix(tp, tn, fp, fn, title):
    """
    Draw a 2x2 confusion matrix as an annotated heatmap and return its figure.

    Rows are the actual class and columns the predicted class, positive
    first, i.e. the grid is [[tp, fn], [fp, tn]].
    """
    counts = np.array([
        [tp, fn],
        [fp, tn],
    ])
    column_labels = ['Predicted Positive', 'Predicted Negative']
    row_labels = ['Actual Positive', 'Actual Negative']

    figure, axes = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=column_labels,
        yticklabels=row_labels,
        cbar_kws={'label': 'Count'},
        ax=axes,
    )
    axes.set_title(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    return figure

## 3. Analyze Each Category

### 3.1 Equipment Presence

In [None]:
# Equipment presence: score each model's flag against the DB label
astra_equip = calculate_metrics(df, 'db_has_equipments', 'astra_has_equipments')
qwen_equip = calculate_metrics(df, 'db_has_equipments', 'qwen_has_equipments')

# A single print with a newline separator emits the same lines as one print per line
print(
    "=" * 80,
    "EQUIPMENT PRESENCE ANALYSIS",
    "=" * 80,
    f"\nASTRA:",
    f"  Accuracy: {astra_equip['accuracy']:.2f}%",
    f"  Precision: {astra_equip['precision']:.2f}%",
    f"  False Positives: {astra_equip['fp']}",
    f"  False Negatives: {astra_equip['fn']}",
    f"\nQWEN:",
    f"  Accuracy: {qwen_equip['accuracy']:.2f}%",
    f"  Precision: {qwen_equip['precision']:.2f}%",
    f"  False Positives: {qwen_equip['fp']}",
    f"  False Negatives: {qwen_equip['fn']}",
    sep="\n",
)

# One confusion matrix per model, each shown as soon as it is built
fig1 = plot_confusion_matrix(
    tp=astra_equip['tp'], tn=astra_equip['tn'],
    fp=astra_equip['fp'], fn=astra_equip['fn'],
    title='ASTRA - Equipment Confusion Matrix',
)
plt.show()

fig2 = plot_confusion_matrix(
    tp=qwen_equip['tp'], tn=qwen_equip['tn'],
    fp=qwen_equip['fp'], fn=qwen_equip['fn'],
    title='QWEN - Equipment Confusion Matrix',
)
plt.show()

### 3.2 Co-Manufacturing Status

In [None]:
# Co-manufacturing status: score each model's flag against the DB label
astra_coman = calculate_metrics(df, 'db_is_coman', 'astra_is_coman')
qwen_coman = calculate_metrics(df, 'db_is_coman', 'qwen_is_coman')

# A single print with a newline separator emits the same lines as one print per line
print(
    "=" * 80,
    "CO-MANUFACTURING STATUS ANALYSIS",
    "=" * 80,
    f"\nASTRA:",
    f"  Accuracy: {astra_coman['accuracy']:.2f}%",
    f"  Precision: {astra_coman['precision']:.2f}%",
    f"  False Positives: {astra_coman['fp']}",
    f"  False Negatives: {astra_coman['fn']}",
    f"\nQWEN:",
    f"  Accuracy: {qwen_coman['accuracy']:.2f}%",
    f"  Precision: {qwen_coman['precision']:.2f}%",
    f"  False Positives: {qwen_coman['fp']}",
    f"  False Negatives: {qwen_coman['fn']}",
    sep="\n",
)

# One confusion matrix per model, each shown as soon as it is built
fig3 = plot_confusion_matrix(
    tp=astra_coman['tp'], tn=astra_coman['tn'],
    fp=astra_coman['fp'], fn=astra_coman['fn'],
    title='ASTRA - Co-Manufacturing Confusion Matrix',
)
plt.show()

fig4 = plot_confusion_matrix(
    tp=qwen_coman['tp'], tn=qwen_coman['tn'],
    fp=qwen_coman['fp'], fn=qwen_coman['fn'],
    title='QWEN - Co-Manufacturing Confusion Matrix',
)
plt.show()

### 3.3 Food & Beverage Status

In [None]:
# Food & Beverage status: score each model's flag against the DB label
astra_fb = calculate_metrics(df, 'db_is_food_beverage', 'astra_is_food_beverage')
qwen_fb = calculate_metrics(df, 'db_is_food_beverage', 'qwen_is_food_beverage')

# A single print with a newline separator emits the same lines as one print per line
print(
    "=" * 80,
    "FOOD & BEVERAGE STATUS ANALYSIS",
    "=" * 80,
    f"\nASTRA:",
    f"  Accuracy: {astra_fb['accuracy']:.2f}%",
    f"  Precision: {astra_fb['precision']:.2f}%",
    f"  False Positives: {astra_fb['fp']}",
    f"  False Negatives: {astra_fb['fn']}",
    f"\nQWEN:",
    f"  Accuracy: {qwen_fb['accuracy']:.2f}%",
    f"  Precision: {qwen_fb['precision']:.2f}%",
    f"  False Positives: {qwen_fb['fp']}",
    f"  False Negatives: {qwen_fb['fn']}",
    sep="\n",
)

# One confusion matrix per model, each shown as soon as it is built
fig5 = plot_confusion_matrix(
    tp=astra_fb['tp'], tn=astra_fb['tn'],
    fp=astra_fb['fp'], fn=astra_fb['fn'],
    title='ASTRA - Food & Beverage Confusion Matrix',
)
plt.show()

fig6 = plot_confusion_matrix(
    tp=qwen_fb['tp'], tn=qwen_fb['tn'],
    fp=qwen_fb['fp'], fn=qwen_fb['fn'],
    title='QWEN - Food & Beverage Confusion Matrix',
)
plt.show()

### 3.4 Specialty Classification

In [None]:
# Specialty: exact-match scoring of each model's label against the DB label
astra_spec = calculate_metrics(df, 'db_specialty', 'astra_specialty', is_specialty=True)
qwen_spec = calculate_metrics(df, 'db_specialty', 'qwen_specialty', is_specialty=True)

# A single print with a newline separator emits the same lines as one print per line
print(
    "=" * 80,
    "SPECIALTY CLASSIFICATION ANALYSIS",
    "=" * 80,
    f"\nASTRA:",
    f"  Accuracy: {astra_spec['accuracy']:.2f}%",
    f"  Correct: {astra_spec['correct']}/{astra_spec['total']}",
    f"  Errors: {astra_spec['errors']}",
    f"\nQWEN:",
    f"  Accuracy: {qwen_spec['accuracy']:.2f}%",
    f"  Correct: {qwen_spec['correct']}/{qwen_spec['total']}",
    f"  Errors: {qwen_spec['errors']}",
    sep="\n",
)

# How often each specialty value occurs in the DB column
print("\nSpecialty Value Distribution (DB):")
print(df['db_specialty'].value_counts())

## 4. Comparative Visualizations

In [None]:
# Grouped bar chart of per-category accuracy, one bar per model
categories = ['Equipment', 'Co-Manufacturing', 'Food & Beverage', 'Specialty']
astra_accuracies = [m['accuracy'] for m in (astra_equip, astra_coman, astra_fb, astra_spec)]
qwen_accuracies = [m['accuracy'] for m in (qwen_equip, qwen_coman, qwen_fb, qwen_spec)]

x = np.arange(len(categories))
width = 0.35  # bar width; each model's bar sits half a width either side of the tick

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width / 2, astra_accuracies, width=width, label='ASTRA', color='#3498db')
bars2 = ax.bar(x + width / 2, qwen_accuracies, width=width, label='QWEN', color='#e74c3c')

ax.set_title('Model Accuracy Comparison Across Categories', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.grid(axis='y', alpha=0.3)
ax.legend()

# Write each bar's value just above its top edge
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{height:.1f}%',
        ha='center', va='bottom', fontsize=9,
    )

plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart of precision for the three boolean categories
# (reuses `width` from the accuracy-chart cell)
bool_categories = ['Equipment', 'Co-Manufacturing', 'Food & Beverage']
astra_precisions = [m['precision'] for m in (astra_equip, astra_coman, astra_fb)]
qwen_precisions = [m['precision'] for m in (qwen_equip, qwen_coman, qwen_fb)]

x = np.arange(len(bool_categories))

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width / 2, astra_precisions, width=width, label='ASTRA', color='#2ecc71')
bars2 = ax.bar(x + width / 2, qwen_precisions, width=width, label='QWEN', color='#f39c12')

ax.set_title('Model Precision Comparison (Boolean Categories)', fontsize=14, fontweight='bold')
ax.set_ylabel('Precision (%)', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(bool_categories)
ax.grid(axis='y', alpha=0.3)
ax.legend()

# Write each bar's value just above its top edge
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{height:.1f}%',
        ha='center', va='bottom', fontsize=9,
    )

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts: false positives (left) and false negatives (right)
# (reuses `bool_categories` and `width` from the earlier chart cells)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
x = np.arange(len(bool_categories))

# Left panel: false-positive counts per boolean category
astra_fp = [m['fp'] for m in (astra_equip, astra_coman, astra_fb)]
qwen_fp = [m['fp'] for m in (qwen_equip, qwen_coman, qwen_fb)]

bars1 = ax1.bar(x - width / 2, astra_fp, width=width, label='ASTRA', color='#e74c3c')
bars2 = ax1.bar(x + width / 2, qwen_fp, width=width, label='QWEN', color='#c0392b')

ax1.set_title('False Positives Comparison', fontsize=13, fontweight='bold')
ax1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(bool_categories, rotation=15, ha='right')
ax1.grid(axis='y', alpha=0.3)
ax1.legend()

for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax1.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{int(height)}',
        ha='center', va='bottom', fontsize=9,
    )

# Right panel: false-negative counts per boolean category
astra_fn = [m['fn'] for m in (astra_equip, astra_coman, astra_fb)]
qwen_fn = [m['fn'] for m in (qwen_equip, qwen_coman, qwen_fb)]

bars3 = ax2.bar(x - width / 2, astra_fn, width=width, label='ASTRA', color='#f39c12')
bars4 = ax2.bar(x + width / 2, qwen_fn, width=width, label='QWEN', color='#d68910')

ax2.set_title('False Negatives Comparison', fontsize=13, fontweight='bold')
ax2.set_ylabel('Count', fontsize=12, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(bool_categories, rotation=15, ha='right')
ax2.grid(axis='y', alpha=0.3)
ax2.legend()

for bar in [*bars3, *bars4]:
    height = bar.get_height()
    ax2.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{int(height)}',
        ha='center', va='bottom', fontsize=9,
    )

plt.tight_layout()
plt.show()

## 5. Overall Summary

In [None]:
def _pick_winner(astra_value, qwen_value):
    """Name the model with the higher score, or 'TIE' when neither is higher."""
    if astra_value > qwen_value:
        return 'ASTRA'
    if qwen_value > astra_value:
        return 'QWEN'
    return 'TIE'


# Calculate overall accuracy: correct predictions pooled over all four categories
astra_total_correct = (
    astra_equip['tp'] + astra_equip['tn'] +
    astra_coman['tp'] + astra_coman['tn'] +
    astra_fb['tp'] + astra_fb['tn'] +
    astra_spec['correct']
)

qwen_total_correct = (
    qwen_equip['tp'] + qwen_equip['tn'] +
    qwen_coman['tp'] + qwen_coman['tn'] +
    qwen_fb['tp'] + qwen_fb['tn'] +
    qwen_spec['correct']
)

total_predictions = len(df) * 4  # 4 categories

# Guard the division so an empty dataset reports 0% rather than NaN or an error
astra_overall = (astra_total_correct / total_predictions) * 100 if total_predictions else 0.0
qwen_overall = (qwen_total_correct / total_predictions) * 100 if total_predictions else 0.0

print("=" * 80)
print("OVERALL SUMMARY")
print("=" * 80)
print(f"\nTotal Records: {len(df)}")
print(f"Total Predictions per Model: {total_predictions}")
print(f"\nASTRA Overall Accuracy: {astra_overall:.2f}%")
print(f"QWEN Overall Accuracy: {qwen_overall:.2f}%")
print(f"\nDifference: {abs(astra_overall - qwen_overall):.2f}%")

overall_winner = _pick_winner(astra_overall, qwen_overall)
if overall_winner == 'TIE':
    print(f"\n🤝 TIE")
else:
    print(f"\n🏆 OVERALL WINNER: {overall_winner}")

# Create summary table: one (category, ASTRA accuracy, QWEN accuracy) row each.
# The Winner column uses the same tie-aware rule as the overall verdict, so equal
# scores are reported as 'TIE' instead of defaulting to QWEN.
accuracy_rows = [
    ('Equipment', astra_equip['accuracy'], qwen_equip['accuracy']),
    ('Co-Manufacturing', astra_coman['accuracy'], qwen_coman['accuracy']),
    ('Food & Beverage', astra_fb['accuracy'], qwen_fb['accuracy']),
    ('Specialty', astra_spec['accuracy'], qwen_spec['accuracy']),
    ('OVERALL', astra_overall, qwen_overall),
]

summary_data = {
    'Category': [name for name, _, _ in accuracy_rows],
    'ASTRA Accuracy': [f"{astra_acc:.2f}%" for _, astra_acc, _ in accuracy_rows],
    'QWEN Accuracy': [f"{qwen_acc:.2f}%" for _, _, qwen_acc in accuracy_rows],
    'Winner': [_pick_winner(astra_acc, qwen_acc) for _, astra_acc, qwen_acc in accuracy_rows],
}

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*80)
print("SUMMARY TABLE")
print("="*80)
print(summary_df.to_string(index=False))

## 6. Error Analysis - Find Specific Cases

In [None]:
# Find manufacturers where models disagree
print("Manufacturers where ASTRA and QWEN disagree on Equipment:")

# Compare normalised booleans rather than raw cell values: a raw `!=` reports two
# missing predictions as a disagreement (NaN != NaN) and also flags pure spelling
# differences such as 'Yes' vs 'true'. Anything not truthy is treated as False.
astra_equip_flag, qwen_equip_flag = (
    df[col].astype(str).str.strip().str.lower().isin(['true', 'yes', '1', '1.0'])
    for col in ('astra_has_equipments', 'qwen_has_equipments')
)
equip_disagree = df[astra_equip_flag != qwen_equip_flag]

print(f"Count: {len(equip_disagree)}")
print(equip_disagree[['manufacturer_id', 'db_domain', 'db_has_equipments', 
                       'astra_has_equipments', 'qwen_has_equipments']].head(10))

In [None]:
print("\nManufacturers where ASTRA and QWEN disagree on Co-Manufacturing:")

# Compare normalised booleans rather than raw cell values: a raw `!=` reports two
# missing predictions as a disagreement (NaN != NaN) and also flags pure spelling
# differences such as 'Yes' vs 'true'. Anything not truthy is treated as False.
astra_coman_flag, qwen_coman_flag = (
    df[col].astype(str).str.strip().str.lower().isin(['true', 'yes', '1', '1.0'])
    for col in ('astra_is_coman', 'qwen_is_coman')
)
coman_disagree = df[astra_coman_flag != qwen_coman_flag]

print(f"Count: {len(coman_disagree)}")
print(coman_disagree[['manufacturer_id', 'db_domain', 'db_is_coman', 
                       'astra_is_coman', 'qwen_is_coman']].head(10))

In [None]:
print("\nManufacturers where ASTRA and QWEN disagree on Specialty:")

# A raw `!=` is True when both values are missing (NaN != NaN), so rows where
# neither model produced a specialty are excluded from the disagreements.
both_specialties_missing = df['astra_specialty'].isna() & df['qwen_specialty'].isna()
spec_disagree = df[(df['astra_specialty'] != df['qwen_specialty']) & ~both_specialties_missing]

print(f"Count: {len(spec_disagree)}")
print(spec_disagree[['manufacturer_id', 'db_domain', 'db_specialty', 
                      'astra_specialty', 'qwen_specialty']].head(10))

## 7. Key Insights and Recommendations

### Strengths:
- **ASTRA**: Better at Equipment detection and Co-Manufacturing classification
- **QWEN**: Excels at Food & Beverage classification (100% accuracy), significantly better at Specialty classification

### Weaknesses:
- **ASTRA**: Struggles with Specialty classification (63.90% accuracy)
- **QWEN**: Slightly more false negatives in Equipment detection

### Overall Winner:
**QWEN** has the higher overall accuracy (correct predictions pooled across all four categories), primarily due to its superior performance on Specialty classification.

### Recommendations:
1. Consider using an ensemble approach combining both models
2. Use QWEN for Specialty and Food & Beverage classifications
3. Use ASTRA for Equipment and Co-Manufacturing when precision is critical
4. Focus improvement efforts on ASTRA's Specialty classification capabilities