# LLM Sycophancy Analysis - Part 2: Stance Elasticity & Devil's Advocate

This notebook analyzes stance-elasticity curves and devil's-advocate deltas, using the processed outputs of notebook 01.

## Key Analyses
1. **Stance-Elasticity Curves**: How models change agreement based on prompt strength
2. **Devil's-Advocate Delta**: Resistance to critical thinking instructions
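3. **Topic-Transfer Effects**: How elasticity varies across different topics
4. **Elasticity vs. Delta Correlation**: Whether stance elasticity and devil's-advocate resistance are related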

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Global plotting configuration: matplotlib style sheet and seaborn colour palette
# applied to every figure produced below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Processed Data

In [None]:
# Load processed data from notebook 01.
# All three artefacts live in the same run directory, so the location is named
# once here; point RESULTS_DIR at another run to re-use the notebook.
RESULTS_DIR = Path('../results/combined_run_0c_1_1b')

scored_df = pd.read_json(RESULTS_DIR / 'scored_responses_processed.json')
delta_df = pd.read_json(RESULTS_DIR / 'delta_processed.json')

# The curves file is read with json.load (not pandas) so it stays a plain dict,
# which later cells iterate with .items(). Explicit UTF-8 avoids depending on
# the platform's default text encoding.
with open(RESULTS_DIR / 'strength_curves.json', 'r', encoding='utf-8') as f:
    curves_data = json.load(f)

print(f"Loaded {len(scored_df)} scored responses")
print(f"Loaded {len(delta_df)} delta measurements")
print(f"Loaded curves for {len(curves_data)} models")

## 2. Stance-Elasticity Curves Analysis

In [None]:
# Flatten the nested curve dictionaries into one record per
# (model, prompt type, strength point). The curve-level summary values
# (ols_slope, AE_standardized) are repeated on every point of that curve.
elasticity_data = [
    {
        'model': model_name,
        'prompt_type': kind,
        'strength': pt['strength'],
        'endorse_stance': pt['endorse'],
        'ols_slope': model_curves[kind]['ols_slope'],
        'AE_standardized': model_curves[kind]['AE_standardized'],
    }
    for model_name, model_curves in curves_data.items()
    for kind in ('regular', 'devil')
    if kind in model_curves  # a model may lack one of the two prompt types
    for pt in model_curves[kind]['points']
]

elasticity_df = pd.DataFrame(elasticity_data)
print(f"Elasticity data points: {len(elasticity_df)}")
elasticity_df.head()

### Chart 2: Small-Multiples Line Plots - Stance Elasticity by Model

In [None]:
# Small-multiples plot for stance elasticity: one panel per model,
# one line per prompt type (regular vs devil's advocate).
models = elasticity_df['model'].unique()
n_models = len(models)
cols = 3
# Ceil-divide into rows, but keep at least one row: plt.subplots rejects a
# zero-row grid, which is what an empty model list would otherwise request.
rows = max(1, (n_models + cols - 1) // cols)

# squeeze=False always returns a 2-D array of Axes, so ravel() yields a flat
# sequence for every rows/cols combination without special-casing.
fig, axes = plt.subplots(rows, cols, figsize=(15, 4*rows), squeeze=False)
axes = axes.ravel()

for ax, model in zip(axes, models):
    model_data = elasticity_df[elasticity_df['model'] == model]

    for prompt_type in ['regular', 'devil']:
        type_data = model_data[model_data['prompt_type'] == prompt_type]
        if not type_data.empty:
            ax.plot(type_data['strength'], type_data['endorse_stance'],
                    marker='o', label=prompt_type, linewidth=2)

    # Annotate with the AE of the regular-prompt curve. When a model has no
    # regular curve the value is shown as NaN rather than 0, because 0 would
    # read as a measured "no elasticity" result.
    regular_data = model_data[model_data['prompt_type'] == 'regular']
    regular_ae = regular_data['AE_standardized'].iloc[0] if not regular_data.empty else np.nan
    ax.set_title(f'{model}\nAE = {regular_ae:.3f}', fontsize=10)
    ax.set_xlabel('Strength')
    ax.set_ylabel('Endorse Stance')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the unused panels at the end of the grid
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Stance-Elasticity Curves by Model', fontsize=16, y=1.02)
plt.show()

### Elasticity Index Rankings

In [None]:
# Rank models by the standardized AE of their regular-prompt curve, highest first.
regular_rows = elasticity_df[elasticity_df['prompt_type'] == 'regular']
ae_rankings = (
    regular_rows
    .groupby('model')['AE_standardized']
    .first()  # AE is a per-curve value repeated on every point, so any row will do
    .sort_values(ascending=False)
)

print("=== STANCE-ELASTICITY RANKINGS ===")
print("(Higher AE = more sensitive to prompt strength)\n")

for rank, (model_name, ae_value) in enumerate(ae_rankings.items(), start=1):
    # Bucket the AE value: > 0.1 is High, > 0.05 is Medium, everything else Low
    if ae_value > 0.1:
        category = "High"
    elif ae_value > 0.05:
        category = "Medium"
    else:
        category = "Low"
    print(f"{rank:2d}. {model_name:<25} AE = {ae_value:.3f} ({category})")

# Bar chart of the same ranking, with the "High" cut-off drawn as a reference line
bar_positions = range(len(ae_rankings))
plt.figure(figsize=(12, 6))
bars = plt.bar(bar_positions, ae_rankings.values)
plt.xticks(bar_positions, ae_rankings.index, rotation=45, ha='right')
plt.ylabel('Attitude Elasticity (AE)')
plt.title('Model Stance-Elasticity Rankings')
plt.axhline(y=0.1, color='red', linestyle='--', alpha=0.7, label='High Elasticity Threshold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Devil's-Advocate Delta Analysis

### Chart 3: Heat-map - Devil's Advocate Resistance by Model and Topic

In [None]:
# Build the model x topic matrix of deltas for the heatmap.
# pivot_table with a mean aggregation is used instead of pivot so that repeated
# (model, topic) rows are averaged rather than raising a duplicate-entries
# error; with one row per pair the values are unchanged. This matches the
# groupby(...).mean() aggregation used for the delta statistics.
delta_pivot = delta_df.pivot_table(index='model', columns='topic', values='delta', aggfunc='mean')

# Diverging colour map centred on zero so the sign of the delta is readable at a glance
plt.figure(figsize=(12, 8))
sns.heatmap(delta_pivot, annot=True, fmt='.3f', cmap='RdYlBu_r', center=0,
            cbar_kws={'label': "Devil's Advocate Delta (Δ)"},
            linewidths=0.5)
plt.title("Devil's Advocate Resistance by Model and Topic\n(Positive Δ = Model Resists Critical Instructions)")
plt.xlabel('Topic')
plt.ylabel('Model')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("\n=== INTERPRETATION ===")
print("• Positive Δ: Model maintains stance despite 'devil's advocate' instruction")
print("• Negative Δ: Model becomes more critical when asked")
print("• Zero Δ: No difference between regular and devil's advocate prompts")

### Delta Statistics and Insights

In [None]:
# Summary statistics for the devil's-advocate delta: overall distribution,
# per-model ranking and per-topic ranking.
print("=== DEVIL'S ADVOCATE DELTA STATISTICS ===")

# Overall statistics (missing deltas are excluded)
valid_deltas = delta_df['delta'].dropna()
print("\nOverall Delta Distribution:")
print(f"Mean: {valid_deltas.mean():.3f}")
print(f"Std:  {valid_deltas.std():.3f}")
print(f"Min:  {valid_deltas.min():.3f}")
print(f"Max:  {valid_deltas.max():.3f}")

# Per-model mean delta, highest (most resistant) first.
# model_avg_delta keeps this descending order because the summary cell reads
# its first entry as the most resistant model.
model_avg_delta = delta_df.groupby('model')['delta'].mean().sort_values(ascending=False)

# Models whose mean is NaN (no valid delta) sort last and would otherwise be
# listed among the "most compliant", so they are left out of both rankings.
ranked_models = model_avg_delta.dropna()

print("\nMost Resistant Models (Top 5):")
for i, (model, delta) in enumerate(ranked_models.head().items(), 1):
    print(f"{i}. {model}: Δ = {delta:.3f}")

# Most compliant models (lowest/negative delta): re-sort ascending so that
# rank 1 is the lowest delta instead of the fifth-lowest.
print("\nMost Compliant Models (Bottom 5):")
for i, (model, delta) in enumerate(ranked_models.sort_values(ascending=True).head().items(), 1):
    print(f"{i}. {model}: Δ = {delta:.3f}")

# Topic-specific resistance, highest mean delta first
topic_avg_delta = delta_df.groupby('topic')['delta'].mean().sort_values(ascending=False)
print("\nTopics with Highest Resistance:")
for topic, delta in topic_avg_delta.items():
    print(f"• {topic}: Δ = {delta:.3f}")

## 4. Topic-Transfer Analysis

In [None]:
# Analyze within-model topic variance
from matplotlib.patches import Patch  # used only for the custom legend below


def _topic_variability(scored, cv_threshold=0.2):
    """Summarize how much each model's mean endorsement varies across topics.

    Parameters
    ----------
    scored : pandas.DataFrame
        Must contain the columns 'model', 'topic' and 'endorse_stance'.
    cv_threshold : float
        Models whose coefficient of variation is below this value are labelled
        'Stable'; all others are labelled 'Chameleon'.

    Returns
    -------
    pandas.DataFrame
        One row per model that has more than one topic, with the columns
        'model', 'topic_cv', 'topic_std', 'topic_mean' and 'stability'.
        The columns are present even when no model qualifies.
    """
    records = []
    # sort=False keeps models in order of first appearance
    for model, model_data in scored.groupby('model', sort=False):
        topic_means = model_data.groupby('topic')['endorse_stance'].mean()
        if len(topic_means) <= 1:
            continue  # spread across topics is undefined for a single topic

        mean_across_topics = topic_means.mean()
        std_across_topics = topic_means.std()
        # Divide by |mean|: with a signed mean, a negative average endorsement
        # would give a negative CV and always be classified as 'Stable'.
        # A zero mean is reported as CV 0 (the ratio is undefined there).
        cv = std_across_topics / abs(mean_across_topics) if mean_across_topics != 0 else 0
        records.append({
            'model': model,
            'topic_cv': cv,
            'topic_std': std_across_topics,
            'topic_mean': mean_across_topics,
            'stability': 'Stable' if cv < cv_threshold else 'Chameleon'
        })

    # Explicit columns so that downstream column lookups work on an empty result
    return pd.DataFrame(
        records,
        columns=['model', 'topic_cv', 'topic_std', 'topic_mean', 'stability'],
    )


if 'endorse_stance' in scored_df.columns:
    # Calculate coefficient of variation for each model across topics
    variance_df = _topic_variability(scored_df)

    # Visualization: one bar per model, coloured by stability class
    plt.figure(figsize=(12, 6))
    colors = ['skyblue' if x == 'Stable' else 'orange' for x in variance_df['stability']]
    bars = plt.bar(range(len(variance_df)), variance_df['topic_cv'], color=colors)
    plt.xticks(range(len(variance_df)), variance_df['model'], rotation=45, ha='right')
    plt.ylabel('Coefficient of Variation Across Topics')
    plt.title('Model Stability vs Chameleon Behavior Across Topics')
    plt.axhline(y=0.2, color='red', linestyle='--', alpha=0.7, label='Stability Threshold')

    # Custom legend: the bars carry no labels, so the handles are built by hand
    legend_elements = [Patch(facecolor='skyblue', label='Stable'),
                       Patch(facecolor='orange', label='Chameleon'),
                       plt.Line2D([0], [0], color='red', linestyle='--', label='Threshold')]
    plt.legend(handles=legend_elements)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\n=== TOPIC STABILITY ANALYSIS ===")
    stable_models = variance_df[variance_df['stability'] == 'Stable']['model'].tolist()
    chameleon_models = variance_df[variance_df['stability'] == 'Chameleon']['model'].tolist()

    print(f"Stable Models ({len(stable_models)}): {', '.join(stable_models)}")
    print(f"Chameleon Models ({len(chameleon_models)}): {', '.join(chameleon_models)}")

## 5. Correlation Analysis: Elasticity vs Delta

In [None]:
# Merge elasticity and delta data for correlation analysis:
# one row per model with its regular-prompt AE and its mean delta across topics.
model_ae = elasticity_df[elasticity_df['prompt_type'] == 'regular'].groupby('model')['AE_standardized'].first()
model_delta = delta_df.groupby('model')['delta'].mean()

# reindex aligns the deltas to the AE models; models missing either value are dropped
correlation_data = pd.DataFrame({
    'model': model_ae.index,
    'AE_standardized': model_ae.values,
    'avg_delta': model_delta.reindex(model_ae.index).values
}).dropna()

# Pearson correlation. It is NaN when fewer than two models remain or when one
# of the variables is constant, so that case is reported separately below
# instead of being read as a weak correlation.
correlation = correlation_data['AE_standardized'].corr(correlation_data['avg_delta'])
correlation_defined = pd.notna(correlation)

# Scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(correlation_data['AE_standardized'], correlation_data['avg_delta'],
            s=100, alpha=0.7, c='steelblue')

# Add model labels
for _, row in correlation_data.iterrows():
    plt.annotate(row['model'], (row['AE_standardized'], row['avg_delta']),
                 xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.8)

# Add trend line. A degree-1 fit needs at least two distinct x values;
# otherwise the fit is ill-conditioned, so the line is skipped.
if correlation_data['AE_standardized'].nunique() >= 2:
    z = np.polyfit(correlation_data['AE_standardized'], correlation_data['avg_delta'], 1)
    p = np.poly1d(z)
    plt.plot(correlation_data['AE_standardized'], p(correlation_data['AE_standardized']),
             "r--", alpha=0.8, linewidth=2)

plt.xlabel('Attitude Elasticity (AE)')
plt.ylabel('Average Devil\'s Advocate Delta (Δ)')
plt.title(f'Elasticity vs Devil\'s Advocate Resistance\n(Correlation: r = {correlation:.3f})')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\n=== CORRELATION ANALYSIS ===")
print(f"Correlation between AE and Δ: r = {correlation:.3f}")
if not correlation_defined:
    print("Correlation is undefined - not enough models with both AE and Δ (or no variation in one of them)")
elif abs(correlation) > 0.5:
    direction = "positive" if correlation > 0 else "negative"
    print(f"Strong {direction} correlation detected!")
    if correlation > 0:
        print("→ Models with higher elasticity tend to resist devil's advocate instructions more")
    else:
        print("→ Models with higher elasticity tend to be more compliant with devil's advocate instructions")
else:
    print("Weak correlation - elasticity and devil's advocate resistance appear independent")

## 6. Key Insights Summary

In [None]:
# Headline summary built from the results of the earlier cells:
# ae_rankings and model_avg_delta (both sorted descending), correlation, and,
# when the topic-transfer cell ran, variance_df.
print("=== STANCE ELASTICITY & DEVIL'S ADVOCATE: KEY INSIGHTS ===")
print()

# Top insights: both series are sorted descending, so position 0 is the maximum
highest_ae_model = ae_rankings.index[0]
highest_ae_value = ae_rankings.iloc[0]

most_resistant_model = model_avg_delta.index[0]
most_resistant_delta = model_avg_delta.iloc[0]

print(f"🎯 ELASTICITY CHAMPION: {highest_ae_model}")
print(f"   • AE = {highest_ae_value:.3f} - most sensitive to prompt strength")
print("   • Flips from disagree to agree with strength changes")
print()

print(f"🛡️  RESISTANCE CHAMPION: {most_resistant_model}")
print(f"   • Δ = {most_resistant_delta:.3f} - maintains stance despite critical instructions")
print("   • Shows strong sycophantic tendencies")
print()

# variance_df only exists when the topic-transfer cell found an 'endorse_stance'
# column, so it is looked up defensively. The most stable model is the 'Stable'
# one with the smallest CV magnitude, not simply the first one in list order.
topic_stability = globals().get('variance_df')
if topic_stability is not None and 'stability' in topic_stability.columns:
    stable_rows = topic_stability[topic_stability['stability'] == 'Stable']
    if not stable_rows.empty:
        most_stable_model = stable_rows.loc[stable_rows['topic_cv'].abs().idxmin(), 'model']
        print(f"⚖️  MOST STABLE: {most_stable_model}")
        print("   • Consistent behavior across all topics")
        print()

print("📊 CORRELATION INSIGHT:")
if pd.isna(correlation):
    # An undefined correlation must not be reported as "independent traits"
    print("   • Correlation undefined - not enough data to compare elasticity and resistance")
elif abs(correlation) > 0.5:
    print(f"   • Strong correlation (r={correlation:.3f}) between elasticity and resistance")
else:
    print(f"   • Weak correlation (r={correlation:.3f}) - independent traits")
print()

print("📈 NEXT ANALYSIS: Harm Validation Profiling (Notebook 03)")