# DilemmaAgent: LLM Behavior in Game Theoretical Scenarios Analysis

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from app.utils.experiment_storage import ExperimentStorage
from app.utils.experiment_analyzer import ExperimentAnalyzer

## 1. Load Data

In [None]:
import os
print("Current working directory:", os.getcwd())

In [2]:
storage = ExperimentStorage()
experiments_df = storage.get_experiments_summary()

## 2. Cooperation Rate Analysis

In [None]:
def plot_cooperation_rates(experiment_id):
    results = storage.get_experiment_results(experiment_id)
    games_df = pd.DataFrame([
        {
            'game_number': i,
            'cooperation_rate': game.cooperation_rate,
            'strategy': results.player2_strategy
        }
        for i, game in enumerate(results.games)
    ])
    
    plt.figure(figsize=(12,6))
    sns.boxplot(data=games_df, x='strategy', y='cooperation_rate')
    plt.title(f'Cooperation Rates by Strategy\nMatrix: {results.matrix_type}')
    plt.show()


## 3. Score Distribution Analysis

In [None]:
def plot_score_distributions(experiment_id):
    results = storage.get_experiment_results(experiment_id)
    scores_df = pd.DataFrame([
        {
            'game_number': i,
            'ai_score': game.final_scores[0],
            'opponent_score': game.final_scores[1],
            'strategy': results.player2_strategy,
        }
        for i, game in enumerate(results.games)
    ])
    
    plt.figure(figsize=(12,6))
    sns.violinplot(data=scores_df.melt(id_vars=['strategy'], 
                                     value_vars=['ai_score', 'opponent_score']),
                  x='strategy', y='value', hue='variable')
    plt.title(f'Score Distributions\nMatrix: {results.matrix_type}')
    plt.show()

## 4. Token Usage Analysis

In [None]:
def plot_token_usage(experiment_id):
    results = storage.get_experiment_results(experiment_id)
    token_df = pd.DataFrame([
        {
            'round': r.round_number,
            'game': i,
            'tokens': r.token_usage.total_tokens if r.token_usage else 0,
            'strategy': results.player2_strategy
        }
        for i, game in enumerate(results.games)
        for r in game.rounds
    ])
    
    plt.figure(figsize=(12,6))
    sns.lineplot(data=token_df, x='round', y='tokens', hue='strategy')
    plt.title(f'Token Usage Over Time\nMatrix: {results.matrix_type}')
    plt.show()

## 5. Reasoning Analysis

In [None]:
def analyze_reasoning(experiment_id):
    results = storage.get_experiment_results(experiment_id)
    reasoning_df = pd.DataFrame([
        {
            'round': r.round_number,
            'game': i,
            'reasoning': r.player1_reasoning,
            'move': r.player1_move.value,
            'strategy': results.player2_strategy
        }
        for i, game in enumerate(results.games)
        for r in game.rounds
    ])
    
    # Basic text analysis of reasoning
    from collections import Counter
    import nltk
    nltk.download('punkt')
    nltk.download('stopwords')
    
    words = ' '.join(reasoning_df['reasoning']).lower().split()
    word_freq = Counter(words).most_common(20)
    
    plt.figure(figsize=(12,6))
    sns.barplot(x=[w[0] for w in word_freq], y=[w[1] for w in word_freq])
    plt.xticks(rotation=45)
    plt.title('Most Common Words in AI Reasoning')
    plt.show()

# Experimentation

In [None]:
experiment_id = '9874f0f2-739c-4134-9093-c7a13d482890'

In [None]:
plot_cooperation_rates(experiment_id)
plot_score_distributions(experiment_id)
plot_token_usage(experiment_id)
analyze_reasoning(experiment_id)

In [None]:
results = storage.get_experiment_results(experiment_id)

# Experiment

## Functions

In [3]:
def display_experiment_overview():
    """Show basic information about all experiments"""
    storage = ExperimentStorage()
    experiments_df = storage.get_experiments_summary()
    
    overview = experiments_df[['experiment_id', 'matrix_type', 'player1_strategy', 
                             'player2_strategy', 'total_games', 'cooperation_rate']]
    
    # Add duration column
    overview['duration'] = pd.to_datetime(experiments_df['end_time']) - pd.to_datetime(experiments_df['start_time'])
    
    print("Available Experiments:")
    display(overview)
    return overview

In [4]:
def analyze_experiments(experiment_id: str):
    """Display game statistics for a specific experiment"""
    storage = ExperimentStorage()
    
    # Get experiment metadata
    experiments_df = storage.get_experiments_summary()
    exp = experiments_df[experiments_df['experiment_id'] == experiment_id].iloc[0]
    
    # Read game data
    games_df = pd.read_csv(storage.csv_dir / f"{experiment_id}_games.csv")
    
    # Group by game_id to get game-level stats
    game_stats = games_df.groupby('game_id').agg({
        'matrix_type': 'first',
        'player1_move': lambda x: (x == 'cooperate').mean(),  # cooperation rate
        'player2_move': lambda x: (x == 'cooperate').mean(),
        'cumulative_player1_score': 'last',  # final scores
        'cumulative_player2_score': 'last',
        'player2_reasoning': 'first'  # strategy name
    }).reset_index()
    
    game_stats['winner'] = np.where(
        game_stats['cumulative_player1_score'] > game_stats['cumulative_player2_score'],
        'AI', 'Opponent')
        
    game_stats.columns = ['game_id', 'matrix_type', 'p1_coop_rate', 'p2_coop_rate', 
                         'p1_final_score', 'p2_final_score', 'opponent_strategy', 'winner']
    
    print(f"\nAnalysis for experiment {experiment_id}")
    print(f"Matrix Type: {exp.matrix_type}")
    print(f"Player 1: {exp.player1_strategy}")
    print(f"Player 2: {exp.player2_strategy}")
    print("\nGame Statistics:")
    display(game_stats)
    
    return game_stats

In [13]:
def view_game_details(game_id: str):
    """View detailed round-by-round information for a specific game"""
    # Set pandas display options for better readability
    pd.set_option('display.max_colwidth', None)  # Show full reasoning
    pd.set_option('display.max_rows', None)      # Show all rounds
    pd.set_option('display.width', None)         # Auto-adjust width
    
    storage = ExperimentStorage()
    
    # Find which experiment contains this game
    for exp_id in storage.get_experiments_summary()['experiment_id']:
        games_df = pd.read_csv(storage.csv_dir / f"{exp_id}_games.csv")
        game_rounds = games_df[games_df['game_id'] == game_id]
        
        if len(game_rounds) > 0:
            print(f"Game ID: {game_id}")
            print(f"Matrix Type: {game_rounds['matrix_type'].iloc[0]}")
            print(f"Opponent Strategy: {game_rounds['player2_reasoning'].iloc[0]}")
            print(f"Final Score - AI: {game_rounds['cumulative_player1_score'].iloc[-1]}, " 
                  f"Opponent: {game_rounds['cumulative_player2_score'].iloc[-1]}\n")
            
            # Rename columns for cleaner display
            display_df = game_rounds[[
                'round_number', 
                'player1_move', 
                'player2_move', 
                'player1_score', 
                'player2_score', 
                'cumulative_player1_score', 
                'cumulative_player2_score', 
                'player1_reasoning'
            ]].rename(columns={
                'round_number': 'round',
                'player1_move': 'p1 move',
                'player2_move': 'p2 move',
                'player1_score': 'p1 score',
                'player2_score': 'p2 score',
                'cumulative_player1_score': 'p1 total',
                'cumulative_player2_score': 'p2 total',
                'player1_reasoning': 'AI reasoning'
            })
            
            # Display the cleaned up DataFrame
            display(display_df)
            return
            
    print(f"Game {game_id} not found")

    # Reset display options to defaults
    pd.reset_option('display.max_colwidth')
    pd.reset_option('display.max_rows')
    pd.reset_option('display.width')

# Analysis Flow

## Step 1: See all experiments

In [6]:
overview = display_experiment_overview()

Available Experiments:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overview['duration'] = pd.to_datetime(experiments_df['end_time']) - pd.to_datetime(experiments_df['start_time'])


Unnamed: 0,experiment_id,matrix_type,player1_strategy,player2_strategy,total_games,cooperation_rate,duration
0,9e8586c5-2168-4ff8-82e5-a6e5d621735d,baseline,claude_haiku,multiple,10,0.56,0 days 00:01:11.481562
1,9874f0f2-739c-4134-9093-c7a13d482890,baseline,claude_haiku,multiple,10,0.55,0 days 00:01:08.466060
2,80cc6eaa-7bf3-4feb-bf06-1e2f98c86cd4,mixed_30,claude_haiku,multiple,5,0.38,0 days 00:00:50.215439


## Step 2: Analyze specific experiment

In [7]:
experiment_id = '80cc6eaa-7bf3-4feb-bf06-1e2f98c86cd4'

In [8]:
games = analyze_experiments(experiment_id)


Analysis for experiment 80cc6eaa-7bf3-4feb-bf06-1e2f98c86cd4
Matrix Type: mixed_30
Player 1: claude_haiku
Player 2: multiple

Game Statistics:


Unnamed: 0,game_id,matrix_type,p1_coop_rate,p2_coop_rate,p1_final_score,p2_final_score,opponent_strategy,winner
0,045fe670-c1ad-4f02-b07d-69fc4d897cb7,mixed_30,0.4,0.0,-4,16,Optimal (mixed_30),Opponent
1,064fb006-8c56-4b4f-8aed-1348a594ff8c,mixed_30,0.4,0.4,10,10,Optimal (mixed_30),Opponent
2,67f75889-62aa-46c8-8439-bdc8c22184f0,mixed_30,0.4,0.2,2,12,Optimal (mixed_30),Opponent
3,9e1ae3a2-c2e2-4817-be6b-06dcacc468ad,mixed_30,0.4,0.6,18,8,Optimal (mixed_30),AI
4,f2df14e8-2c87-43b6-a6ae-a138b4569e16,mixed_30,0.6,0.4,6,16,Optimal (mixed_30),Opponent


## Step 3: View specific game details

In [11]:
# game_id = games['game_id'].iloc[0]  # or any game_id you find interesting
game_id = '9e1ae3a2-c2e2-4817-be6b-06dcacc468ad'

In [14]:
view_game_details(game_id)

Game ID: 9e1ae3a2-c2e2-4817-be6b-06dcacc468ad
Matrix Type: mixed_30
Opponent Strategy: Optimal (mixed_30)
Final Score - AI: 18, Opponent: 8



Unnamed: 0,round,p1 move,p2 move,p1 score,p2 score,p1 total,p2 total,AI reasoning
15,1,cooperate,cooperate,4,4,4,4,"Since this is the first round of the game, there is no history to base my decision on. The optimal strategy for the first round is to COOPERATE, as this will result in the highest joint payoff for both players. By cooperating, we can establish a cooperative relationship that may be beneficial in future rounds."
16,2,cooperate,defect,-2,8,2,12,"In the previous round, we both cooperated, which resulted in us both receiving 3 points each. This is the best outcome for both players, as it maximizes the total points gained. Since the opponent also cooperated in the previous round, it is reasonable to assume they may continue to cooperate. Therefore, the best strategy for me is to continue cooperating, as this will result in the highest possible total score for both of us."
17,3,defect,cooperate,8,-2,10,10,"In the first two rounds, the opponent has played a mix of cooperate and defect. Since the opponent defected in the previous round, resulting in a significant score advantage for them, I believe the best strategy is to defect in this round to try to even the scores. Defecting will give me a higher payoff than cooperating, and I need to be more competitive to catch up to the opponent's lead."
18,4,defect,defect,0,0,10,10,"Based on the game history, it's clear that the opponent is willing to cooperate, but they are also willing to defect if they think it will benefit them. In the long run, mutual cooperation is the best strategy to maximize the total points for both players. However, given the current score difference, I believe the best move is to defect in this round to try to even out the scores. This will ensure that neither player has a significant advantage going forward and preserve the possibility of a mutually beneficial cooperative relationship in future rounds."
19,5,defect,cooperate,8,-2,18,8,"Based on the game history, it seems the opponent has a mixed strategy, sometimes cooperating and sometimes defecting. Since I have a higher score than the opponent at the moment, I will choose to defect in this round to maximize my points and widen the gap. However, I will keep an eye out for any patterns in the opponent's behavior that may indicate a more consistent strategy in the future rounds."
