In [1]:
from collections import defaultdict
from convokit import Corpus, Utterance, Speaker, PolitenessStrategies, Coordination, TextParser
import json
import numpy as np
import os
import pandas as pd
import uuid

In [None]:
# Load data
folder_path = "results/base/"


def load_json_results(folder):
    """Recursively load every ``*.json`` file found under ``folder``.

    Directories and files are visited in sorted order so the returned list
    (and anything derived from an entry's position in it) is reproducible;
    ``os.walk`` on its own yields entries in arbitrary filesystem order.

    Args:
        folder: Root directory to search. A missing directory yields an
            empty list because ``os.walk`` ignores listing errors by default.

    Returns:
        A list holding one parsed JSON document per readable ``.json`` file.
        Files that cannot be decoded or parsed are reported on stdout and
        skipped.
    """
    loaded = []
    for root, dirs, files in os.walk(folder):
        # Sorting in place steers the order of os.walk's top-down traversal.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(root, filename)
            try:
                # JSON text is UTF-8; do not depend on the platform default encoding.
                with open(file_path, "r", encoding="utf-8") as f:
                    loaded.append(json.load(f))
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Skipped invalid JSON: {file_path} ({e})")
    return loaded


data = load_json_results(folder_path)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 145 valid JSON result files from 'results/' and its subfolders.


In [3]:
utterances = []
speakers = {}
conversation_idx = 0

for log in data:
    # Pull the fields this cell needs from the log up front.
    model_0, model_1 = log['model_general_name_0'], log['model_general_name_1']
    dialogue = log['dialogue']
    conv_id = f"conv_{conversation_idx}"

    # One Speaker object per distinct model name, shared across conversations.
    for name in (model_0, model_1):
        if name not in speakers:
            speakers[name] = Speaker(id=name)

    # Chain the turns: each utterance replies to the one before it, and the
    # first turn of a conversation has no parent.
    previous_id = None
    for turn in dialogue:
        role = turn['role']
        # Any role other than 'agent_0' is attributed to model_1.
        author = model_0 if role == 'agent_0' else model_1
        utt_id = str(uuid.uuid4())
        utterances.append(
            Utterance(
                id=utt_id,
                speaker=speakers[author],
                conversation_id=conv_id,
                reply_to=previous_id,
                text=turn['content'],
                meta={"role": role},
            )
        )
        previous_id = utt_id

    conversation_idx += 1

# Build the corpus straight from the utterance list; only the utterances are passed in.
corpus = Corpus(utterances=utterances)

# Run the text parser over the corpus before the feature transformers in later cells.
parser = TextParser()
corpus = parser.transform(corpus)

In [4]:
# Analyze politeness
ps = PolitenessStrategies()
corpus = ps.transform(corpus)

# Step 1: Tally per speaker id. 'total' counts utterances that carry a
# 'politeness_strategies' annotation; 'polite' counts those whose strategy
# values sum to more than zero.
politeness_counts = defaultdict(lambda: {'polite': 0, 'total': 0})

for utt in corpus.iter_utterances():
    if 'politeness_strategies' not in utt.meta:
        continue  # unannotated utterances are left out of both counts
    tally = politeness_counts[utt.speaker.id]
    # NOTE(review): assumes the annotation maps strategy name -> numeric
    # indicator/count, and that every strategy should count as "polite" -- confirm.
    if sum(utt.meta['politeness_strategies'].values()) > 0:
        tally['polite'] += 1
    tally['total'] += 1

# Step 2: Turn the tallies into a rate per speaker id. Every key in
# politeness_counts has total >= 1 (a key is only created when an utterance is
# counted), so the division is safe.
# The `speakers` name is deliberately not rebound here: it already holds the
# Speaker objects built when the corpus was assembled.
output_dict = {
    speaker_id: {'Politeness': tally['polite'] / tally['total']}
    for speaker_id, tally in politeness_counts.items()
}

In [5]:
# Analyze coordination
co = Coordination()
corpus = co.fit_transform(corpus)

# For each speaker id, collect one number per target: the mean of that
# target's non-missing sub-scores found under the speaker's 'coord' metadata.
coord_scores = defaultdict(list)

for spk in corpus.iter_speakers():
    if 'coord' not in spk.meta:
        continue
    for target, sub_scores in spk.meta['coord'].items():
        if not isinstance(sub_scores, dict):
            continue  # only dict-valued entries are aggregated
        values = [v for v in sub_scores.values() if v is not None]
        if values:
            coord_scores[spk.id].append(np.mean(values))

# Report "mean ± std" over each speaker's per-target averages. np.std is the
# population standard deviation (ddof=0). Fixed three-decimal formatting keeps
# this column aligned with the Politeness column (e.g. "0.020", not "0.02").
for speaker_id, scores in coord_scores.items():
    # setdefault: a speaker with coordination scores but no politeness row
    # still gets an entry instead of raising KeyError.
    output_dict.setdefault(speaker_id, {})["Coordination"] = (
        f"{np.mean(scores):.3f} ± {np.std(scores):.3f}"
    )

In [6]:
def format_rate(rate):
    """Render a politeness rate as a string with exactly three decimals."""
    return f"{round(rate, 3):.3f}"


# output_dict is keyed by speaker, so transpose to get one row per speaker.
output_df = pd.DataFrame(output_dict).transpose()
output_df['Politeness'] = output_df['Politeness'].map(format_rate)

# Emit the table as LaTeX source.
print(output_df.to_latex())

\begin{tabular}{lll}
\toprule
 & Politeness & Coordination \\
\midrule
llama & 1.000 & 0.003 ± 0.003 \\
qwen & 0.829 & -0.021 ± 0.022 \\
deepseek & 0.957 & -0.016 ± 0.016 \\
\bottomrule
\end{tabular}

