In [1]:
from collections import Counter
import json
import nltk
from nltk.corpus import stopwords
import os
import re
import pandas as pd

In [2]:
# Load data
folder_path = "results/"


def load_json_results(root_dir):
    """Recursively load every ``*.json`` file found under ``root_dir``.

    Directories and files are visited in sorted order so the returned list
    (and everything derived from it) is reproducible across filesystems.
    Files are read as UTF-8 regardless of the platform's locale encoding.

    Args:
        root_dir: Directory to walk. A missing directory yields an empty list.

    Returns:
        A list with one parsed JSON value per successfully loaded file.
        Files that cannot be decoded or parsed are reported and skipped.
    """
    loaded = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend into subfolders in a stable order.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    entry = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                # Undecodable bytes are treated like malformed JSON: report and move on.
                print(f"Skipped invalid JSON: {file_path} ({e})")
                continue
            loaded.append(entry)
    return loaded


data = load_json_results(folder_path)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 145 valid JSON result files from 'results/' and its subfolders.


In [3]:
# Download stopwords if not already downloaded: try the local corpus first and
# only fetch it when the lookup fails, so re-running the cell needs no network.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# Rebuild DataFrame: one row per dialogue turn, labelled with the speaking model.
records = []
for entry in data:
    for turn in entry["dialogue"]:
        # The agent index is the piece of the role string after its first underscore.
        agent_id = int(turn["role"].split("_")[1])
        content = turn["content"]
        # Index 0 selects the first model name; any other index selects the second.
        model = entry["model_general_name_0"] if agent_id == 0 else entry["model_general_name_1"]
        records.append({"agent_model": model, "content": content})

# Naming the columns keeps them present even when no turns were collected,
# so later column lookups do not fail on an empty frame.
df = pd.DataFrame(records, columns=["agent_model", "content"])

# Clean and tokenize
def clean_and_tokenize(text):
    """Return the lowercase alphabetic words of ``text`` worth counting.

    The text is lowercased and split into runs of the letters a-z that are
    bounded by word boundaries. Words found in the module-level ``stop_words``
    collection, and words of two characters or fewer, are dropped. Order and
    duplicates are preserved.
    """
    kept = []
    for token in re.findall(r'\b[a-z]+\b', text.lower()):
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(token)
    return kept

# Per-model relative word frequencies, then a LaTeX table of each model's top words.
VOCAB_SIZE = 200  # most frequent words kept per model
TOP_K = 5         # words shown per model in the table

# Characters LaTeX treats specially in text mode. Model names are emitted with
# escape=False below, so they have to be escaped by hand.
_LATEX_SPECIALS = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})

cleaned_word_freqs = {}
# A single groupby pass replaces re-filtering the whole frame once per model;
# sort=False keeps the models in order of first appearance.
for model, contents in df.groupby('agent_model', sort=False)['content']:
    text = ' '.join(contents.astype(str))
    words = clean_and_tokenize(text)
    total = len(words)
    if total == 0:
        # Nothing left after cleaning: skip to avoid dividing by zero.
        continue
    freqs = Counter(words)
    # Frequencies are relative to all cleaned words of the model, not just the kept ones.
    cleaned_word_freqs[model] = {w: c / total for w, c in freqs.most_common(VOCAB_SIZE)}

# Create a LaTeX-style table with word and relative frequency per model
latex_rows = []
for model, freqs in cleaned_word_freqs.items():
    # The dict was built from most_common(), so its first items are the most frequent.
    top_items = list(freqs.items())[:TOP_K]
    words_with_freqs = [f"{word} ({round(freq * 100, 1)}\\%)" for word, freq in top_items]
    latex_rows.append({
        "Model": str(model).translate(_LATEX_SPECIALS),
        "Top Words": ', '.join(words_with_freqs),
    })

# Convert to DataFrame and generate LaTeX. Columns are named explicitly so the
# header survives an empty result; escape=False keeps the hand-written "\%" intact.
table_df = pd.DataFrame(latex_rows, columns=["Model", "Top Words"])
latex_output = table_df.to_latex(index=False, escape=False)
print(latex_output)

[nltk_data] Downloading package stopwords to /home/shrey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'Counter' is not defined