In [1]:
from collections import Counter, defaultdict
import json
import nltk
from nltk.corpus import stopwords
import numpy as np
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load data
# Recursively read every *.json result file under `folder_path` into `data`
# (one parsed JSON object per file). Files that cannot be parsed are reported
# and skipped rather than aborting the whole load.
folder_path = "../results/base/"
data = []

for root, dirs, files in os.walk(folder_path):
    # os.walk yields directories and files in arbitrary filesystem order.
    # Sorting `dirs` in place (honoured because the walk is top-down) and the
    # file names makes the order of `data` reproducible across machines.
    dirs.sort()
    for filename in sorted(files):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(root, filename)
        try:
            # Read as UTF-8 explicitly instead of relying on the platform's
            # locale-dependent default encoding.
            with open(file_path, "r", encoding="utf-8") as f:
                entry = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A file that is not valid UTF-8 text is treated like any other
            # unparseable result file: report it and move on.
            print(f"Skipped invalid JSON: {file_path} ({e})")
        else:
            data.append(entry)

print(f"Loaded {len(data)} valid JSON result files from '{folder_path}' and its subfolders.")

Loaded 4217 valid JSON result files from '../results/base/' and its subfolders.


In [3]:
# Fetch the NLTK stopword corpus (nltk reports when the local copy is already
# up to date) and keep the English list as a set for fast membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Flatten the dialogues into one row per turn: which model spoke, and what it said.
records = []
for entry in data:
    for turn in entry["dialogue"]:
        # Roles look like "<prefix>_<index>"; the integer after the underscore
        # selects the speaker. Index 0 maps to model 0, anything else to model 1.
        speaker_idx = int(turn["role"].split("_")[1])
        utterance = turn["content"]
        if speaker_idx == 0:
            speaker_model = entry["model_general_name_0"]
        else:
            speaker_model = entry["model_general_name_1"]
        records.append({"agent_model": speaker_model, "content": utterance})

df = pd.DataFrame(records)

# Clean and tokenize
def clean_and_tokenize(text, stopword_set=None):
    """Lowercase ``text`` and return its informative word tokens, in order.

    A token is a run of ASCII letters ``a-z`` with a word boundary on both
    sides, so a word that also contains digits, underscores or other word
    characters yields no token at all. Tokens shorter than three characters
    and tokens present in the stopword set are dropped. Duplicates are kept.

    Args:
        text: The string to tokenize.
        stopword_set: Optional container of lowercase words to exclude. When
            omitted, the notebook-level ``stop_words`` set is used, which is
            the behaviour existing callers rely on.

    Returns:
        A list of the remaining tokens in their order of appearance.
    """
    if stopword_set is None:
        # Fall back to the notebook-global set defined alongside the NLTK setup.
        stopword_set = stop_words
    words = re.findall(r'\b[a-z]+\b', text.lower())
    # Cheap length test first, set lookup second; the result is the same either way.
    return [w for w in words if len(w) > 2 and w not in stopword_set]

# For every model: the share of all kept tokens taken by each of its 200 most
# common words. Models with no tokens left after cleaning are left out.
cleaned_word_freqs = {}
for model in df['agent_model'].unique():
    model_utterances = df.loc[df['agent_model'] == model, 'content']
    tokens = clean_and_tokenize(' '.join(model_utterances.astype(str)))
    n_tokens = len(tokens)
    if n_tokens > 0:
        cleaned_word_freqs[model] = {
            word: count / n_tokens
            for word, count in Counter(tokens).most_common(200)
        }

# One table row per model: its five most frequent words with their share in
# percent. The percent sign is pre-escaped because the table is rendered with
# escape=False below.
latex_rows = [
    {
        "Model": model,
        "Top Words": ', '.join(
            f"{word} ({round(share * 100, 1)}\\%)"
            for word, share in list(shares.items())[:5]
        ),
    }
    for model, shares in cleaned_word_freqs.items()
]

# Render the rows as a LaTeX tabular and show the source
table_df = pd.DataFrame(latex_rows)
latex_output = table_df.to_latex(index=False, escape=False)
print(latex_output)

[nltk_data] Downloading package stopwords to /home/shrey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


\begin{tabular}{ll}
\toprule
Model & Top Words \\
\midrule
claude & like (1.6\%), help (1.3\%), would (1.2\%), potential (0.7\%), today (0.6\%) \\
deepseek & asbury (1.0\%), university (0.9\%), like (0.8\%), asbestos (0.7\%), let (0.6\%) \\
gemini & human (0.9\%), like (0.8\%), could (0.6\%), creative (0.5\%), data (0.5\%) \\
gpt & like (1.2\%), could (0.7\%), ethical (0.6\%), questions (0.5\%), help (0.5\%) \\
llama & like (1.1\%), think (0.9\%), could (0.8\%), human (0.8\%), would (0.8\%) \\
mistral & like (1.2\%), space (0.6\%), would (0.6\%), could (0.5\%), let (0.5\%) \\
qwen & like (1.1\%), let (0.8\%), could (0.7\%), human (0.5\%), time (0.4\%) \\
\bottomrule
\end{tabular}



In [4]:
# Step 1: Gather, per model, the text it contributed to each conversation
model_texts = defaultdict(list)

for entry in data:
    model_0, model_1 = entry["model_general_name_0"], entry["model_general_name_1"]
    # Utterances bucketed by speaker role; turns with any other role are ignored.
    said = {"agent_0": [], "agent_1": []}
    for turn in entry["dialogue"]:
        role = turn["role"]
        if role in said:
            said[role].append(turn["content"])

    model_texts[model_0].append(" ".join(said["agent_0"]))
    model_texts[model_1].append(" ".join(said["agent_1"]))

# Step 2: Collapse each model's conversations into a single document
model_docs = {model: " ".join(docs) for model, docs in model_texts.items()}
model_names = list(model_docs.keys())

# Step 3: TF-IDF over the per-model documents (rows = models, columns = terms)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(model_docs.values())
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=model_names, columns=feature_names)

# Step 4: Highest-scoring terms per model, and how many models share each term
top_n = 40  # size of each model's candidate list; adjust as needed
model_top_words = {model: tfidf_df.loc[model].nlargest(top_n) for model in model_names}
word_occurrence = Counter(
    word for top_words in model_top_words.values() for word in top_words.index
)

# Step 5: Keep only the terms that appear in exactly one model's candidate list
distinctive_words_per_model = {
    model: [(word, score) for word, score in top_words.items() if word_occurrence[word] == 1]
    for model, top_words in model_top_words.items()
}

# Step 6: Print the leading distinctive terms and collect the LaTeX rows
n_shown = 5  # number of distinctive terms reported per model
latex_rows = []

for model, words in distinctive_words_per_model.items():
    shown = words[:n_shown]  # only the first five distinctive terms
    print(f"\n{model.upper()} (Distinctive Words):")
    for word, score in shown:
        print(f"  {word} ({score:.3f})")
    word_list = ", ".join([f"{word} ({score:.3f})" for word, score in shown])
    latex_rows.append([model, word_list])

# Build the table and emit it as LaTeX
latex_df = pd.DataFrame(latex_rows, columns=["Model", "Top 5 Distinctive Words"])
latex_code = latex_df.to_latex(index=False, escape=False, column_format="lp{12cm}")
print(latex_code)


CLAUDE (Distinctive Words):
  appreciate (0.134)
  direct (0.123)
  interested (0.107)
  ready (0.104)
  tasks (0.102)

DEEPSEEK (Distinctive Words):
  asbury (0.354)
  university (0.341)
  asbestos (0.271)
  military (0.116)
  day (0.099)

GEMINI (Distinctive Words):
  process (0.076)
  incredibly (0.074)
  truly (0.068)
  sense (0.058)
  flow (0.057)

GPT (Distinctive Words):
  technology (0.101)
  llms (0.081)
  model (0.074)
  training (0.073)
  areas (0.072)

LLAMA (Distinctive Words):
  make (0.072)
  experiences (0.070)
  excited (0.068)
  unique (0.067)
  development (0.064)

MISTRAL (Distinctive Words):
  interesting (0.114)
  earth (0.102)
  venus (0.096)
  share (0.096)
  exploration (0.078)

QWEN (Distinctive Words):
  quantum (0.116)
  tools (0.063)
  real (0.052)
\begin{tabular}{lp{12cm}}
\toprule
Model & Top 5 Distinctive Words \\
\midrule
claude & appreciate (0.134), direct (0.123), interested (0.107), ready (0.104), tasks (0.102) \\
deepseek & asbury (0.354), universi