In [1]:
import sys
from pathlib import Path

parent_folder = str(Path.cwd().parent)
print(f"Adding {parent_folder} to sys.path")
if parent_folder not in sys.path:
    sys.path.insert(0, parent_folder)

Adding c:\Users\bergsmann\code\turing-game\fourmind\experiment to sys.path


In [2]:
RAW_DATA_FILE_PATH = Path(".").absolute().parent / "daten_severin_20250901.json"
RAW_DATA_FILE_PATH

WindowsPath('c:/Users/bergsmann/code/turing-game/fourmind/experiment/daten_severin_20250901.json')

In [3]:
import json
from typing import Any

from models import GameData

with open(RAW_DATA_FILE_PATH, "r") as f:
    raw_data: dict[str, Any] = json.load(f)

data = [GameData(**item) for item in raw_data]  # type: ignore

lets make one dataset with Message_ID, Message, Message_Embedding, Label [one-hot]

In [14]:
data[0].messages[0]

Message(gameID=5117, oldidx=None, color='GameMaster', userID='GameMaster', botID=0, message='LANGUAGE en', create_time='2025-09-16T10:20:30', colorID='gray400', messageidx=39778)

In [15]:
from tqdm import tqdm


all_messages = []
for game in tqdm(data):
    messages = game.messages
    color_player_mapping = {bot.color: bot.name for bot in game.bots}
    color_player_mapping[list(game.player_info.keys())[0]] = "Human"
    for msg in messages:
        player_name = color_player_mapping.get(msg.color)
        if player_name is not None:
            msg.color = player_name
            all_messages.append(msg)

100%|██████████| 78/78 [00:00<00:00, 38799.30it/s]


In [16]:
len(all_messages)

2723

In [17]:
from collections import Counter

color_counts = Counter(msg.color for msg in all_messages)
print(color_counts)

Counter({'AllTalker': 1109, 'fourminds': 876, 'Human': 738})


In [18]:
all_messages[0].model_dump().keys()

dict_keys(['gameID', 'oldidx', 'color', 'userID', 'botID', 'message', 'create_time', 'colorID', 'messageidx'])

In [19]:
embedding_model_name = "intfloat/multilingual-e5-large-instruct"

from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(embedding_model_name, device="cpu")

  from tqdm.autonotebook import tqdm, trange
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
from pydantic import BaseModel
from numpy.typing import NDArray
from torch import Tensor

In [40]:
class EmbeddedMessage(BaseModel):
    # metadata
    game_id: int
    message_idx: int
    message: str
    label: str

    # training data
    onehot_label: Tensor | NDArray[Any]
    embedding:Tensor | NDArray[Any]

    class Config:
        arbitrary_types_allowed = True

In [41]:
message_embeddings = []

In [42]:
onehot_label_mapping = {
    "Human": Tensor([1, 0, 0]),
    "fourminds": Tensor([0, 1, 0]),
    "AllTalker": Tensor([0, 0, 1]),
}

In [43]:
for message in tqdm(all_messages):
    embedding: Tensor = embedding_model.encode(message.message, normalize_embeddings=True)
    message_embeddings.append(
        EmbeddedMessage(
            game_id=message.gameID,
            message_idx=message.messageidx,
            message=message.message,
            label=message.color,
            onehot_label=onehot_label_mapping[message.color],
            embedding=embedding,
        )
    )
    

100%|██████████| 2723/2723 [05:23<00:00,  8.43it/s]


In [46]:
import pickle

dataset_size_bytes = len(pickle.dumps(message_embeddings))
print(f"message_embeddings size: {dataset_size_bytes/1024/1024} bytes")

message_embeddings size: 10.980293273925781 bytes


In [47]:
import json

def tensor_to_list(tensor):
    # Handles both torch.Tensor and numpy.ndarray
    if hasattr(tensor, "tolist"):
        return tensor.tolist()
    return list(tensor)

def embedded_message_to_dict(embedded_msg):
    d = embedded_msg.model_dump()
    d["onehot_label"] = tensor_to_list(d["onehot_label"])
    d["embedding"] = tensor_to_list(d["embedding"])
    return d

with open("message_embeddings.json", "w", encoding="utf-8") as f:
    json.dump([embedded_message_to_dict(em) for em in message_embeddings], f, ensure_ascii=False, indent=2)