In [3]:
import json
from typing import List, Dict

In [4]:
import json
from collections import defaultdict
from typing import List, Dict

def preprocess_data_for_context(data: List[Dict]) -> Dict[int, Dict[str, List[Dict]]]:
    """
    Group every message by game and by the countries involved in it.

    Each conversation stores its messages as parallel lists (one list per
    attribute). This function turns every message into a single record and files
    that record under both its speaker and its receiver, so that each country ends
    up with the ordered stream of messages it sent or received in a game.

    Args:
        data (List[Dict]): Conversation records. Each record must provide "game_id"
            and the parallel per-message lists "messages", "absolute_message_index",
            "seasons", "years", "speakers", "receivers", "sender_labels",
            "receiver_labels", "game_score" and "game_score_delta".

    Returns:
        Dict[int, Dict[str, List[Dict]]]: A nested mapping where:
            - Keys are game IDs.
            - Values map each country to its records, sorted by
              "absolute_message_index".
        The mapping is a defaultdict at both levels, so looking up a missing game or
        country creates an empty entry instead of raising KeyError. A record filed
        under two countries is the same dict object in both lists.

    Raises:
        KeyError: If a conversation lacks one of the required keys.
        IndexError: If a per-message list is shorter than "messages".
    """
    games_context = defaultdict(lambda: defaultdict(list))

    # Conversation keys holding one value per message; "messages" must stay first
    # because its length defines how many records the conversation produces.
    parallel_keys = (
        "messages",
        "absolute_message_index",
        "seasons",
        "years",
        "speakers",
        "receivers",
        "sender_labels",
        "receiver_labels",
        "game_score",
        "game_score_delta",
    )

    for conversation in data:
        game_id = conversation["game_id"]
        columns = [conversation[key] for key in parallel_keys]
        message_count = len(columns[0])

        # zip() stops at the shortest input, so a short list would silently drop
        # messages; fail loudly instead and say which list is at fault.
        for key, column in zip(parallel_keys, columns):
            if len(column) < message_count:
                raise IndexError(
                    f"{key!r} has {len(column)} items but the conversation "
                    f"has {message_count} messages"
                )

        for relative_index, (
            message,
            absolute_index,
            season,
            year,
            speaker,
            receiver,
            sender_label,
            receiver_label,
            score,
            score_delta,
        ) in enumerate(zip(*columns)):
            entry = {
                "message": message,
                "absolute_message_index": absolute_index,
                "relative_message_index": relative_index,
                "season": season,
                "year": year,
                "speaker": speaker,
                "receiver": receiver,
                "sender_label": sender_label,
                "receiver_label": receiver_label,
                "game_score": score,
                "game_score_delta": score_delta,
            }

            country_contexts = games_context[game_id]
            country_contexts[speaker].append(entry)
            # A message addressed to its own sender must not appear twice in
            # that country's context.
            if receiver != speaker:
                country_contexts[receiver].append(entry)

    # Conversations arrive pair by pair, so each country's records are interleaved;
    # restore the game-wide chronological order (the sort is stable).
    for country_contexts in games_context.values():
        for records in country_contexts.values():
            records.sort(key=lambda record: record["absolute_message_index"])

    return games_context

def save_to_json(data: Dict, filename: str) -> None:
    """
    Save preprocessed data to a JSON file.

    The data is serialized completely before the file is opened, so a value that
    cannot be serialized raises without truncating or partially overwriting an
    existing file.

    Args:
        data (Dict): Preprocessed dataset with contextual features.
        filename (str): Name of the output JSON file.

    Raises:
        TypeError: If `data` contains a value the json module cannot serialize.
        OSError: If the file cannot be opened or written.
    """
    # Serialize first: opening with "w" truncates immediately, and a failure in the
    # middle of a streaming dump would leave a broken file behind.
    serialized = json.dumps(data, indent=4)

    # Explicit encoding keeps the file independent of the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)


# Load your dataset (JSON Lines format: one JSON object per line).
# The encoding is stated explicitly because the platform default is not UTF-8
# everywhere, and blank lines (e.g. a trailing newline at the end of the file)
# are skipped because json.loads rejects an empty string.
with open("data/train.jsonl", "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f if line.strip()]

# Preprocess the dataset to generate the per-game, per-country context
preprocessed_context = preprocess_data_for_context(raw_data)

# Save the preprocessed data to a JSON file
output_filename = "data/preprocessed_context.json"
save_to_json(preprocessed_context, output_filename)

print(f"Preprocessed context saved to {output_filename}")



Preprocessed context saved to data/preprocessed_context.json
