In [28]:
import json
import csv
import pandas as pd

In [67]:
runs = [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27]
output_csv = 'forecast_output.csv'
columns = ["sesion", "round", "price", "prev_price", "part_label_ID", "pl_f0", "pl_f1", "pl_f2", "pl_f3", "pl_fcast_rnd_0", "pl_fcast_rnd_1", "pl_fcast_rnd_2", "pl_fcast_rnd_3", "pl_fcast_err_0", "pl_fcast_err_1", "pl_fcast_err_2", "pl_fcast_err_3", "model"]
metadata_file = "runs.metadata"
mixed_bot_types_dict = {
    1: "gpt-3.5",
    2: "gpt-3.5",
    3: "gpt-3.5",
    4: "gpt-3.5",
    5: "gpt-4o",
    6: "gpt-4o",
    7: "gpt-4o",
    8: "gpt-4o",
    9: "gemini-1.5-pro",
    10: "gemini-1.5-pro",
    11: "gemini-1.5-pro",
    12: "gemini-1.5-pro",
    13: "grok-2",
    14: "grok-2",
    15: "grok-2",
    16: "grok-2",
    17: "mistral-large",
    18: "mistral-large",
    19: "mistral-large",
    20: "mistral-large",
    21: "claude-3.5-sonnet",
    22: "claude-3.5-sonnet",
    23: "claude-3.5-sonnet",
    24: "claude-3.5-sonnet"
}

In [26]:
def parse_metadata(filename):
    # all metadata is in rows of this form: 2025-01-06 23:49:24.064839 | run-1 | gpt-3.5 | 20 subjects | Experiment 1/3 w/ 20 subjects and 3.5 using standard profit maximizing prompt + fixed auto timeout take 2 | hrei2bal
    runs_metadata = {}
    # add the runs with 'run-X' as key
    with open(filename, 'r') as f:
        for line in f:
            parts = line.split('|')
            run_id = parts[1].strip()
            # Make dict storing the rest of the parts
            part_labels = ["time", "run_id", "model", "num_subjects", "experiment_description", "session_id"]
            parts = {part_labels[i]: parts[i].strip() for i in range(0, len(part_labels))}
            # Remove run_id from parts
            del parts['run_id']
            # Remove the run- prefix from run_id and cast to int
            run_id = int(run_id.split('-')[1])
            # Make num_subjects an int
            parts['num_subjects'] = int(parts['num_subjects'].split(' ')[0])
            runs_metadata[run_id] = parts
    return runs_metadata
        

In [65]:
def extract_run(run_id, run_metadata, all_market_prices):
    # Make dataframe for this run only using pd
    run_data = pd.DataFrame(columns=columns)
    run_folder = f"run-{run_id}"
    num_subjects = run_metadata['num_subjects']
    for subject in range(1, num_subjects+1):
        # Concat session id and bot id to get the part_id
        part_id = f"{run_metadata['session_id']}bot{subject}"
        filename = f"{run_folder}/bot-{subject}-history.json"
        # Load the json file
        with open(filename, 'r') as f:
            rounds = json.load(f)
        # Drop the first 3 practice rounds
        rounds = rounds[3:]
        for i in range(len(rounds)):
            # Extract the data
            round = rounds[i]
            round_num = int(round['round_num'])-3
            forecast_dict = {}
            for forecast in round['forecast']:
                forecast_dict[forecast['field']] = forecast['input_forecast']
            model = run_metadata['model']
            if model == "mixed":
                model = mixed_bot_types_dict[subject]
            row = {
                "sesion": run_metadata['session_id'],
                "round": round_num,
                "price": round['market_state']['market_price'],
                "prev_price": rounds[i - 1]['market_state']['market_price'] if i > 0 else None,
                "part_label_ID": part_id,
                "pl_f0": forecast_dict['f0'] if 'f0' in forecast_dict else None,
                "pl_f1": forecast_dict['f1'] if 'f1' in forecast_dict else None,
                "pl_f2": forecast_dict['f2'] if 'f2' in forecast_dict else None,
                "pl_f3": forecast_dict['f3'] if 'f3' in forecast_dict else None,
                "pl_fcast_rnd_0": round_num if 'f0' in forecast_dict else None,
                "pl_fcast_rnd_1": round_num + 2 if 'f1' in forecast_dict else None,
                "pl_fcast_rnd_2": round_num + 5 if 'f2' in forecast_dict else None,
                "pl_fcast_rnd_3": round_num + 10 if 'f3' in forecast_dict else None,
                "pl_fcast_err_0": all_market_prices[round_num - 1] - float(forecast_dict['f0']) if ('f0' in forecast_dict and forecast_dict['f0']) else None,
                "pl_fcast_err_1": all_market_prices[round_num + 1] - float(forecast_dict['f1']) if ('f1' in forecast_dict and forecast_dict['f1']) else None,
                "pl_fcast_err_2": all_market_prices[round_num + 4] - float(forecast_dict['f2']) if ('f2' in forecast_dict and forecast_dict['f2']) else None,
                "pl_fcast_err_3": all_market_prices[round_num + 9] - float(forecast_dict['f3']) if ('f3' in forecast_dict and forecast_dict['f3']) else None,
                "model": model
            }
            row_df = pd.DataFrame([row]) 
            run_data = pd.concat([run_data, row_df], ignore_index=True)

    return run_data

In [52]:
def extract_all_runs(runs, metadata):
    def get_all_market_prices(run):
        run_data_folder = "run-" + str(run) + "/"
        bot_history_file = run_data_folder + "bot-9-history.json"

        with open(bot_history_file) as f:
            bot_history = json.load(f)

        # Extract market data
        market_prices = []
        market_volumes = []
        buy_back_price = bot_history[0]['market_state']["buy_back"]

        for entry in bot_history:
            market_prices.append(entry['market_state']["market_price"])
            market_volumes.append(entry['market_state']["volume"])
        return market_prices
    # Get all metadata for the runs
    run_metadata = parse_metadata(metadata)
    # Make master dataframe
    df = pd.DataFrame(columns=columns)

    for run in runs:
        all_market_prices = get_all_market_prices(run)
        run_df = extract_run(run, run_metadata[run], all_market_prices)
        # add to master dataframe
        df = pd.concat([df, run_df])

    # Write to csv
    df.to_csv(output_csv, index=False)

In [69]:
extract_all_runs(runs, metadata_file)

  run_data = pd.concat([run_data, row_df], ignore_index=True)
  df = pd.concat([df, run_df])
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_df], ignore_index=True)
  run_data = pd.concat([run_data, row_d