In [1]:
from pathlib import Path
import os
import requests
import json
from tqdm.auto import tqdm
import datetime
import time
import polars as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Paths -------------------------------------------------------------------
# Directory that receives the downloaded replay JSON files.
OUTPUT_DIR = Path("./dataset")
# Local copy of the Meta Kaggle dataset. The location is machine-specific, so it
# can be overridden with the META_KAGGLE_DIR environment variable; the fallback
# is the kagglehub cache path this notebook was originally run against.
META_DIR = Path(
    os.environ.get(
        "META_KAGGLE_DIR",
        "/home/naohiro/.cache/kagglehub/datasets/kaggle/meta-kaggle/versions/1591",
    )
)

# --- Kaggle episode endpoint ---------------------------------------------------
BASE_URL = "https://www.kaggle.com/api/i/competitions.EpisodeService/"
GET_URL = BASE_URL + "GetEpisodeReplay"

# --- Selection thresholds ------------------------------------------------------
# A submission is kept only if the UpdatedScore of its latest episode is above this.
LOWEST_SCORE_THRESH = 2000
# Cap used by the replay download loop to stop after this many episodes.
EPISODE_LIMIT_SIZE = 1000

In [3]:
# Kaggle competition id; the Episodes table is filtered down to rows with this CompetitionId.
COMPETITION_ID = 86411

## Extract Top Submissions

In [4]:
%%time

# Lazily scan the Meta Kaggle episode table, keep only this competition's rows,
# parse both timestamp columns, and materialise the result ordered by episode Id.
# strict=False is passed to the datetime parser for both columns.
_EPISODE_TIME_FORMAT = "%m/%d/%Y %H:%M:%S"

episodes_lazy = pl.scan_csv(META_DIR / "Episodes.csv")
episodes_lazy = episodes_lazy.filter(pl.col('CompetitionId') == COMPETITION_ID)
episodes_lazy = episodes_lazy.with_columns(
    [
        pl.col(time_column).str.to_datetime(_EPISODE_TIME_FORMAT, strict=False)
        for time_column in ("CreateTime", "EndTime")
    ]
)
episodes_df = episodes_lazy.sort("Id").collect()
episodes_df.head()

CPU times: user 4.8 s, sys: 419 ms, total: 5.22 s
Wall time: 877 ms


Id,Type,CompetitionId,CreateTime,EndTime
i64,i64,i64,datetime[μs],datetime[μs]
58076100,4,86411,2024-12-09 19:50:26,2024-12-09 19:53:03
58078323,4,86411,2024-12-09 20:18:13,2024-12-09 20:20:45
58078642,4,86411,2024-12-09 20:22:44,2024-12-09 20:25:14
58079278,1,86411,2024-12-09 20:28:05,2024-12-09 20:30:37
58079282,4,86411,2024-12-09 20:31:23,2024-12-09 20:33:59


In [5]:
%%time

# Lazily scan the per-agent episode table, pinning three numeric columns to
# Float32 instead of relying on schema inference.
agents_lazy = pl.scan_csv(
    META_DIR / "EpisodeAgents.csv",
    schema_overrides={'Reward': pl.Float32, 'UpdatedConfidence': pl.Float32, 'UpdatedScore': pl.Float32},
)

# Only agents that played in one of this competition's episodes are kept.
competition_episode_ids = episodes_df['Id'].to_list()

# For both "initial" rating columns: turn "" into null, then cast to Float64,
# keeping the original column name.
initial_rating_exprs = [
    pl.when(pl.col(rating_column) == "")
    .then(None)
    .otherwise(pl.col(rating_column))
    .cast(pl.Float64)
    .alias(rating_column)
    for rating_column in ("InitialConfidence", "InitialScore")
]

agents_df = (
    agents_lazy
    .filter(pl.col("EpisodeId").is_in(competition_episode_ids))
    .with_columns(initial_rating_exprs)
    .collect()
)
agents_df.head()

CPU times: user 22.6 s, sys: 1.36 s, total: 24 s
Wall time: 3.44 s


Id,EpisodeId,Index,Reward,State,SubmissionId,InitialConfidence,InitialScore,UpdatedConfidence,UpdatedScore
i64,i64,i64,f32,i64,i64,f64,f64,f32,f32
132077807,58080239,0,2.0,2,41706676,185.0,637.104505,170.0,573.455139
132077808,58080239,1,3.0,2,41706820,200.0,726.035886,200.0,800.425171
132077809,58080240,0,2.0,2,41706995,,,200.0,600.0
132077810,58080240,1,3.0,2,41706995,,,200.0,600.0
132078447,58080559,0,2.0,2,41706995,200.0,600.0,185.0,546.159546


In [6]:
# Latest row per submission (highest EpisodeId first, then the first row of each
# group), kept only when that row's UpdatedScore is above the threshold.
target_agents_df = (
    agents_df
    .sort('EpisodeId', descending=True)
    .group_by('SubmissionId')
    .head(1)
    .filter(pl.col("UpdatedScore") > LOWEST_SCORE_THRESH)
)

# Earliest row per submission, joined to the episode table to pick up the
# CreateTime of the first episode that submission appears in.
create_time_df = (
    agents_df
    .sort('EpisodeId', descending=False)
    .group_by('SubmissionId')
    .head(1)
    .join(episodes_df, left_on='EpisodeId', right_on='Id')
    .select(['SubmissionId', 'CreateTime'])
)

# Number of EpisodeAgents rows per submission.
# pl.len() replaces pl.count(), which is deprecated and emitted a warning here.
# NOTE(review): an episode where both seats belong to the same submission
# contributes two rows, so this can exceed the number of distinct episodes —
# confirm whether pl.col("EpisodeId").n_unique() is what is actually wanted.
num_episodes_df = (
    agents_df
    .group_by('SubmissionId')
    .agg(pl.len().alias('NumEpisodes'))
)

# One row per qualifying submission with its latest episode, latest score,
# row count, first-seen time, and the seat index it had in the latest episode.
target_agents_df = (
    target_agents_df
    .join(num_episodes_df, on='SubmissionId')
    .join(create_time_df, on='SubmissionId')
    .select(['SubmissionId', 'EpisodeId', 'UpdatedScore', 'NumEpisodes', 'CreateTime', "Index"])
)

  .agg(pl.count().alias('NumEpisodes'))


In [7]:
# Show the submissions whose latest UpdatedScore is above LOWEST_SCORE_THRESH.
target_agents_df

SubmissionId,EpisodeId,UpdatedScore,NumEpisodes,CreateTime,Index
i64,i64,f32,u32,datetime[μs],i64
41713530,58655018,2229.996826,696,2024-12-10 02:24:34,1
41721002,58595304,2897.83252,508,2024-12-10 09:12:22,0
41750602,58655018,2253.915771,684,2024-12-11 10:02:37,0
41789980,58639402,3020.480957,192,2024-12-12 18:00:24,0
41821835,58721648,2493.154053,156,2024-12-14 04:20:59,1
…,…,…,…,…,…
41862933,59054558,2445.623779,234,2024-12-15 19:53:12,1
41863713,59296719,2237.673096,544,2024-12-15 20:51:32,1
42147674,61091046,2074.076416,509,2024-12-30 16:06:43,0
42165033,61054881,2050.952881,68,2024-12-31 16:52:16,1


In [8]:
# Resolve the team name behind each qualifying submission: fetch the replay of
# its latest episode and read the name stored at the submission's seat index.
team_name_list = []
for row in tqdm(target_agents_df.iter_rows(named=True), total=len(target_agents_df)):
    ep_id = row['EpisodeId']
    team_idx = int(row['Index'])
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.post(GET_URL, json={"episodeId": int(ep_id)}, timeout=60)
    # Stop on an HTTP error instead of failing later with a confusing KeyError
    # while indexing into an error payload.
    response.raise_for_status()
    replay = response.json()
    team_name_list.append(replay['info']['TeamNames'][team_idx])

100%|██████████| 11/11 [00:25<00:00,  2.32s/it]


In [9]:
# Attach the fetched team names (same row order as target_agents_df), drop the
# columns that were only needed for the lookup, and rank by score.
# NOTE(review): this overwrites target_agents_df and removes columns that this
# cell itself reads, so it is presumably not safe to re-run on its own — confirm.
team_name_column = pl.Series('TeamName', team_name_list)
submissions_with_names = target_agents_df.with_columns(team_name_column)
submissions_trimmed = submissions_with_names.drop(['EpisodeId', "Index"])
target_agents_df = submissions_trimmed.sort('UpdatedScore', descending=True)

In [10]:
# Top ten submissions by UpdatedScore, now labelled with their team names.
target_agents_df.head(10)

SubmissionId,UpdatedScore,NumEpisodes,CreateTime,TeamName
i64,f32,u32,datetime[μs],str
41789980,3020.480957,192,2024-12-12 18:00:24,"""aDg4b"""
41721002,2897.83252,508,2024-12-10 09:12:22,"""aDg4b"""
42211368,2496.510498,61,2025-01-03 19:08:06,"""ry_andy_"""
41821835,2493.154053,156,2024-12-14 04:20:59,"""aDg4b"""
41862933,2445.623779,234,2024-12-15 19:53:12,"""ry_andy_"""
41750602,2253.915771,684,2024-12-11 10:02:37,"""Zhu Liang"""
41863713,2237.673096,544,2024-12-15 20:51:32,"""ry_andy_"""
41713530,2229.996826,696,2024-12-10 02:24:34,"""ry_andy_"""
42147674,2074.076416,509,2024-12-30 16:06:43,"""Boey"""
42165033,2050.952881,68,2024-12-31 16:52:16,"""ry_andy_"""


## Extract episode information

In [11]:
# Submissions whose episodes will be downloaded.
# NOTE(review): presumably hand-picked from the ranking table above — confirm.
TARGET_SUBMISSION_IDS = [41862933, 41863713, 41789980]
target_episodes_df = agents_df.filter(pl.col("SubmissionId").is_in(TARGET_SUBMISSION_IDS))

# Write the index next to the replays through OUTPUT_DIR rather than a second
# hard-coded 'dataset/' path, and create the directory first so the write does
# not fail when the folder does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
target_episodes_df.write_csv(OUTPUT_DIR / 'episodes.csv')

In [12]:
def _epoch_seconds(moment):
    """Convert a datetime to whole seconds since the Unix epoch (None stays None)."""
    if moment is None:
        return None
    if moment.tzinfo is None:
        # NOTE(review): the parsed timestamps carry no time zone; they are treated
        # as UTC here so the result does not depend on the local machine — confirm.
        moment = moment.replace(tzinfo=datetime.timezone.utc)
    return int(moment.timestamp())


def _float_or_none(value):
    """Cast to float, passing nulls through instead of raising on float(None)."""
    return None if value is None else float(value)


def create_info_json(epid:int) -> dict:
    """Build the metadata dict for one episode from ``episodes_df`` and ``agents_df``.

    Args:
        epid: Episode id. It must match exactly one row of ``episodes_df``
            (column ``Id``); its agents are the ``agents_df`` rows whose
            ``EpisodeId`` equals it.

    Returns:
        A dict with keys ``id``, ``competitionId``, ``createTime`` and
        ``endTime`` (each ``{"seconds": <epoch seconds or None>}``) and
        ``agents`` (one dict per agent row, ordered by ``Index``). Null score,
        confidence and reward values are emitted as ``None``.

    Raises:
        ValueError: if ``epid`` does not match exactly one episode row
            (raised by ``Series.item()``).
    """
    # The episode table's key column is 'Id'; 'EpisodeId' only exists on the agents table.
    episode_row = episodes_df.filter(pl.col('Id') == epid)
    # .item() yields a datetime for these columns, so convert it explicitly
    # rather than dividing as if it were a nanosecond integer.
    create_seconds = _epoch_seconds(episode_row['CreateTime'].item())
    end_seconds = _epoch_seconds(episode_row['EndTime'].item())

    agents_df_filtered = agents_df.filter(pl.col('EpisodeId') == epid).sort('Index')

    agents = []
    for row in agents_df_filtered.iter_rows(named=True):
        agent = {
            "id": int(row["Id"]),
            "state": int(row["State"]),
            "submissionId": int(row['SubmissionId']),
            "reward": _float_or_none(row['Reward']),
            "index": int(row['Index']),
            # Initial values are null for some rows, so they cannot be cast blindly.
            "initialScore": _float_or_none(row['InitialScore']),
            "initialConfidence": _float_or_none(row['InitialConfidence']),
            "updatedScore": _float_or_none(row['UpdatedScore']),
            "updatedConfidence": _float_or_none(row['UpdatedConfidence']),
            # Constant placeholder; no team id is read from the tables here.
            "teamId": int(99999)
        }
        agents.append(agent)

    info = {
        "id": int(epid),
        "competitionId": COMPETITION_ID,
        "createTime": {
            "seconds": create_seconds
        },
        "endTime": {
            "seconds": end_seconds
        },
        "agents": agents
    }

    return info

In [13]:
def saveEpisode(epid:int, sub_id:int) -> None:
    """Download one episode replay and store it as ``OUTPUT_DIR/{sub_id}_{epid}.json``.

    Args:
        epid: Id of the episode whose replay is requested from ``GET_URL``.
        sub_id: Submission id; only used as the file-name prefix.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
        ValueError: if the response body is not valid JSON.
    """
    # request (with a timeout so one stalled connection cannot hang the run)
    response = requests.post(GET_URL, json={"episodeId": int(epid)}, timeout=60)
    # Fail loudly on 4xx/5xx instead of writing an error payload to disk as a replay.
    response.raise_for_status()
    replay = response.json()

    # save replay (make sure the target directory exists first)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_DIR / f'{sub_id}_{epid}.json', 'w') as f:
        json.dump(replay, f)

In [14]:
# Download the replays of every target submission, throttled to about one
# request per second and capped at EPISODE_LIMIT_SIZE episodes in total.
start_time = datetime.datetime.now()
episode_count = 0
limit_reached = False
for _sub_id, df in target_episodes_df.group_by('SubmissionId'):
    sub_id = _sub_id[0]
    # unique(): a submission can occupy more than one seat in the same episode.
    ep_ids = df['EpisodeId'].unique()
    for epid in ep_ids:
        # Check the cap before downloading so exactly EPISODE_LIMIT_SIZE episodes
        # are fetched at most, and remember it so the outer loop stops as well
        # (a bare `break` would only leave this inner loop).
        if episode_count >= EPISODE_LIMIT_SIZE:
            limit_reached = True
            break

        saveEpisode(epid, sub_id)
        episode_count += 1
        try:
            # Used only as an existence check for the file that was just written.
            os.path.getsize(OUTPUT_DIR / f'{sub_id}_{epid}.json')
            print(str(episode_count) + f': saved episode #{epid}')
        except OSError:
            print(f'  file {sub_id}_{epid}.json did not seem to save')

        # process 1 episode/sec
        # total_seconds() keeps counting past 24h, unlike timedelta.seconds.
        spend_seconds = int((datetime.datetime.now() - start_time).total_seconds())
        if episode_count > spend_seconds:
            time.sleep(episode_count - spend_seconds)

    print(f'Episodes saved: {episode_count}')
    if limit_reached:
        break

1: saved episode #58801530
2: saved episode #58802584
3: saved episode #58802932
4: saved episode #58803281
5: saved episode #58803635
6: saved episode #58803987
7: saved episode #58804337
8: saved episode #58804688
9: saved episode #58805040
10: saved episode #58805742
11: saved episode #58806094
12: saved episode #58806444
13: saved episode #58806796
14: saved episode #58807154
15: saved episode #58807503
16: saved episode #58807859
17: saved episode #58808206
18: saved episode #58808558
19: saved episode #58808915
20: saved episode #58808916
21: saved episode #58809614
22: saved episode #58809971
23: saved episode #58810325
24: saved episode #58810677
25: saved episode #58811024
26: saved episode #58811378
27: saved episode #58811734
28: saved episode #58812085
29: saved episode #58812436
30: saved episode #58812787
31: saved episode #58813142
32: saved episode #58813498
33: saved episode #58813852
34: saved episode #58814207
35: saved episode #58814563
36: saved episode #58814916
3