# Generate the files

## Requirements

- [rustup](https://rustup.rs/)
- [python3](https://www.python.org/downloads/)

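Optionally, create a Python virtual environment first so that the `pip install` step below does not modify the system-wide packages.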

## Step by Step
First, locate the directory where the replays are stored. In my case I dual-boot and mount Windows at `/mnt/windows`.

The `cargo run` step takes around 32 seconds to process 3600 replay files, i.e. about 110 replays per second on a 16-core machine (the run shown below processed 12059 files in 100 seconds, a similar rate).
```bash
$ git clone https://github.com/sebosp/s2protocol-rs
$ cd s2protocol-rs
$ mkdir ipcs/
$ cargo run --features arrow,syntax -r -- -v error --timing --source /home/seb/SCReplaysOnNVMe --output /home/seb/git/s2protocol-rs/ipcs/ write-arrow-ipc --process-max-files 10000000
12059 files have valid init data, processing...
Total time: 100s
$ du -sh ipcs/*
572M    ipcs/cmd_target_point.ipc 
264M    ipcs/cmd_target_unit.ipc 
3.5M    ipcs/details.ipc  
38M     ipcs/lobby_init_data.ipc 
867M    ipcs/stats.ipc 
6.0G    ipcs/unit_born.ipc 
4.0G    ipcs/unit_died.ipc  
43M     ipcs/upgrades.ipc 
36M     ipcs/user_init_data.ipc
$ cd ../s2-polars-data-analysis/
$ pip install -r requirements.txt
$ jupyter lab --notebook-dir=jupyter_notebooks/
# Open the URL printed in the terminal; this opens this notebook for interacting with the data.
```


In [1]:
# import datashader as ds
import os

import plotly.express as px
import polars as pl

# Widen the polars table rendering so long file names and wide frames stay readable.
pl.Config.set_tbl_width_chars(256)
pl.Config.set_fmt_str_lengths(256)
pl.Config.set_tbl_rows(24)

# Directory holding the IPC files generated by the `write-arrow-ipc` step above.
# Set the S2_IPC_DIR environment variable to point somewhere else; the fallback
# is the output directory used in the example command.
ipc_dir = os.environ.get("S2_IPC_DIR", "/home/seb/git/s2protocol-rs/ipcs")

# Every table is opened with `scan_ipc`, i.e. as a LazyFrame; data is only
# materialised when a cell calls `.collect()`.
#
# Beacon*: default targets of hatcheries and buildings point to Beacons, as do
#          clicks on the map and the attack/defend points that teammates send
#          each other on the map.
#          The filters that would drop them are kept commented out below, so
#          Beacon rows are currently NOT filtered out.
unit_born_df = pl.scan_ipc(f"{ipc_dir}/unit_born.ipc")
# .filter((~pl.col("unit_type_name").str.starts_with("Beacon")))
unit_died_df = pl.scan_ipc(f"{ipc_dir}/unit_died.ipc")
# .filter((~pl.col("unit_died_name").str.starts_with("Beacon")))
stats_df = pl.scan_ipc(f"{ipc_dir}/stats.ipc")
upgrades_df = pl.scan_ipc(f"{ipc_dir}/upgrades.ipc")
user_init_data_df = pl.scan_ipc(f"{ipc_dir}/user_init_data.ipc")
lobby_slot_init_data_df = pl.scan_ipc(f"{ipc_dir}/lobby_init_data.ipc")
details_df = pl.scan_ipc(f"{ipc_dir}/details.ipc")
# Last expression of the cell: display the schema of the lobby slot table.
lobby_slot_init_data_df.collect_schema()

Schema([('ext_fs_id', UInt64),
        ('ext_fs_sha256', String),
        ('ext_fs_file_name', String),
        ('control', Int64),
        ('user_id', Int64),
        ('team_id', Int64),
        ('observe', UInt8),
        ('working_set_slot_id', UInt8),
        ('map_size_x', UInt8),
        ('map_size_y', UInt8)])

In [2]:
# Observations about the lobby_slot_init_data columns:
#   observe             : 0, 1, 2 -- 0 = player
#   control             : 0, 1, 2, 3 -- unsure
#   user_id             : null, 0, 1, 2, 3 -- unsure
#   team_id             : 0, 1 -- seems to be for players
#   working_set_slot_id : 0..15 -- unsure (are these maybe the slots of the
#                         16 players that can be in a lobby?)
#   map_size_x          : 0? 1..16 possible values of size_x
#   map_size_y          : 0? 1.. many more than 16 possible values? does not
#                         match size_x, hmm
#
# Frequency of each working_set_slot_id value, most frequent first.
(
    lobby_slot_init_data_df
    .select(pl.col("working_set_slot_id").value_counts(sort=True))
    .collect()
)

working_set_slot_id
struct[2]
"{15,12059}"
"{14,12059}"
"{13,12059}"
"{12,12059}"
"{8,12059}"
"{0,12059}"
"{11,12058}"
"{10,12058}"
"{9,12058}"
"{4,12057}"


In [3]:
# Show the column names and dtypes of the lazily scanned user_init_data table.
user_init_data_df.collect_schema()

Schema([('ext_fs_id', UInt64),
        ('ext_fs_sha256', String),
        ('ext_fs_file_name', String),
        ('name', String),
        ('clan_tag', String),
        ('scaled_rating', Int32)])

In [4]:
# Frequency of each user name in user_init_data, most frequent first.
(
    user_init_data_df
    .select(pl.col("name").value_counts(sort=True))
    .collect()
)

name
struct[2]
"{""�"",157010}"
"{""Sazed"",5052}"
"{""Doombringer"",3225}"
"{""ESLRefereeB"",892}"
"{""EnObsA"",822}"
"{""EnObsB"",821}"
"{""AfreecaTV"",802}"
"{""Indy"",588}"
"{""ESLRefereeC"",485}"
"{""ENObsA"",404}"


In [5]:
# Lobby slots of a single replay (ext_fs_id 18780), restricted to rows with observe == 0.
lobby_slot_init_data_df.filter(
    (pl.col("ext_fs_id") == 18780) & (pl.col("observe") == 0)
).collect()

ext_fs_id,ext_fs_sha256,ext_fs_file_name,control,user_id,team_id,observe,working_set_slot_id,map_size_x,map_size_y
u64,str,str,i64,i64,i64,u8,u8,u8,u8
18780,"""0fa2a90d3c75546dd401130e75c227c5a8d62e12bb0aa8efe432edaafb53493e""","""/home/seb/SCReplaysOnNVMe/[2023 GSL S2] Ro.8 Group B Match1 Dark vs TY/1SET [2023 GSL S2] Ro.8 Group B Match1 Dark vs TY.SC2Replay""",2,2,1,0,2,184,178
18780,"""0fa2a90d3c75546dd401130e75c227c5a8d62e12bb0aa8efe432edaafb53493e""","""/home/seb/SCReplaysOnNVMe/[2023 GSL S2] Ro.8 Group B Match1 Dark vs TY/1SET [2023 GSL S2] Ro.8 Group B Match1 Dark vs TY.SC2Replay""",2,3,0,0,3,184,178


In [6]:
# Show the column names and dtypes of the lazily scanned details table.
details_df.collect_schema()

Schema([('player_name', String),
        ('player_toon_region', UInt8),
        ('player_toon_program_id', UInt32),
        ('player_toon_realm', UInt32),
        ('player_toon_id', UInt64),
        ('player_race', String),
        ('player_color_a', UInt8),
        ('player_color_r', UInt8),
        ('player_color_g', UInt8),
        ('player_color_b', UInt8),
        ('player_control', UInt8),
        ('player_team_id', UInt8),
        ('player_observe', UInt8),
        ('player_result', UInt8),
        ('player_working_set_slot_id', UInt8),
        ('player_hero', String),
        ('title', String),
        ('is_blizzard_map', Boolean),
        ('time_utc', Int64),
        ('time_local_offset', Int64),
        ('ext_fs_id', UInt64),
        ('ext_datetime', Datetime(time_unit='ns', time_zone=None))])

In [7]:
# Observations about the details columns:
#   player_name                : contains the clan
#   player_race                : Protoss, Terran, but also hanzi (神族), korean? (테란)
#                                and strings like "Primal Zerg" or "Terraner" or ""
#   player_team_id             : either 0 or 1
#   player_observe             : always 0
#   player_result              : can be 1, 2, 0 and 3
#                                1 and 2 are win, loss (not sure which is which)
#                                0 is probably undecided (or maybe the status of an observer?)
#                                3 is probably a tie (there are only 50 games with this
#                                status in this 12K replay dataset)
#   player_working_set_slot_id : null, 0..15
#                                not the same as lobby_slot_init_data_df working_set_slot_id
#   player_hero                : always ""
#   title                      : the name of the map
#   is_blizzard_map            : seems filled
#   time_utc                   : that weird "epoch" after 2000 or something
#   time_local_offset          : +/- in millis (or nanos?)
#   NOTE(review): the sample values shown in this notebook (time_utc 133343197785450190 next to
#   an ext_datetime in July 2023, time_local_offset 324000000000) would be consistent with
#   100 ns ticks counted from 1601 (Windows FILETIME) -- TODO confirm.
#
# Frequency of each player_name value, most frequent first.
(
    details_df
    .select(pl.col("player_name").value_counts(sort=True))
    .collect()
)

player_name
struct[2]
"{""&lt;Mealen&gt;<sp/>Doombringer"",3225}"
"{""A.I. 2 (Elite)"",3060}"
"{""A.I. 3 (Elite)"",3004}"
"{""&lt;chezs&gt;<sp/>Sazed"",2406}"
"{""Sazed"",1880}"
"{""Cheater 2 (Insane)"",968}"
"{""Cheater 1 (Insane)"",818}"
"{""&lt;ƖIıIƖ&gt;<sp/>Sazed"",744}"
"{""Cheater 3 (Insane)"",189}"
"{""&lt;Mealen&gt;<sp/>Archimonde"",186}"


In [8]:
# Details rows of a single replay, selected by its ext_fs_id (18780).
details_df.filter(pl.col("ext_fs_id") == 18780).collect()

player_name,player_toon_region,player_toon_program_id,player_toon_realm,player_toon_id,player_race,player_color_a,player_color_r,player_color_g,player_color_b,player_control,player_team_id,player_observe,player_result,player_working_set_slot_id,player_hero,title,is_blizzard_map,time_utc,time_local_offset,ext_fs_id,ext_datetime
str,u8,u32,u32,u64,str,u8,u8,u8,u8,u8,u8,u8,u8,u8,str,str,bool,i64,i64,u64,datetime[ns]
"""TYTY""",3,21298,1,2341765,"""Terran""",255,180,20,30,2,1,0,2,2,"""""","""[ESL] Royal Blood""",False,133343197785450190,324000000000,18780,2023-07-20 00:42:58.545019
"""Dark""",3,21298,1,2333312,"""Zerg""",255,0,66,255,2,0,0,1,3,"""""","""[ESL] Royal Blood""",False,133343197785450190,324000000000,18780,2023-07-20 00:42:58.545019


In [9]:
# Show the column names and dtypes of the lazily scanned upgrades table.
upgrades_df.collect_schema()

Schema([('player_id', UInt8),
        ('name', String),
        ('count', Int32),
        ('ext_replay_loop', Int64),
        ('ext_replay_seconds', UInt32),
        ('ext_fs_id', UInt64)])

In [10]:
# Observations about the upgrades columns:
#   player_id : 0..15
#               0 has only 20 occurrences and they come from the campaign
#               from 1 onwards it is actual players
#   name      : the name of the upgrade
#
# Frequency of each upgrade name, most frequent first.
(
    upgrades_df
    .select(pl.col("name").value_counts(sort=True))
    .collect()
)

name
struct[2]
"{""SprayZerg"",182513}"
"{""SprayProtoss"",83708}"
"{""SprayTerran"",78628}"
"{""RewardDanceOverlord"",15137}"
"{""RewardDanceRoach"",14789}"
"{""RewardDanceInfestor"",14196}"
"{""zerglingmovementspeed"",13069}"
"{""RewardDanceMule"",11240}"
"{""RewardDanceStalker"",10828}"
"{""RewardDanceViking"",10671}"


In [11]:
# Show the column names and dtypes of the lazily scanned stats table.
stats_df.collect_schema()

Schema([('player_id', UInt8),
        ('minerals_current', Int32),
        ('vespene_current', Int32),
        ('minerals_collection_rate', Int32),
        ('vespene_collection_rate', Int32),
        ('workers_active_count', Int32),
        ('minerals_used_in_progress_army', Int32),
        ('minerals_used_in_progress_economy', Int32),
        ('minerals_used_in_progress_technology', Int32),
        ('vespene_used_in_progress_army', Int32),
        ('vespene_used_in_progress_economy', Int32),
        ('vespene_used_in_progress_technology', Int32),
        ('minerals_used_current_army', Int32),
        ('minerals_used_current_economy', Int32),
        ('minerals_used_current_technology', Int32),
        ('vespene_used_current_army', Int32),
        ('vespene_used_current_economy', Int32),
        ('vespene_used_current_technology', Int32),
        ('minerals_lost_army', Int32),
        ('minerals_lost_economy', Int32),
        ('minerals_lost_technology', Int32),
        ('vespene_lost_

In [41]:
# Observations about the stats columns:
#   player_id                : from 0..14, mostly 0 and 1; not sure which matches
#                              with working_set_slot_id
#   minerals_current         : self-explanatory, at a given ext_replay_loop for a
#                              specific user_id
#   vespene_current          : same as minerals_current
#   minerals_collection_rate : unsure, seems to go from 0 to 14Ks, 1566 mean; maybe per minute?
#   vespene_collection_rate  : from 0 to 4Ks, 549 mean; as with minerals_collection_rate?
#   workers_active_count     : from 0 to 200 (oddly)
#
# The following fields are of "resource" type:
#   - Minerals
#   - Vespene
# and for each resource there is one stored field per "bucket":
#   - Army
#   - Economy
#   - Technology
#
#   <resource>_used_in_progress_<bucket>
#   <resource>_used_current_<bucket>
#   <resource>_lost_<bucket>
#   <resource>_killed_<bucket>
#   <resource>_friendly_fire_<bucket>
#
# And then two more:
#   <resource>_used_active_forces
#
# Frequency of each minerals_used_in_progress_economy value, most frequent first.
(
    stats_df
    .select(pl.col("minerals_used_in_progress_economy").value_counts(sort=True))
    .collect()
)

minerals_used_in_progress_economy
struct[2]
"{0,1584516}"
"{100,396428}"
"{50,332799}"
"{150,319089}"
"{400,282838}"
"{350,229885}"
"{200,195525}"
"{450,182421}"
"{500,167061}"
"{550,100406}"


In [14]:
# Show the column names and dtypes of the lazily scanned unit_born table.
unit_born_df.collect_schema()

Schema([('unit_tag_index', UInt32),
        ('unit_tag_recycle', UInt32),
        ('unit_type_name', String),
        ('control_player_id', UInt8),
        ('upkeep_player_id', UInt8),
        ('x', Float32),
        ('y', Float32),
        ('creator_unit_tag_index', UInt32),
        ('creator_unit_tag_recycle', UInt32),
        ('creator_unit_type_name', String),
        ('creator_ability_name', String),
        ('ext_replay_loop', Int64),
        ('ext_replay_seconds', UInt32),
        ('ext_fs_id', UInt64)])

In [34]:
# Observations about the unit_born columns:
#   unit_tag_index,
#   unit_tag_recycle     : used to calculate the unique id of a unit
#   unit_type_name       : name of the unit: drone, probe, mineralfield, etc.
#   control_player_id    : either null or 0..15
#                          null (almost no instances in this data set) could be the
#                          campaign, stuff like K5Kerrigan
#                          0 is the system creating mineral fields, geysers, etc.
#                          1..15 are the actual users playing
#   upkeep_player_id     : either null or 0..15
#                          as above, 0 is the system creating mineral fields, geysers, etc.
#                          1..15 are the actual users playing
#   creator_ability_name : Larva, Cocoons, OrbitalCommands, Barracks, Nexus, Lair,
#                          creating units
#
# Frequency of each creator_unit_type_name value, most frequent first.
(
    unit_born_df
    .select(pl.col("creator_unit_type_name").value_counts(sort=True))
    .collect()
)

creator_unit_type_name
struct[2]
"{""LurkerMPBurrowed"",38080674}"
"{""Egg"",8223231}"
"{null,7697656}"
"{""Larva"",3724119}"
"{""Hatchery"",3247065}"
"{""BroodLord"",2691123}"
"{""BroodlingEscort"",1041326}"
"{""Barracks"",702789}"
"{""Nexus"",639436}"
"{""Lair"",542361}"


In [35]:
# Show the column names and dtypes of the lazily scanned unit_died table.
unit_died_df.collect_schema()

Schema([('unit_died_name', String),
        ('unit_tag_index', UInt32),
        ('unit_tag_recycle', UInt32),
        ('killer_player_id', UInt8),
        ('x', UInt8),
        ('y', UInt8),
        ('unit_killer_name', String),
        ('killer_unit_tag_index', UInt32),
        ('killer_unit_tag_recycle', UInt32),
        ('ext_replay_loop', Int64),
        ('ext_replay_seconds', UInt32),
        ('ext_fs_id', UInt64)])

In [37]:
# Observations about the unit_died columns:
#   unit_died_name   : name of the units dying.
#                      Most unit names are InvisibleTargetDummy.
#                      It seems that when lurkers or other units A-move, a temporary
#                      target dummy is created at that location, which the unit
#                      then "attacks".
#                      Q: Are tank shots on InvisibleTargetDummy? There is splash
#                      radius/damage somewhere, maybe?
#   unit_killer_name : the killer unit, if found.
#
# Frequency of each unit_killer_name value, most frequent first.
(
    unit_died_df
    .select(pl.col("unit_killer_name").value_counts(sort=True))
    .collect()
)

unit_killer_name
struct[2]
"{""LurkerMPBurrowed"",38470078}"
"{"""",9756594}"
"{""Zergling"",557942}"
"{""Marine"",499037}"
"{""Hydralisk"",433068}"
"{""SiegeTankSieged"",414316}"
"{""Stalker"",405371}"
"{""Roach"",399433}"
"{""Zealot"",292009}"
"{""BroodlingEscort"",255052}"


In [42]:
# Show the column names and dtypes of the lazily scanned user_init_data table.
user_init_data_df.collect_schema()

Schema([('ext_fs_id', UInt64),
        ('ext_fs_sha256', String),
        ('ext_fs_file_name', String),
        ('name', String),
        ('clan_tag', String),
        ('scaled_rating', Int32)])

In [49]:
# Observations about the user_init_data columns:
#   name          : the name of the player
#   clan_tag      : the name of the clan
#   scaled_rating : can be null (178K occurrences), negative (-36400?!?!),
#                   or a value that seems like MMR
#
# Frequency of each scaled_rating value, most frequent first.
(
    user_init_data_df
    .select(pl.col("scaled_rating").value_counts(sort=True))
    .collect()
)

scaled_rating
struct[2]
"{null,178724}"
"{-36400,6221}"
"{3600,246}"
"{2800,45}"
"{3882,31}"
"{3876,26}"
"{3880,25}"
"{3852,25}"
"{3910,25}"
"{3859,25}"


In [50]:
# Show the column names and dtypes of the lazily scanned lobby slot table.
lobby_slot_init_data_df.collect_schema()

Schema([('ext_fs_id', UInt64),
        ('ext_fs_sha256', String),
        ('ext_fs_file_name', String),
        ('control', Int64),
        ('user_id', Int64),
        ('team_id', Int64),
        ('observe', UInt8),
        ('working_set_slot_id', UInt8),
        ('map_size_x', UInt8),
        ('map_size_y', UInt8)])

In [52]:
# Observations about the lobby_slot_init_data columns:
#   control : has values 0, 1, 2, 3; most rows are 1 (89K occurrences),
#             followed by 0 (57K occurrences)
#
# Frequency of each user_id value, most frequent first.
(
    lobby_slot_init_data_df
    .select(pl.col("user_id").value_counts(sort=True))
    .collect()
)

user_id
struct[2]
"{null,155728}"
"{0,12059}"
"{1,10938}"
"{2,3422}"
"{3,3033}"
"{4,2220}"
"{5,1637}"
"{6,1300}"
"{7,519}"
"{8,389}"
