# RPS Episode Archive Dataset - EDA

This notebook explores the [RPS Episode Archive Dataset](https://www.kaggle.com/tonyrobinson/rps-episode).

In [None]:
# Count the episode JSON files found under the dataset directory.
!find /kaggle/input/rps-episode/ -name '*.json' | wc -l 

In [None]:
# Optional tooling, left disabled (none of the commands below use it):
# !apt install -qy moreutils 
# Pretty-print a single episode file with `python -m json.tool` and show only
# its first 50 and last 10 lines, to get a feel for the structure.
!cat /kaggle/input/rps-episode/5847716.json | PYTHONUNBUFFERED=1 python -m json.tool | head -n 50
!cat /kaggle/input/rps-episode/5847716.json | PYTHONUNBUFFERED=1 python -m json.tool | tail -n 10

In [None]:
import json
import glob
from joblib import Parallel, delayed
from collections import defaultdict
from operator import itemgetter

In [None]:
%%time

def extract_json(filename):
    """Load one episode file and return each team's sequence of actions.

    Parameters
    ----------
    filename : str or path-like
        Path to an episode JSON file containing `info.TeamNames` and `steps`.

    Returns
    -------
    dict
        Maps each team name to the list of `action` values taken by the agent
        in the matching slot of every step (team i <-> step[i]), in step order.

    Notes
    -----
    * Steps whose first agent reports observation `step` == 0 (or no `step`
      key at all) are skipped - presumably the initial state that carries no
      real move; verify against the episode format.
    * If both entries of `TeamNames` are the same name, they share a single
      key and that list receives both agents' actions for every step.
    """
    with open(filename) as file:
        data = json.load(file)

    teams  = data['info']['TeamNames']
    output = { team: [] for team in teams }
    for step in data['steps']:
        if step[0]['observation'].get('step', 0) == 0:
            continue
        # Pair each team with its own agent slot exactly once per step.
        # Looping over both (p1, p2) orderings here would append every
        # agent's action to both teams' lists.
        for team, agent in zip(teams, step):
            output[team].append( agent['action'] )
    return output
        
# Collect every episode file. glob returns paths in arbitrary order, so sort them:
# later cells index into `dataset` by position and should see the same game on every run.
filenames = sorted(glob.glob('/kaggle/input/rps-episode/*.json'))
# Parse all episodes in parallel (n_jobs=-1 uses every available core).
# For a quick serial check: [ extract_json(filename) for filename in filenames[:10] ]
dataset   = Parallel(n_jobs=-1)( delayed(extract_json)(filename) for filename in filenames )

In [None]:
# Peek at the first parsed game: at most four teams (names padded to 10 characters),
# each with the first ten entries of its action list.
dict(
    (f'{team:10s}', actions[:10])
    for team, actions in list(dataset[0].items())[:4]
)

# First, Second and Third Move Response

In [None]:
# %%time

# Opening statistics, tallied once per player per game:
#   first_move_frequency[a]        - the player's first recorded action is `a`
#   second_move_frequency[a][b]    - ... and the opponent's second recorded action is `b`
#   third_move_frequency[a][b][c]  - ... and the player's third recorded action is `c`
first_move_frequency  = defaultdict(int)
second_move_frequency = defaultdict(lambda: defaultdict(int))
third_move_frequency  = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

for game in dataset:
    teams = list(game.keys())
    # zip(teams, reversed(teams)) already yields both (player, opponent) orderings,
    # so each pass records the p1 perspective only; updating both sides in every
    # pass would count each game twice.
    for p1, p2 in zip(teams, reversed(teams)):
        moves, replies = game[p1], game[p2]
        # Games that are too short contribute only the prefix they have
        # (explicit guards instead of a blanket `except: pass`).
        if len(moves) < 1:
            continue
        first_move_frequency[ moves[0] ] += 1
        if len(replies) < 2:
            continue
        second_move_frequency[ moves[0] ][ replies[1] ] += 1
        if len(moves) < 3:
            continue
        third_move_frequency[ moves[0] ][ replies[1] ][ moves[2] ] += 1
        
def dictify(obj):
    """Recursively copy nested dicts (including defaultdicts) into plain dicts.

    Keys 0, 1 and 2 are emitted first, in that order, when present; all other
    keys follow in their existing iteration order. Non-dict values are
    returned unchanged.
    """
    if not isinstance(obj, dict):
        return obj
    ordered_keys = [move for move in (0, 1, 2) if move in obj]
    ordered_keys += [key for key in obj if key not in ordered_keys]
    return {key: dictify(obj[key]) for key in ordered_keys}
    
# Print the three tallies as plain dicts, one labelled line per table.
print(*(
    f'{label} {dictify(table)}'
    for label, table in (
        ('first_move_frequency  = ', first_move_frequency),
        ('second_move_frequency = ', second_move_frequency),
        ('third_move_frequency  = ', third_move_frequency),
    )
), sep='\n')

# First Move Frequency By Team

In [None]:
# Tally each team's first recorded action across all games. One pass over
# game.items() visits every team exactly once per game.
first_move_counts_by_team = defaultdict(lambda: defaultdict(int))
for game in dataset:
    for team, moves in game.items():
        # A team with no recorded actions has no first move to count.
        if moves:
            first_move_counts_by_team[team][ moves[0] ] += 1

# Convert the raw counts into per-team shares, rounded to 3 decimals.
first_move_by_team = {}
for team, counts in first_move_counts_by_team.items():
    total = sum(counts.values())
    first_move_by_team[team] = { move: round(count/total, 3) for move, count in counts.items() }

# One line per team, sorted by team name.
for team, ratios in sorted(first_move_by_team.items(), key=itemgetter(0)):
    print(f'{team:>25s}', dictify(ratios))