This is an edited version of David NQ's Halite Game Scraper at https://www.kaggle.com/david1013/halite-game-scraper

Kaggle's API limit for Google Football has not yet been made explicit, but in Kaggle Halite the limit of 1,000 requests per day was eventually raised to a maximum of 3,600 requests per day.

Rate limits are shared between the ListEpisodes and GetEpisodeReplay endpoints. Repeatedly exceeding the limits will lead to temporary and then permanent bans. Kaggle is expected to remove this public API at some point and provide datasets of episodes.

The episodes take a lot of space. In Kaggle Halite, I ended up with 200GB of games. The Football JSON files are **ten times larger** so you may end up with terabytes. If you use this or any scraper, consider posting the dataset to Kaggle Datasets for others to use.

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
import datetime
import time


In [2]:
# Score threshold applied in the download loop: only episodes whose
# `final_score` (computed in getTeamEpisodes) exceeds this value are kept.
MIN_FINAL_RATING = 500 # top submission in a match must have reached this score
# Running count of API requests issued by the download loop; the loop stops
# once this reaches EPISODES.
num_api_calls_today = 0


In [3]:
# Build the list of episode ids that already exist under ../input/ so the
# download loop can skip them.
all_files = []
for root, dirs, files in os.walk('../input/', topdown=False):
    all_files += files

# A file counts as a saved replay when its name is "<digits>.json..." --
# i.e. the part before the first dot is all digits and the part right after
# it is "json". Companion "<id>_info.json" files fail the digit test.
seen_episodes = []
for fname in all_files:
    pieces = fname.split('.')
    if len(pieces) < 2:
        continue
    stem, first_ext = pieces[0], pieces[1]
    if stem.isdigit() and first_ext == 'json':
        seen_episodes.append(int(stem))

print('{} games in existing library'.format(len(seen_episodes)))


42 games in existing library


In [4]:
# EPISODES is divided by NUM_TEAMS to get the per-team download target.
NUM_TEAMS = 1
# Budget for one run: the download loop stops once the API call counter
# reaches this value, and it also caps the number of episodes saved.
EPISODES = 600 

# Starting value of the request counter that the download loop compares with
# elapsed seconds to throttle itself.
BUFFER = 1

# Kaggle episode service endpoints: GetEpisodeReplay returns one replay,
# ListEpisodes returns the teams and episodes for a team id.
base_url = "https://www.kaggle.com/requests/EpisodeService/"
get_url = base_url + "GetEpisodeReplay"
list_url = base_url + "ListEpisodes"

In [5]:
# initial team list
# Seed teams_df with the teams returned by a single ListEpisodes request.
# NOTE: this request is not added to num_api_calls_today.

# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.post(list_url, json = {"teamId":  5586412}, timeout = 60) # arbitrary ID, change to leading ID during challenge
# Surface HTTP errors (e.g. when rate-limited) here instead of as a confusing
# JSON/KeyError failure on the lines below.
r.raise_for_status()

rj = r.json()

teams_df = pd.DataFrame(rj['result']['teams'])

In [6]:
# Order teams by public leaderboard rank (ascending, in place); the download
# loop always takes the first not-yet-processed team from this frame.
teams_df.sort_values('publicLeaderboardRank', inplace = True)
# Notebook display of the first six rows.
teams_df.head(6)

Unnamed: 0,id,teamName,competitionId,teamLeaderId,isBenchmark,createdAfterDeadline,medal,dateMedalAwarded,submissionCount,lastSubmissionDate,publicLeaderboardSubmissionId,publicLeaderboardScore,publicLeaderboardRank,privateLeaderboardSubmissionId,privateLeaderboardScore,privateLeaderboardRank,competition,teamMembers,inboundMergeRequests,outboundMergeRequests
4,5587034,Sasha Korekov,21723,531361,False,False,0,,1,"{'seconds': 1601325057, 'nanos': 903333300}",17361502,,10,,,,,"[{'id': 531361, 'displayName': 'Sasha Korekov'...",[],[]
11,5589130,whoami,21723,5859899,False,False,0,,1,"{'seconds': 1601359548, 'nanos': 716666700}",17366006,,13,,,,,"[{'id': 5859899, 'displayName': 'whoami', 'ema...",[],[]
9,5588284,Cory Hisey,21723,5701569,False,False,0,,1,"{'seconds': 1601346164, 'nanos': 420000000}",17364232,,46,,,,,"[{'id': 5701569, 'displayName': 'Cory Hisey', ...",[],[]
8,5588103,Kha Vo,21723,1829450,False,False,0,,1,"{'seconds': 1601344159, 'nanos': 436666700}",17363942,,51,,,,,"[{'id': 1829450, 'displayName': 'Kha Vo', 'ema...",[],[]
5,5587407,CebadoresFC,21723,2210134,False,False,0,,2,"{'seconds': 1601332909, 'nanos': 483333300}",17361720,,59,,,,,"[{'id': 40364, 'displayName': 'Federico Pousa'...",[],[]
7,5587824,Anwesh Satapathy,21723,820676,False,False,0,,1,"{'seconds': 1601341131, 'nanos': 310000000}",17363569,,71,,,,,"[{'id': 820676, 'displayName': 'Anwesh Satapat...",[],[]


In [7]:
def getTeamEpisodes(team_id, timeout=60):
    """Fetch a team's episodes and merge newly seen teams into ``teams_df``.

    Issues one ListEpisodes request for ``team_id``. Teams in the response
    whose ``id`` is not yet in the global ``teams_df`` are appended to it, and
    ``teams_df`` is re-sorted by ``publicLeaderboardRank``.

    Args:
        team_id: Id of the team whose episodes are requested.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A ``(rj, team_episodes)`` tuple. ``rj`` is the decoded JSON response.
        ``team_episodes`` is a DataFrame of the response's episodes, sorted by
        ``avg_score`` descending, with these added columns:

        * ``avg_score``: mean ``updatedScore`` over the episode's agents that
          have one, or -1 when none do.
        * ``submissionId`` / ``updatedScore``: values of the agent that
          belongs to ``team_id``.
        * ``final_score``: per submission, the highest ``updatedScore`` among
          its episodes with the latest ``endTime``.

        When the team has no episodes, an empty DataFrame that still carries
        the ``id``, ``submissionId``, ``updatedScore``, ``avg_score`` and
        ``final_score`` columns is returned.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        IndexError: If an episode has no agent belonging to ``team_id``.
    """
    global teams_df

    # request
    response = requests.post(list_url, json={"teamId": int(team_id)}, timeout=timeout)
    response.raise_for_status()
    rj = response.json()

    # update teams list
    teams_df_new = pd.DataFrame(rj['result']['teams'])
    if list(teams_df.columns) == list(teams_df_new.columns):
        # De-duplicate on the team id. Both frames carry a positional index,
        # so comparing index labels would not identify which teams are new.
        unseen = teams_df_new[~teams_df_new['id'].isin(teams_df['id'])]
        if not unseen.empty:
            teams_df = pd.concat((teams_df, unseen), ignore_index=True)
        teams_df.sort_values('publicLeaderboardRank', inplace=True)
    else:
        print('teams dataframe did not match')

    # make df
    team_episodes = pd.DataFrame(rj['result']['episodes'])
    if team_episodes.empty:
        # No episodes for this team: hand back an empty frame that still has
        # the columns the download loop filters on.
        empty_cols = ['id', 'submissionId', 'updatedScore', 'avg_score', 'final_score']
        return rj, pd.DataFrame(columns=empty_cols)

    # Float sentinel so later float means do not change the column dtype.
    team_episodes['avg_score'] = -1.0

    for idx, agents in team_episodes['agents'].items():
        # The agent entry that belongs to the requested team.
        own_agent = [a for a in agents if a['submission']['teamId'] == team_id][0]
        team_episodes.loc[idx, 'submissionId'] = own_agent['submissionId']
        team_episodes.loc[idx, 'updatedScore'] = own_agent['updatedScore']

        agent_scores = [a['updatedScore'] for a in agents if a['updatedScore'] is not None]
        if agent_scores:
            team_episodes.loc[idx, 'avg_score'] = np.mean(agent_scores)

    # final_score: the submission's score after its most recent episode.
    for sub_id in team_episodes['submissionId'].unique():
        sub_rows = team_episodes[team_episodes['submissionId'] == sub_id]
        end_seconds = [end['seconds'] for end in sub_rows['endTime']]
        max_time = max(end_seconds)
        final_score = max(
            score
            for score, seconds in zip(sub_rows['updatedScore'], end_seconds)
            if seconds == max_time
        )
        team_episodes.loc[sub_rows.index, 'final_score'] = final_score

    team_episodes.sort_values('avg_score', ascending=False, inplace=True)
    return rj, team_episodes

In [8]:
def saveEpisode(epid, rj, timeout=60):
    """Download one episode replay and write it, plus its metadata, to disk.

    Writes two files in the current directory: ``<epid>.json`` with the
    replay string returned by GetEpisodeReplay, and ``<epid>_info.json`` with
    the matching episode entry from ``rj``.

    Args:
        epid: Id of the episode to download.
        rj: Decoded ListEpisodes response that contains this episode under
            ``rj['result']['episodes']``.
        timeout: Seconds to wait on the HTTP request before giving up.

    Raises:
        IndexError: If ``rj`` has no episode with id ``epid``.
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response lacks ``result`` / ``replay``.
    """
    # Look up the episode info first, so a missing entry fails before an API
    # call is spent and before any file is written.
    episode_info = [ep for ep in rj['result']['episodes'] if ep['id'] == epid][0]

    # request
    response = requests.post(get_url, json={"EpisodeId": int(epid)}, timeout=timeout)
    response.raise_for_status()
    # Extract the replay before opening the output file: if the response is
    # unusable, no empty <epid>.json is left behind.
    replay = response.json()['result']['replay']

    # save replay
    with open('{}.json'.format(epid), 'w', encoding='utf-8') as f:
        f.write(replay)

    # save episode info
    with open('{}_info.json'.format(epid), 'w', encoding='utf-8') as f:
        json.dump(episode_info, f)

In [9]:
# Download loop: visit teams in the order of teams_df, list each team's
# episodes, and save those not already present, until the API call budget
# (EPISODES) is used up or no unvisited team remains.

pulled_teams = {}      # team id -> episodes saved for that team in this run
pulled_episodes = []   # ids of the episodes saved in this run
start_time = datetime.datetime.now()
# Request counter used for throttling: after each saved episode the loop
# sleeps until at least `r` seconds have elapsed since start_time.
r = BUFFER

while num_api_calls_today < EPISODES:
    # pull team: first team in teams_df that has not been processed yet
    top_teams = [i for i in teams_df.id if i not in pulled_teams]
    if not top_teams:
        break
    team_id = top_teams[0]

    # get team data
    team_json, team_df = getTeamEpisodes(team_id)
    r += 1
    num_api_calls_today += 1
    print('{} games for {}'.format(len(team_df), teams_df.loc[teams_df.id == team_id].iloc[0].teamName))

    # Filter only when a threshold is set: a scalar True is not a row mask
    # (team_df[True] would be treated as a column lookup).
    if MIN_FINAL_RATING is not None:
        team_df = team_df[team_df.final_score > MIN_FINAL_RATING]

    print('   {} in score range from {} submissions'.format(len(team_df), len(team_df.submissionId.unique())))

    team_df = team_df[~team_df.id.isin(pulled_episodes + seen_episodes)]
    print('      {} remain to be downloaded\n'.format(len(team_df)))

    # pull games
    target_team_games = int(np.ceil(EPISODES / NUM_TEAMS))
    if target_team_games + len(pulled_episodes) > EPISODES:
        target_team_games = EPISODES - len(pulled_episodes)

    pulled_teams[team_id] = 0

    i = 0
    # The API budget is re-checked here as well, so a team with many episodes
    # cannot push the call count past EPISODES.
    while (i < len(team_df)
           and pulled_teams[team_id] < target_team_games
           and num_api_calls_today < EPISODES):
        epid = team_df.id.iloc[i]
        i += 1
        if epid in pulled_episodes or epid in seen_episodes:
            continue

        # Count the attempt up front: a failed request still uses API quota.
        r += 1
        num_api_calls_today += 1
        try:
            saveEpisode(epid, team_json)
        except Exception as err:
            # Best effort: report the failure, back off, move to the next
            # episode. KeyboardInterrupt is deliberately not caught.
            print('  episode {} failed: {}'.format(epid, err))
            time.sleep(20)
            continue

        pulled_episodes.append(epid)
        pulled_teams[team_id] += 1
        try:
            size = os.path.getsize('{}.json'.format(epid)) / 1e6
            print(str(num_api_calls_today) + ': Saved Episode #{} @ {:.1f}MB'.format(epid, size))
        except OSError:
            print('  file {}.json did not seem to save'.format(epid))

        # Throttle: total_seconds() keeps counting past one day, whereas
        # timedelta.seconds wraps around.
        elapsed = (datetime.datetime.now() - start_time).total_seconds()
        if r > elapsed:
            time.sleep(r - elapsed)

    print()
    print()

10 games for Sasha Korekov
   10 in score range from 1 submissions
      9 remain to be downloaded

2: Saved Episode #3630922 @ 20.9MB
3: Saved Episode #3632388 @ 20.4MB
4: Saved Episode #3626526 @ 20.6MB
5: Saved Episode #3627103 @ 21.2MB
6: Saved Episode #3627276 @ 21.1MB
7: Saved Episode #3628201 @ 20.9MB
8: Saved Episode #3629630 @ 21.0MB
9: Saved Episode #3629734 @ 21.0MB
10: Saved Episode #3632680 @ 20.7MB


3 games for whoami
   3 in score range from 1 submissions
      3 remain to be downloaded

12: Saved Episode #3631807 @ 20.8MB
13: Saved Episode #3630525 @ 20.9MB
14: Saved Episode #3629845 @ 17.7MB


6 games for Cory Hisey
   6 in score range from 1 submissions
      6 remain to be downloaded

16: Saved Episode #3631220 @ 18.6MB
17: Saved Episode #3628551 @ 20.7MB
18: Saved Episode #3629215 @ 21.0MB
19: Saved Episode #3630055 @ 20.7MB
20: Saved Episode #3630889 @ 21.1MB
21: Saved Episode #3632572 @ 20.0MB


6 games for Kha Vo
   5 in score range from 1 submissions
      4 re