# Age of Empires webcrawler for matches

### Iterative connection to the aoe2.net API

A custom crawler is required because the aoe2.net API restricts each request to 1000 matches. That is not enough data to infer meaningful statistics, particularly as the leaderboard of interest (ranked 1v1, leaderboard_id = 3) cannot be pre-filtered through the GET request.

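Investigation found that the first match after 3 May 2021 00:00 was match_id = 88735565. This crawler works by sending successive requests to the API, iterating the match_id by 1 (n.b. many match_ids are no longer valid, and so lead to bad requests). Note that the code in the cells below pages through the API by timestamp instead: each request passes a `since` value, which is then advanced to the `finished` time of the last match returned.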

In [1]:
import datetime
import json
import sys
import time
import urllib
import urllib.request

import pandas as pd
import pyodbc as db

In [2]:
# --- API endpoint and crawl window -----------------------------------------
# Base request URL; the epoch timestamp for 'since' is appended per request.
api_str = 'https://aoe2.net/api/matches?game=aoe2de&count=1000&since='
match_start_date = 1620014400  # epoch seconds: start of the window (03 May 2021)
end_match_date = 1624766400  # epoch seconds: end of the window (27 June 2021)

# --- crawl state -----------------------------------------------------------
# Timestamp passed as 'since' on the next request; kept in its own variable so
# a crawl can be continued from the last URL accessed.
current_match_date = match_start_date
count = 0  # counter used by the crawl loop for periodic saves and file rotation
json_total = []  # container for the match records that will be saved as JSON
max_allowed_invalid = 10  # max allowed consecutive connection issues

# --- output files ----------------------------------------------------------
# The index 'i' is substituted into the template to name each output file.
i = 0
save_filename_root = 'C:/Users/richa/Documents/AOE2 API Storage/matches{0}.json'
save_filename = save_filename_root.format(i)

In [None]:
# Iterate through JSON batches from the API.
#
# Each request asks for matches after `current_match_date` (the API string
# requests count=1000). After a successful request the cursor is advanced to
# the 'finished' time of the last match returned. The loop ends when the
# cursor reaches `end_match_date`, when too many consecutive requests fail,
# or when the API returns an empty batch.


def _save_matches(path, matches):
    """Write *matches* to *path* as JSON, replacing any existing file."""
    with open(path, 'w') as outfile:
        json.dump(matches, outfile)


invalid_count = 0  # current consecutive url connection errors
stopped_early = False  # True when the crawl stops before reaching end_match_date


while current_match_date < end_match_date:

    # Every 250 successful requests, switch to a new save filename.
    # The `json_total` check matters: after a rotation the container is empty,
    # so a failed request (which leaves `count` unchanged) cannot trigger
    # another rotation and leave behind an empty file and a skipped index.
    if count != 0 and count % 250 == 0 and json_total:

        # save
        _save_matches(save_filename, json_total)

        i += 1

        save_filename = save_filename_root.format(i)  # change filename
        print ('saving to new file:{0}'.format(save_filename))

        json_total = list()  # reset match container

    # get the url to access the API
    new_match_string = api_str + str(current_match_date)

    try:
        with urllib.request.urlopen(new_match_string, timeout=30) as url:
            data = json.loads(url.read().decode())

    except Exception as err:
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still stop the crawl; any network or decoding
        # failure is treated as a retryable error, as before.
        invalid_count += 1
        print('Request failed (consecutive failure {0}): {1}'.format(invalid_count, err))

        if invalid_count > max_allowed_invalid:
            print('Too many consecutive connection failures ({0}). Exiting process'.format(max_allowed_invalid))
            stopped_early = True
            break

        time.sleep(2)  # brief pause before retrying the same URL
        continue

    # if we get this far, then we have a successful connection. Reset invalid counter
    invalid_count = 0

    # An empty batch has no last match to take the next timestamp from;
    # stop instead of failing on data[-1].
    if not data:
        print('No matches returned after {0}. Stopping.'.format(current_match_date))
        stopped_early = True
        break

    json_total.extend(data)

    # advance the cursor to the finish time of the last match in this batch
    current_match_date = int(data[-1]['finished'])

    count += 1
    if count % 10 == 0:  # save every 10 requests in case of network interruptions
        print('Count: {0}. Current matches analysed: {1}. Getting matches after {2}'.format(count,len(json_total),datetime.datetime.fromtimestamp(current_match_date).strftime('%c')))

        _save_matches(save_filename, json_total)


# ensure the final records are saved into the latest filename
_save_matches(save_filename, json_total)

# only report completion when the loop ran through to end_match_date
if not stopped_early:
    print('Completed - all records retrieved. Exiting process.')