In [1]:
import json
import logging
import os
import time

import requests

import params

In [2]:
if not (os.path.exists(params.SAVE_DIR) and os.path.isdir(params.SAVE_DIR)):
    os.mkdir(params.SAVE_DIR)

logging.basicConfig(
    filename=params.LOG_FILENAME,
    filemode="a",
    format="%(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,  # set to logging.DEBUG if you're a masochist
)
logger = logging.getLogger("Pokemon Showdown")

In [16]:
def request_battle_json(url):
    
    """
    makes GET request for url and returns python dictionary with log data

    returns None if we get a bad response or can't decode log data
    """
    battle_info = None

    # log request
    logger.info(f"Requesting data from {url}")

    # make request
    try:
        battle_info = requests.get(url)
        battle_info.raise_for_status()  # Raise an exception for bad responses (e.g., 404)
        battle_info = battle_info.text.splitlines() # Split the response content into lines
 
    except Exception as e:
        logger.error(f"Received exception during `requests.get()` for battle data: {e}")

    # check for errors
    if battle_info is None:
        logger.error(f"Received null response for {url}")
    
    return battle_info

In [4]:
def get_replay_url(format, page):
    return (
        f"https://replay.pokemonshowdown.com/search.json?format={format}&page={page}"
    )

In [11]:
def get_battle_url(format, battle_id):
    return (f"https://replay.pokemonshowdown.com/{format}-{battle_id}.log")

In [6]:
def process_recent_replays(url):
    """
    makes GET request for url and returns python dictionary with replay IDs, among other data

    returns None if we get a bad response or can't decode JSON data
    """

    result = None

    # log request
    logger.info(f"Requesting data from {url}")

    # make request
    try:
        result = requests.get(url)
    except Exception as e:
        logger.error(f"Received exception during `requests.get()`: {e}")

    # check for errors
    if result is None:
        logger.error(f"Received null response for {url}")
    elif result.status_code != 200:
        logger.error(f"Received http status code {result.status_code} for {url}")
        result = None

    if result is not None:
        # try to convert to JSON
        try:
            result = result.json()
            battle_ids = []
            for i in range(len(result)):
                battle_id = result[i]["id"].split("-")[-1]
                battle_ids.append(str(battle_id))
        except json.JSONDecodeError:
            logger.error(f"Could not parse request as JSON for {url}")
            result = None
            battle_ids = None

    return battle_ids

In [7]:
def write_data(current_batch):
    path = os.path.join(params.SAVE_DIR, f"battles-{time.time()}.log")

    with open(path, "w", encoding="utf-8") as f:
        json.dump(current_batch, f, ensure_ascii=False)

In [18]:
def main():
    ids_seen = {format_: set() for format_ in params.FORMATS}
    ids_seen_old = {format_: set() for format_ in params.FORMATS}
    
    current_batch = {format_: {} for format_ in params.FORMATS}
    page = -1
    format_ix = 0
    next_page_exists = True
    should_cycle_ids = False
    
    page = -1
    
    while True:
        page +=1
        if page > 25 or not next_page_exists:
            page = 0
            format_ix = (format_ix + 1) % len(params.FORMATS)

            if should_cycle_ids and format_ix == 0:
                # reset ID trackers
                ids_seen_old = ids_seen
                ids_seen = {format_: set() for format_ in params.FORMATS}
                should_cycle_ids = False
                
        format_ = params.FORMATS[format_ix]

        logger.info(
            "Battles in memory: "
            + str(sum([len(current_batch[format_]) for format_ in params.FORMATS]))
        )
        
        recent_battles = process_recent_replays(get_replay_url(format_,page))
        if recent_battles is None:
            time.sleep(1)
            continue
            
        last_batch_time = time.time()
        next_page_exists = len(recent_battles) == 51
        
        
        # loop through battles
        num_skipped = 0
        for i in (recent_battles):
            battle_data = request_battle_json(get_battle_url(format_, i))
            
            write_data(battle_data)
        
            #if params.DEBUG:
                #break

        #if params.DEBUG:
            #write_data(current_batch)
            #break
            
        num_battles = sum([len(current_batch[format_]) for format_ in params.FORMATS])
        if num_battles >= params.BATCH_SIZE:
            logger.info(f"saving {num_battles} battles to disk")
            should_cycle_ids = True
        
        logger.debug("Got data for batch. Waiting until it's been 1 minute...")
        logger.info("")
        while time.time() - last_batch_time < 60 and num_skipped / 50 < 0.5:
            time.sleep(1)

In [19]:
if __name__ == "__main__":
    main()

TypeError: Object of type Response is not JSON serializable