In [1]:
from bs4 import BeautifulSoup
import requests
import time

def tournament_dates(start_year=2018, end_year=2019, months=range(1, 13)):
    """Yield tournament identifiers as ``YYYYMM`` strings.

    Args:
        start_year: First year to generate (inclusive).
        end_year: Year at which to stop (exclusive, as with ``range``).
        months: Iterable of month numbers to emit for every year.

    Yields:
        One ``"YYYYMM"`` string per (year, month) pair, with the month
        zero-padded to two digits, ordered by year and then by month.

    Called with no arguments this yields the twelve months of 2018,
    ``"201801"`` through ``"201812"``.
    """
    # Materialise once so a one-shot iterable can be replayed for each year.
    months = tuple(months)
    for year in range(start_year, end_year):
        for month in months:
            yield "{}{:02}".format(year, month)
    
# Download the banzuke page for every date produced by tournament_dates().
# `banzuke_text` ends up with one HTML document per requested date, in
# request order.
banzuke_text = []
for date in tournament_dates():
    print("Getting tournament: {}".format(date))
    url = "http://sumodb.sumogames.de/Banzuke.aspx?b={}".format(date)
    # requests applies no timeout by default; without one a stalled
    # connection would block this cell indefinitely.
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of storing an error page as banzuke data.
    response.raise_for_status()
    # Pause between requests to avoid hammering the server.
    time.sleep(3)
    banzuke_text.append(response.text)

Getting tournament: 201801
Getting tournament: 201802
Getting tournament: 201803
Getting tournament: 201804
Getting tournament: 201805
Getting tournament: 201806
Getting tournament: 201807
Getting tournament: 201808
Getting tournament: 201809
Getting tournament: 201810
Getting tournament: 201811
Getting tournament: 201812


In [2]:
from bs4 import BeautifulSoup

def parse_banzuke_rikishi_info(text):
    """Extract the rikishi ids and names linked from one banzuke page.

    Args:
        text: HTML source of a banzuke page.

    Returns:
        A ``(rikishi_ids, rikishi_names)`` tuple: a set of integer ids, and a
        dict mapping each id to the text of the link it was read from.

    Raises:
        ValueError: If the page contains no ``<table class="banzuke">``.
    """
    rikishi_ids = set()
    rikishi_names = {}

    soup = BeautifulSoup(text, 'html.parser')
    table = soup.find("table", {"class": "banzuke"})
    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # rather than an AttributeError on None.
        raise ValueError("no table with class 'banzuke' found in page")

    # The first row of the table is skipped.
    for row in table.find_all('tr')[1:]:
        for link in row.find_all('a'):
            # Anchors without an href would raise KeyError via link['href'].
            href = link.get('href', '')
            if 'Rikishi.aspx' not in href:
                continue

            # Take the value after the first '=' and drop any further query
            # parameters or fragment so extra URL parts do not break int().
            _, found, tail = href.partition('=')
            if not found:
                continue
            raw_id = tail.split('&', 1)[0].split('#', 1)[0]
            try:
                rikishi_id = int(raw_id)
            except ValueError:
                # Not a numeric id link; nothing to record.
                continue

            rikishi_ids.add(rikishi_id)
            rikishi_names[rikishi_id] = link.text

    return rikishi_ids, rikishi_names
 
# Merge the per-page results into notebook-wide collections: every id seen on
# any downloaded banzuke, plus an id -> name lookup (later pages overwrite
# earlier names for the same id).
rikishi_ids, rikishi_names = set(), {}
for page_html in banzuke_text:
    page_ids, page_names = parse_banzuke_rikishi_info(page_html)
    rikishi_ids |= page_ids
    rikishi_names.update(page_names)

# Last expression of the cell: display the collected ids.
rikishi_ids

{89,
 1123,
 1219,
 1226,
 1235,
 1284,
 2832,
 2879,
 4980,
 5944,
 5967,
 6463,
 6468,
 6473,
 6480,
 6491,
 6594,
 6596,
 6599,
 6614,
 6642,
 6753,
 7153,
 7240,
 8899,
 9079,
 11724,
 11728,
 11784,
 11785,
 11786,
 11845,
 11855,
 11927,
 11934,
 11946,
 11985,
 12026,
 12043,
 12051,
 12055,
 12094,
 12107,
 12130,
 12144,
 12191,
 12210,
 12239,
 12273,
 12291,
 12292}

In [3]:
def get_rikishi_history(rikishi_id):
    """Download one rikishi page and parse it into a dict.

    Args:
        rikishi_id: Value substituted into the ``Rikishi.aspx?r=`` URL.

    Returns:
        A dict holding every ``td.cat`` -> ``td.val`` text pair found on the
        page, plus two extra keys:

        * ``'history'``: list of dicts, one per table row with at least six
          ``<td>`` cells, with keys ``date`` (cell 0 text), ``rank`` (cell 1
          text), ``rank_debut`` (whether cell 1 carries the ``debut`` CSS
          class), ``score`` (cell 3 text), ``prizes`` (stripped strings of
          cell 4) and ``size`` (cell 5 text without non-breaking spaces).
          A row with an empty ``size`` inherits the most recent non-empty
          value from an earlier row, or stays ``''`` if there is none yet.
        * ``'id'``: the ``rikishi_id`` that was passed in.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        ValueError: If the page contains no ``<table class="rikishi">``.
    """
    url = "http://sumodb.sumogames.de/Rikishi.aspx?r={}".format(rikishi_id)
    print("Getting rikishi {}".format(rikishi_id))
    # requests applies no timeout by default; avoid hanging on a dead socket.
    response = requests.get(url, timeout=30)
    # Do not try to parse an HTTP error page as rikishi data.
    response.raise_for_status()
    # Pause between requests to avoid hammering the server.
    time.sleep(3)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pair the label cells with the value cells, in document order.
    cats = [cell.text for cell in soup.find_all('td', {'class': 'cat'})]
    vals = [cell.text for cell in soup.find_all('td', {'class': 'val'})]
    info = dict(zip(cats, vals))

    table = soup.find('table', {'class': 'rikishi'})
    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # rather than an AttributeError on None.
        raise ValueError(
            "no table with class 'rikishi' found for rikishi {}".format(rikishi_id)
        )

    history = []
    # Most recent non-empty size seen so far. Starting from '' means leading
    # rows without a size stay empty instead of picking up a stale value
    # from some unrelated later row.
    last_size = ''
    for table_row in table.find_all('tr'):
        cells = table_row.find_all('td')
        # Skip header rows (no <td>) and rows too short to hold all columns,
        # which would otherwise raise IndexError below.
        if len(cells) < 6:
            continue

        size = cells[5].text.replace('\xa0', '')
        if size != '':
            last_size = size
        else:
            size = last_size

        history.append({
            'date': cells[0].text,
            'rank': cells[1].text,
            'rank_debut': 'debut' in cells[1].get('class', []),
            'score': cells[3].text,
            'prizes': [s for s in cells[4].stripped_strings if s != '\xa0'],
            'size': size
        })

    info['history'] = history
    info['id'] = rikishi_id
    return info

import pickle  
# Fetch every collected rikishi, attach the name seen on the banzuke, and
# write the resulting dict to its own pickle file named after the id.
for rikishi_id in rikishi_ids:
    record = get_rikishi_history(rikishi_id)
    record['name'] = rikishi_names[record['id']]

    out_path = '/Users/samuel/notebooks/sumo/data/rikishi_{}.pkl'.format(rikishi_id)
    with open(out_path, 'wb') as out_file:
        pickle.dump(record, out_file)

Getting rikishi 12291
Getting rikishi 1284
Getting rikishi 12292
Getting rikishi 11784
Getting rikishi 11785
Getting rikishi 11786
Getting rikishi 12043
Getting rikishi 2832
Getting rikishi 12051
Getting rikishi 12055
Getting rikishi 5944
Getting rikishi 12094
Getting rikishi 6463
Getting rikishi 2879
Getting rikishi 6468
Getting rikishi 11845
Getting rikishi 7240
Getting rikishi 6473
Getting rikishi 12107
Getting rikishi 5967
Getting rikishi 6480
Getting rikishi 11855
Getting rikishi 89
Getting rikishi 6491
Getting rikishi 6753
Getting rikishi 12130
Getting rikishi 1123
Getting rikishi 12144
Getting rikishi 4980
Getting rikishi 9079
Getting rikishi 11927
Getting rikishi 11934
Getting rikishi 12191
Getting rikishi 11946
Getting rikishi 12210
Getting rikishi 6594
Getting rikishi 1219
Getting rikishi 6596
Getting rikishi 8899
Getting rikishi 6599
Getting rikishi 1226
Getting rikishi 11724
Getting rikishi 12239
Getting rikishi 11728
Getting rikishi 11985
Getting rikishi 1235
Getting rikis