In [2]:
from bs4 import BeautifulSoup
import requests

def tournament_dates(start_year=1960, end_year=1990):
    """Yield a ``YYYYMM`` string for every calendar month in a span of years.

    Every month from January to December is produced for each year, in
    chronological order; no attempt is made here to skip months in which no
    tournament took place.

    Args:
        start_year: First year to generate (inclusive). Defaults to 1960.
        end_year: Year at which to stop (exclusive, as with ``range``).
            Defaults to 1990.

    Yields:
        str: Six-character date id such as ``"196001"`` (month zero-padded).
    """
    for year in range(start_year, end_year):
        for month in range(1, 13):
            yield "{}{:02}".format(year, month)
    
# Download the raw HTML of the banzuke page for every date produced by
# tournament_dates() and keep the page bodies, in order, in `banzuke_text`.
banzuke_text = []
for date in tournament_dates():
    print("Getting tournament: {}".format(date))
    url = "http://sumodb.sumogames.de/Banzuke.aspx?b={}".format(date)
    # requests waits indefinitely by default; bound the wait so one stalled
    # connection cannot hang the whole cell.
    response = requests.get(url, timeout=30)
    banzuke_text.append(response.text)

Getting tournament: 196001
Getting tournament: 196002
Getting tournament: 196003
Getting tournament: 196004
Getting tournament: 196005
Getting tournament: 196006
Getting tournament: 196007
Getting tournament: 196008
Getting tournament: 196009
Getting tournament: 196010
Getting tournament: 196011
Getting tournament: 196012
Getting tournament: 196101
Getting tournament: 196102
Getting tournament: 196103
Getting tournament: 196104
Getting tournament: 196105
Getting tournament: 196106
Getting tournament: 196107
Getting tournament: 196108
Getting tournament: 196109
Getting tournament: 196110
Getting tournament: 196111
Getting tournament: 196112
Getting tournament: 196201
Getting tournament: 196202
Getting tournament: 196203
Getting tournament: 196204
Getting tournament: 196205
Getting tournament: 196206
Getting tournament: 196207
Getting tournament: 196208
Getting tournament: 196209
Getting tournament: 196210
Getting tournament: 196211
Getting tournament: 196212
Getting tournament: 196301
G

In [3]:
import pandas as pd
import time
from collections import defaultdict

def parse_score(score):
    """Split a score cell into its numeric record and trailing marker characters.

    Any `` ↓`` / `` ↑`` suffixes are stripped first. What remains is split on
    single spaces: the first token is a dash-separated list of integers and
    the optional second token is a run of single-character markers
    (the caller names these ``prizes``).

    Args:
        score: Text of the score cell, e.g. ``"8-7"`` or ``"13-2 YG"``.

    Returns:
        A ``(numbers, markers)`` tuple where ``numbers`` is a list of ints
        (a two-number record gets a trailing ``0`` appended) and ``markers``
        is the sorted list of characters from the second token (empty when
        there is none). Returns ``None`` when the text has more than two
        space-separated tokens.

    Raises:
        ValueError: If a dash-separated piece is not an integer.
    """
    for arrow in (' ↓', ' ↑'):
        score = score.replace(arrow, '')
    tokens = score.split(' ')

    # Only "<record>" and "<record> <markers>" are recognised shapes.
    if len(tokens) not in (1, 2):
        return None

    numbers = tokens[0].split('-')
    if len(numbers) == 2:
        # Pad a two-part record with a zero third component.
        numbers.append(0)
    record = [int(n) for n in numbers]

    markers = sorted(tokens[1]) if len(tokens) == 2 else []
    return record, markers

def split_rows(rows):
    """Flatten two-sided table rows into one three-cell entry per side.

    Each input row is a list of cell texts. Two shapes are handled:

    * 5 cells: the first three cells form one entry, and the last three
      cells, reversed, form a second entry (the middle cell is shared and
      ends up last in both).
    * 4 cells: one side is a ``'\\xa0'`` placeholder. If the placeholder is
      the first cell the row is reversed; the final cell is then dropped,
      leaving three cells.

    Rows of any other length are ignored.

    Args:
        rows: Iterable of lists of cell strings. The input rows are not
            modified.

    Returns:
        list: Three-element lists, in input order (for 5-cell rows the
        leading side comes first).
    """
    flat_rows = []
    for row in rows:
        if len(row) == 5:
            flat_rows.append(row[:3])
            flat_rows.append(list(reversed(row[2:])))
        elif len(row) == 4:
            # Work on a copy: popping from `row` directly would silently
            # shorten the caller's list.
            cells = list(reversed(row)) if row[0] == '\xa0' else list(row)
            flat_rows.append(cells[:-1])

    return flat_rows

def parse_row(items):
    """Turn one ``[score, name, rank]`` triple into a record dict.

    The score text is decoded with ``parse_score``; the marker characters it
    returns are not included in the result.

    Args:
        items: Three-element sequence ``(score, name, rank)`` of strings.

    Returns:
        dict: Keys ``name``, ``wins``, ``losses``, ``absent`` and ``rank``.
    """
    score_text, name_text, rank_text = items
    record, _prizes = parse_score(score_text)
    wins, losses, absences = record
    return {
        'name': name_text,
        'wins': wins,
        'losses': losses,
        'absent': absences,
        'rank': rank_text
    }

def parse_banzuke(soup):
    """Extract per-wrestler records and bout lists from a parsed banzuke page.

    Only the first ``<table class="banzuke">`` in the document is read, and
    its first row is skipped.

    Args:
        soup: BeautifulSoup document for a banzuke page.

    Returns:
        tuple: ``(banzuke_data, bout_data)`` where ``banzuke_data`` is a list
        of dicts produced by ``parse_row`` and ``bout_data`` is the mapping
        returned by ``get_bouts`` (which performs network requests).
    """
    table = soup.find("table", {"class": "banzuke"})
    body_rows = table.find_all('tr')[1:]

    # Bouts are fetched from the tag rows before they are reduced to text.
    bout_data = get_bouts(body_rows)

    cell_texts = [[cell.text for cell in tr.find_all('td')] for tr in body_rows]
    banzuke_data = [parse_row(triple) for triple in split_rows(cell_texts)]
    return banzuke_data, bout_data

def do_get(postfix, delay=3, timeout=30):
    """Fetch a page from sumodb.sumogames.de after a fixed pause.

    Args:
        postfix: Path and query string appended to the site root, e.g.
            ``"Rikishi_basho.aspx?r=3877&b=196001"``.
        delay: Seconds to sleep before issuing the request, to space out
            successive calls. Defaults to 3.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30. Without a timeout ``requests`` waits indefinitely.

    Returns:
        requests.Response: The response object for the GET request.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    url = "http://sumodb.sumogames.de/{}".format(postfix)
    print("Getting {}".format(url))
    time.sleep(delay)
    resp = requests.get(url, timeout=timeout)
    return resp

def parse_tournament_bouts(soup):
    """Read the day-by-day bout table from a parsed wrestler/tournament page.

    Every ``<tr>`` of the first ``<table class="rb_torikumi">`` is expected to
    hold at least four ``<td>`` cells: cell 0 text whose second space-separated
    token is the day number, cell 1 containing an ``<img>`` whose ``src`` has
    the form ``<prefix>_<result><4-char extension>``, and cell 3 containing a
    link whose text has the opponent's name as its second token.

    Args:
        soup: BeautifulSoup document for the page.

    Returns:
        list: One dict per table row with keys ``day`` (int), ``result``
        (str taken from the image file name) and ``opponent`` (str).
    """
    torikumi = soup.find("table", {"class": "rb_torikumi"})

    def _bout_from_row(tr):
        cells = tr.find_all('td')
        day_number = int(cells[0].text.split(' ')[1])
        # Text after the first underscore, minus the trailing ".ext".
        outcome = cells[1].img['src'].split('_')[1][:-4]
        rival = cells[3].a.text.split(' ')[1]
        return {'day': day_number, 'result': outcome, 'opponent': rival}

    return [_bout_from_row(tr) for tr in torikumi.find_all('tr')]
    
def get_bouts(rows):
    """Download and parse the bout list behind every 'basho' link in the rows.

    For each ``<a>`` in each row whose ``href`` contains ``'basho'``, the
    linked page is fetched with ``do_get`` and parsed with
    ``parse_tournament_bouts``. Results are keyed by the first
    space-separated token of the page's ``<span class="rb_shikona">`` text;
    if two pages yield the same key, the later one replaces the earlier.

    Args:
        rows: Iterable of BeautifulSoup ``<tr>`` tags.

    Returns:
        collections.defaultdict: Maps name -> list of bout dicts. Because it
        is a ``defaultdict(list)``, looking up an unknown name yields (and
        stores) an empty list.
    """
    bouts = defaultdict(list)
    for row in rows:
        anchors = row.find_all('a')
        # Generator keeps the filtering lazy, interleaved with the fetches.
        hrefs = (a['href'] for a in anchors if 'basho' in a['href'])
        for href in hrefs:
            page = do_get(href)
            page_soup = BeautifulSoup(page.text, 'html.parser')
            heading = page_soup.find('span', {'class': 'rb_shikona'})
            name = heading.text.split(' ')[0]
            bouts[name] = parse_tournament_bouts(page_soup)
    return bouts

In [4]:
# Fetch a single banzuke page (201807) into `response`.
url = "http://sumodb.sumogames.de/Banzuke.aspx?b={}".format("201807")
# requests has no default timeout; bound the wait so the cell cannot hang.
response = requests.get(url, timeout=30)

In [6]:
import os
import pickle

# Parse every downloaded banzuke page and pickle the results, one pair of
# files (banzuke_<date>.pkl, bouts_<date>.pkl) per date.
for date, text in zip(tournament_dates(), banzuke_text):
    banzuke_path = '/Users/samuel/notebooks/sumo/data/banzuke_{}.pkl'.format(date)
    bouts_path = '/Users/samuel/notebooks/sumo/data/bouts_{}.pkl'.format(date)

    # parse_banzuke() issues one throttled HTTP request per wrestler, so a
    # network failure partway through is expensive. Skip dates whose two
    # output files are already on disk so a rerun resumes where it stopped.
    # Delete the files to force a refresh.
    if os.path.exists(banzuke_path) and os.path.exists(bouts_path):
        print("Skipping tournament {} (already saved)".format(date))
        continue

    soup = BeautifulSoup(text, 'html.parser')
    banzuke, bouts = parse_banzuke(soup)

    with open(banzuke_path, 'wb') as f:
        pickle.dump(banzuke, f)

    with open(bouts_path, 'wb') as f:
        pickle.dump(bouts, f)

Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3877&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3904&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3918&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3947&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3910&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3943&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3977&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3878&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3975&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3926&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3946&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3911&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3948&b=196001
Getting http://sumodb.sumogames.de/Rikishi_basho.aspx?r=3980&b=196001
Getting http://sumod

ConnectionError: HTTPConnectionPool(host='sumodb.sumogames.de', port=80): Max retries exceeded with url: /Rikishi_basho.aspx?r=3944&b=196205 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x118942d68>: Failed to establish a new connection: [Errno 50] Network is down',))