In [2]:
from bs4 import BeautifulSoup
from glob import glob
import pickle
import os
from pprint import pprint

from pymongo import MongoClient
# Shared MongoDB client used by the cells below; the host name 'mongo' and
# port 27017 are hard-coded here.
# NOTE(review): 'mongo' looks like a container/service host name - confirm it
# resolves in the environment this notebook is run from.
client = MongoClient('mongo', 27017)

def load_raw(path):
    """Read the HTML file at ``path`` and return it parsed as a BeautifulSoup tree.

    The file is opened in text mode with the platform's default encoding and
    no parser name is passed to ``BeautifulSoup``.
    NOTE(review): without an explicit parser the resulting tree may depend on
    which parsers are installed - consider pinning one after confirming the
    tree shape the other parsing functions expect.
    """
    with open(path, 'r') as handle:
        markup = handle.read()
    return BeautifulSoup(markup)

def parse_rikishi_ids(soup):
    """Return the set of rikishi ids linked from the banzuke table in ``soup``.

    The first ``<tr>`` of the ``table`` with class ``banzuke`` is skipped.
    Every ``<a>`` in the remaining rows contributes one id: the integer found
    between the first ``=`` of its ``href`` and the next ``&`` (if any).

    Raises ``ValueError`` if the table has no rows at all, or if an id is not
    an integer.
    """
    banzuke_table = soup.find("table", {"class": "banzuke"})
    _header, *body_rows = banzuke_table.find_all('tr')

    # Gather every href first, then convert them to ids in one pass.
    hrefs = []
    for tr in body_rows:
        for anchor in tr.find_all('a'):
            hrefs.append(anchor['href'])

    return {int(href.split('=')[1].split('&')[0]) for href in hrefs}

In [None]:
# Read every document of the `rikishi` collection and keep only its `_id`.
db = client.sumo
rikishi = db.rikishi.find({})
ids = [r['_id'] for r in rikishi]

# Write the ids through a context manager so the pickle file is flushed and
# closed immediately instead of being left to the garbage collector.
with open("data/pickles/ids.pkl", 'wb') as ids_file:
    pickle.dump(ids, ids_file)

Save the rikishi IDs to a pickle file so that later cells can reload them.

In [None]:
# Use a context manager so the file handle is closed (and the data flushed)
# as soon as the dump finishes.
with open("data/pickles/ids.pkl", 'wb') as ids_file:
    pickle.dump(ids, ids_file)

In [None]:
# NOTE: unpickling can execute arbitrary code - only load a file that this
# notebook wrote itself.
with open("data/pickles/ids.pkl", 'rb') as ids_file:
    ids = pickle.load(ids_file)

In [5]:
import os

def parse_rikishi_id(anchor):
    """Extract the integer id from the ``href`` of ``anchor``.

    The id is the text after the first ``=`` in the href, up to (but not
    including) the next ``&`` when one is present.
    """
    after_equals = anchor['href'].split('=')[1]
    id_text, _sep, _rest = after_equals.partition('&')
    return int(id_text)

def parse_score(text):
    """Parse the text of a banzuke score cell into a tuple of integers.

    Every ``'a'`` and ``'d'`` character is removed, only the part before the
    first space is kept, and that part is split on ``'-'``.
    NOTE(review): the 'a'/'d' letters presumably mark absences/draws - confirm
    against the source pages.

    Returns:
        * ``(first, second, 0)`` when exactly two fields remain, which
          includes the case where more than three fields were present
          (only the first two are kept then);
        * otherwise the fields as they are (a 1-tuple or a 3-tuple).

    Raises:
        ValueError: if a kept field is not an integer (e.g. empty text).

    Note: a later cell of this notebook defines a different
    ``parse_score(item)``; whichever cell ran last provides the name.
    """
    record = text.replace('a', '').replace('d', '').split(' ')[0]
    fields = record.split('-')

    # More than three fields: keep only the first two. The original repeated
    # this check after the int conversion, where it could never be true.
    if len(fields) > 3:
        fields = fields[:2]

    scores = tuple(map(int, fields))

    if len(scores) == 2:
        wins, losses = scores
        return wins, losses, 0
    return scores

def parse_banzuke(soup):
    """Extract one dict per rikishi from the banzuke table of ``soup``.

    Each returned dict has the keys ``'rikishi_id'``, ``'score'``,
    ``'shikona'`` and ``'rank'``. An empty list is returned when the table
    contains no ``<td nowrap="nowrap">`` score cells.

    Raises:
        AttributeError: if the page has no ``table`` with class ``banzuke``
            (``soup.find`` returns ``None``).
        AssertionError: if ranks, ids, shikona and scores do not all have the
            same length, i.e. the columns cannot be zipped safely.
    """
    # The attrs filter must be a dict. The original passed the *set*
    # {'class', 'banzuke'}, which only worked by accident and would also
    # match a table whose class is literally "class".
    table = soup.find('table', {'class': 'banzuke'})
    score_items = table.find_all('td', {'nowrap': 'nowrap'})
    shikona_items = table.find_all('td', {'class': ['shikona', 'retired', 'debut']})

    if len(score_items) == 0:
        return []

    ranks = []
    for row in table.find('tbody').find_all('tr'):
        rank = row.find('td', {'class': 'short_rank'}).text
        # A row with more than two links gets its rank recorded twice
        # (presumably it lists two rikishi sharing the rank - confirm).
        if len(row.find_all('a')) > 2:
            ranks.extend([rank, rank])
        else:
            ranks.append(rank)

    rikishi_ids = [parse_rikishi_id(item.a) for item in score_items]
    shikonas = [item.text for item in shikona_items]
    scores = [parse_score(item.text) for item in score_items]

    sizes = [len(ranks), len(rikishi_ids), len(shikonas), len(scores)]
    assert len(set(sizes)) == 1, \
        "banzuke columns are misaligned (ranks, ids, shikonas, scores): {}".format(sizes)

    return [
        {'rikishi_id': rid, 'score': score, 'shikona': shikona, 'rank': rank}
        for rid, score, shikona, rank in zip(rikishi_ids, scores, shikonas, ranks)
    ]

def parse_all_banzuke():
    """Lazily yield one document per HTML file matching ``data/html_raw/*.html``.

    Each document is ``{'_id': <int>, 'rikishi': <parse_banzuke result>}``,
    where the id is the file's base name up to its first ``.``. A file whose
    parse result is empty is still yielded.
    """
    for html_path in glob("data/html_raw/*.html"):
        stem = os.path.basename(html_path).split('.')[0]
        entries = parse_banzuke(load_raw(html_path))
        yield {'_id': int(stem), 'rikishi': entries}

# Parse every banzuke page up front. parse_all_banzuke() is a lazy generator,
# so passing it straight to insert_many() meant a parse error surfaced only
# after the existing collection had already been dropped.
banzuke_data = list(parse_all_banzuke())

db = client.sumo
db.banzuke.drop()
db.banzuke.insert_many(banzuke_data)

<pymongo.results.InsertManyResult at 0x7ff1c01fed48>

## Parse Rikishi Pages

In [4]:
def parse_hoshi(item):
    """Return the result labels encoded in the ``<img>`` file names under ``item``.

    For every image the base name of ``src`` is taken without its extension
    and split on ``'_'``; the part after the underscore is the label.

    Raises ``ValueError`` when a file name does not contain exactly one
    underscore. A missing (``None``) image list yields ``[]``.
    """
    def _label(img):
        stem, _ext = os.path.splitext(os.path.basename(img['src']))
        _prefix, label = stem.split('_')
        return label

    # `or []` keeps the original tolerance for a None result.
    return [_label(img) for img in (item.find_all('img') or [])]

def parse_prizes(item):
    """Return the prize text of ``item``, or ``[]`` when the cell is blank.

    A cell whose text is exactly one non-breaking space counts as blank.
    Note the mixed return type: an empty list for blank cells, the raw text
    string otherwise.
    """
    prize_text = item.text
    return [] if prize_text == '\xa0' else prize_text

def parse_weight(item):
    """Return the text of ``item``, or ``None`` when the cell is blank.

    A cell whose text is exactly one non-breaking space counts as blank; any
    other text is returned unchanged as a string.
    """
    weight_text = item.text
    return None if weight_text == '\xa0' else weight_text

def parse_score(item):
    """Parse the score text of ``item`` into a list of integers.

    Every character of ``item.text`` other than the digits 0-9 and ``'-'``
    is discarded; the remainder is split on ``'-'`` and each piece converted
    with ``int``. Raises ``ValueError`` when a piece is empty.

    Note: this shadows the ``parse_score(text)`` defined in the banzuke cell
    of this notebook; whichever cell ran last provides the name.
    """
    kept = [ch for ch in item.text if ch in '0123456789-']
    return [int(piece) for piece in ''.join(kept).split('-')]

def convert_row_to_entry(row):
    """Turn one table row (a sequence of ``<td>`` cells) into a history entry.

    The entry always has ``'banzuke'``: the text of the first cell with every
    ``'.'`` removed. When the remaining cells can be parsed it also has
    ``'rank'``, ``'hoshi'``, ``'score'``, ``'prizes'`` and ``'weight'``, read
    from cells 1 to 5. The extra fields are added all together or not at all.
    """
    data = {
        'banzuke': row[0].text.replace('.', '')
    }
    try:
        data.update({
            'rank': row[1].text,
            'hoshi': parse_hoshi(row[2]),
            'score': parse_score(row[3]),
            'prizes': parse_prizes(row[4]),
            'weight': parse_weight(row[5]),
        })
    except (IndexError, KeyError, ValueError):
        # Best effort: a row with fewer than six cells (IndexError), an
        # unparseable score or image name (ValueError) or an image without a
        # `src` (KeyError) keeps only its 'banzuke' field. Any other exception
        # indicates a real bug and is no longer swallowed.
        pass

    return data

def parse_rikishi(soup, rid):
    """Build the document for one rikishi page.

    Returns ``{}`` (after printing ``"Issue with <rid>"``) when the page has
    no ``table`` with class ``rikishi``. Otherwise returns a dict with:

    * ``'_id'``: ``rid`` exactly as passed in;
    * ``'history'``: one ``convert_row_to_entry`` result per table row that
      contains at least one ``<td>``;
    * one key per ``td.cat`` cell of the ``rikishidata`` table, mapped to the
      text of the ``td.val`` cell at the same position (non-breaking spaces
      removed from both). A category named ``'_id'`` or ``'history'`` would
      overwrite the field of the same name.

    Raises ``AttributeError`` when the history table exists but the
    ``rikishidata`` table does not.
    """
    # The attrs filter must be a dict. The original passed the *set*
    # {'class', 'rikishi'}, which only worked by accident.
    history_table = soup.find('table', {'class': 'rikishi'})

    if history_table is None:
        print("Issue with {}".format(rid))
        return {}

    cell_rows = [tr.find_all('td') for tr in history_table.find_all('tr')]
    # Rows without any <td> are skipped.
    entries = [convert_row_to_entry(cells) for cells in cell_rows if len(cells) > 0]

    profile_table = soup.find('table', {'class': 'rikishidata'})
    cats = [c.text.replace('\xa0', '') for c in profile_table.find_all('td', {'class': 'cat'})]
    vals = [v.text.replace('\xa0', '') for v in profile_table.find_all('td', {'class': 'val'})]

    rikishi_data = {
        '_id': rid,
        'history': entries,
    }
    rikishi_data.update(zip(cats, vals))

    return rikishi_data

def parse_all_rikishi(file_names, ids):
    """Lazily yield the parsed document for each ``(file_name, rid)`` pair.

    ``file_names`` and ``ids`` are iterated in parallel. Pages for which
    ``parse_rikishi`` returns an empty dict (it prints an "Issue with ..."
    line for those) are skipped, so that no empty document - which MongoDB
    would store under an auto-generated ``_id`` - reaches the collection.
    """
    for file_name, rid in zip(file_names, ids):
        rikishi_doc = parse_rikishi(load_raw(file_name), rid)
        if not rikishi_doc:
            continue
        yield rikishi_doc
    
file_names = glob("data/html_raw/rikishi/*.html")
# The id of a page is its file's base name up to the first '.'; a list
# (rather than a one-shot map object) can be inspected and reused.
ids = [os.path.basename(name).split('.')[0] for name in file_names]

# Parse everything before touching the database: with a lazy generator a
# parse error surfaced only after the collection had already been dropped.
rikishi_data = list(parse_all_rikishi(file_names, ids))

db = client.sumo
db.rikishi.drop()
db.rikishi.insert_many(rikishi_data)

Issue with 192701


<pymongo.results.InsertManyResult at 0x7ff1d4568e48>

## Parse Rikishi - Banzuke Results

In [6]:
def parse_history(soup, bid, rid):
    """Build the per-day result document for one rikishi in one banzuke.

    Four parallel sequences are read from the ``table`` with class
    ``rb_torikumi``: the text of the ``td.rb_day`` cells, a result label per
    ``<img>`` whose ``src`` does not contain ``'.png'`` (the part of the file
    name after the first ``'_'``, extension removed), the text of the
    ``td.rb_kim`` cells, and the opponent id from the link in each
    ``td.rb_opp`` cell. They are combined positionally with ``zip``, so the
    history is as long as the shortest of the four.

    Returns a dict with ``'_id'`` (``"<bid>_<rid>"``), ``'bid'`` and ``'rid'``
    (both converted with ``int``) and ``'history'``, a list of dicts with the
    keys ``'day'``, ``'result'``, ``'kimarite'`` and ``'opponent'``.

    Raises ``AttributeError`` when the page has no ``rb_torikumi`` table.
    """
    # The attrs filters must be dicts. The original passed *sets* such as
    # {'class', 'rb_torikumi'}, which only worked by accident.
    torikumi = soup.find('table', {'class': 'rb_torikumi'})

    # Generator expressions keep the evaluation lazy on purpose: zip() stops
    # at the shortest sequence, so cells beyond that point are never parsed
    # (and cannot raise), exactly as before.
    days = (cell.text for cell in torikumi.find_all('td', {'class': 'rb_day'}))

    results = (
        os.path.splitext(os.path.basename(img['src']))[0].split('_')[1]
        for img in torikumi.find_all('img')
        if '.png' not in img['src']
    )

    kimarite = (cell.text for cell in torikumi.find_all('td', {'class': 'rb_kim'}))

    # Cut the id at the next '&' so that hrefs carrying further query
    # parameters parse too, matching how the banzuke links are handled.
    opponents = (
        int(cell.find('a')['href'].split('=')[1].split('&')[0])
        for cell in torikumi.find_all('td', {'class': 'rb_opp'})
    )

    history = [
        {'day': day, 'result': result, 'kimarite': kim, 'opponent': opp}
        for day, result, kim, opp in zip(days, results, kimarite, opponents)
    ]

    return {'_id': '{}_{}'.format(bid, rid), 'bid': int(bid), 'rid': int(rid), 'history': history}
    
def parse_all_history(file_names, ids):
    """Lazily yield one ``parse_history`` document per ``(file_name, ids)`` pair.

    ``ids`` supplies, for each file, the sequence that is unpacked into the
    remaining arguments of ``parse_history`` (banzuke id, rikishi id). All
    pairs are materialised before the first file is read, and each file name
    is printed as it is processed. Files with an empty history are still
    yielded.
    """
    pairs = list(zip(file_names, ids))
    for html_path, id_parts in pairs:
        soup = load_raw(html_path)
        print (html_path)
        yield parse_history(soup, *id_parts)

file_names = glob("data/html_raw/rikishi_banzuke/*.html")
# Each file is named "<bid>-<rid>.html": take the base name up to the first
# '.' and split it on '-' into the two ids passed on to parse_history. A list
# (rather than chained one-shot map objects) can be inspected and reused.
ids = [os.path.basename(name).split('.')[0].split('-') for name in file_names]

# Parse everything before touching the database: with a lazy generator a
# parse error surfaced only after the collection had already been dropped.
history_data = list(parse_all_history(file_names, ids))

db = client.sumo
db.rikishi_banzuke.drop()
db.rikishi_banzuke.insert_many(history_data)

data/html_raw/rikishi_banzuke/192701-3652.html
data/html_raw/rikishi_banzuke/192701-3663.html
data/html_raw/rikishi_banzuke/192701-3666.html
data/html_raw/rikishi_banzuke/192701-3668.html
data/html_raw/rikishi_banzuke/192701-3671.html
data/html_raw/rikishi_banzuke/192701-3674.html
data/html_raw/rikishi_banzuke/192701-3677.html
data/html_raw/rikishi_banzuke/192701-3680.html
data/html_raw/rikishi_banzuke/192701-3684.html
data/html_raw/rikishi_banzuke/192701-3686.html
data/html_raw/rikishi_banzuke/192701-3687.html
data/html_raw/rikishi_banzuke/192701-3688.html
data/html_raw/rikishi_banzuke/192701-3689.html
data/html_raw/rikishi_banzuke/192701-3690.html
data/html_raw/rikishi_banzuke/192701-3691.html
data/html_raw/rikishi_banzuke/192701-3692.html
data/html_raw/rikishi_banzuke/192701-3693.html
data/html_raw/rikishi_banzuke/192701-3695.html
data/html_raw/rikishi_banzuke/192701-3696.html
data/html_raw/rikishi_banzuke/192701-3699.html
data/html_raw/rikishi_banzuke/192701-3700.html
data/html_raw

<pymongo.results.InsertManyResult at 0x7ff1b8fdf588>