In [44]:
from collections import defaultdict
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import csv

In [45]:
# Root of the site; relative result links are prefixed with it.
BASE_URL = 'http://mzbs.pl'
# Index pages that list the Monday and Wednesday tournaments.
MONDAY_TOURNAMENTS_URL = BASE_URL + '/wyniki/41'
WEDNESDAY_TOURNAMENTS_URL = BASE_URL + '/wyniki/43'

# Regex patterns are raw strings so the backslashes reach the regex engine
# untouched (a plain '\d' is an invalid string escape and warns on newer Pythons).
# Matches the last path segment of a URL, i.e. everything after the final '/'.
LAST_SLASH_RE = r'[^/]+$'
# Matches a dash-separated run of digits such as 2018-03-12.
DATE_RE = r'\d+\-\d+\-\d+'
# File name substituted for the last path segment of a tournament link.
RESULTS_SUFIX = 'wyn.html'

In [46]:
def generate_tournament_urls(tournament_page_url, year='2018'):
    """Collect the results-page URLs linked from a tournament index page.

    Args:
        tournament_page_url: URL of the index page to scrape.
        year: Text a URL must contain to be kept. Defaults to '2018', the
            value that was previously hard-coded.

    Returns:
        A list of URLs (prefixed with BASE_URL when the link did not already
        contain it) whose last path segment has been replaced with
        RESULTS_SUFIX, limited to those containing ``year``. Empty when the
        page has no ``<ul>`` element.
    """
    # Parse inside the context manager so the HTTP response is closed as soon
    # as the page has been read instead of lingering until garbage collection.
    with urlopen(tournament_page_url) as results_page_html:
        soup = BeautifulSoup(results_page_html, 'html')

    # Links are taken from the first <ul> on the page; without one there is
    # nothing to collect.
    if soup.ul is None:
        return []

    urls = []
    # href=True skips anchors that carry no link target at all.
    for anchor in soup.ul.find_all('a', href=True):
        href = anchor['href']
        absolute = href if BASE_URL in href else BASE_URL + href
        # Swap the final path segment for the results file name.
        urls.append(re.sub(LAST_SLASH_RE, RESULTS_SUFIX, absolute))

    wanted = str(year)
    return [url for url in urls if wanted in url]

In [47]:
def has_participated(player_name, tournament_table):
    """Find the single results row that mentions a player.

    Args:
        player_name: Name to look for.
        tournament_table: Iterable of rows; each row is matched on its
            ``str()`` representation.

    Returns:
        The one row whose string form contains ``player_name``, or ``None``
        when no row or more than one row matches.
    """
    results_row = [table_row for table_row in tournament_table if player_name in str(table_row)]
    # Compare by value. `is not 1` tested object identity, which only worked
    # thanks to CPython's small-integer cache and raises a SyntaxWarning on
    # Python 3.8+.
    if len(results_row) != 1:
        return None
    return results_row[0]

def get_date(soup):
    """Return the first DATE_RE match found in the page's first ``<h4>``.

    The heading is converted to its string form before searching. Raises
    AttributeError when the pattern does not occur in it.
    """
    heading_markup = str(soup.h4)
    date_match = re.search(DATE_RE, heading_markup)
    return date_match.group(0)

def get_number_of_pairs(tournament_table):
    """Return the number of rows in the table minus one.

    NOTE(review): the subtracted row is presumably the table header - confirm
    against the page layout.
    """
    row_count = len(tournament_table)
    return row_count - 1

def get_place_in_tournament(tournament_table, results_row):
    """Return the position of ``results_row`` within ``tournament_table``.

    The position is the zero-based index of the first equal row; ValueError
    is raised when the row is not present.
    NOTE(review): this equals the finishing place only if index 0 is a header
    row and rows are listed in ranking order - confirm.
    """
    row_position = tournament_table.index(results_row)
    return row_position

def get_partner_name(row, player_name):
    """Extract the partner's name from a player's results row.

    Takes the first ``<td>`` of ``row`` whose string form mentions
    ``player_name``, joins the contents of that cell's anchor, and strips
    the ``<br/>`` markup and the player's own name from the result.
    """
    matching_cells = [cell for cell in row.findAll('td') if player_name in str(cell)]
    anchor_parts = matching_cells[0].a.contents
    combined = "".join(str(part) for part in anchor_parts)
    without_breaks = combined.replace('<br/>', '')
    return without_breaks.replace(player_name, '')

def build_entry(player_name, partner_name, place_in_tournament, number_of_pairs, date, url):
    """Assemble one result record as a dict.

    The ``place`` field is rendered as ``"<place>/<number of pairs>"``; the
    other arguments are stored unchanged under ``name``, ``partner_name``,
    ``date`` and ``url``.
    """
    place = '/'.join((str(place_in_tournament), str(number_of_pairs)))
    entry = dict(
        name=player_name,
        partner_name=partner_name,
        place=place,
        date=date,
        url=url,
    )
    return entry

In [48]:
def get_results(players, urls):
    """Scrape tournament pages and collect the entries of the given players.

    Args:
        players: Iterable of player names to look for on every page.
        urls: Iterable of tournament results-page URLs to download.

    Returns:
        A ``defaultdict(list)`` mapping each player found on at least one
        page to their entries (see ``build_entry``). Each player's list is in
        the reverse of the order in which ``urls`` were visited.
    """
    results = defaultdict(list)

    for url in urls:
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, 'html')

        # A page without a results table cannot contain any player; skip it
        # rather than failing the whole run with an AttributeError.
        if soup.table is None:
            continue

        # The rows are the same for every player, so extract them once per
        # page instead of once per (page, player) pair.
        tournament_table = soup.table.find_all('tr')

        for player_name in players:
            results_row = has_participated(player_name, tournament_table)
            if results_row is None:
                continue

            # The date is looked up only for pages a player appears on, so a
            # page without a recognisable date does not matter otherwise.
            date = get_date(soup)
            number_of_pairs = get_number_of_pairs(tournament_table)
            place_in_tournament = get_place_in_tournament(tournament_table, results_row)
            partner_name = get_partner_name(results_row, player_name)

            entry = build_entry(player_name, partner_name, place_in_tournament, number_of_pairs, date, url)
            results[player_name].append(entry)

    # Flip each player's entries relative to the order the URLs were visited.
    # NOTE(review): presumably the index pages list the newest tournaments
    # first, making the flipped order chronological - confirm.
    for entries in results.values():
        entries.reverse()

    return results

In [49]:
# Scrape both index pages (network access) for the results-page URLs that
# generate_tournament_urls keeps for 2018.
urls_monday_2018 = generate_tournament_urls(MONDAY_TOURNAMENTS_URL)
urls_wednesday_2018 = generate_tournament_urls(WEDNESDAY_TOURNAMENTS_URL)

In [50]:
# All pages to scan: the Monday URLs followed by the Wednesday URLs.
urls = [*urls_monday_2018, *urls_wednesday_2018]
# Names of the players whose results are collected. Each name is matched as a
# plain substring of a results row, so spelling must match the site exactly.
players = [
    'Lech Adamus',
    'Łukasz Baniak',
    'Anna Burda',
    'Aleksandra Byra',
    'Krzysztof Cetera',
    'Mateusz Dominik',
    'Wojciech Garncarz',
    # The comma after the next name was missing, so Python silently joined it
    # with the following string and neither player could ever be matched.
    'Błażej Krawczyk',
    'Przemysław Kurzak',
    'Albert Mosiałek',
    'Patryk Mrukot',
    'Krzysztof Piotrowski',
    'Maciej Śliwiński',
    'Piotr Wzorek',
]

In [51]:
# Download every tournament page (network access) and gather the tracked
# players' entries, keyed by player name.
total_results = get_results(players, urls)

In [52]:
# Merge the per-player entry lists into one flat list, keeping the players in
# the order they appear in total_results.
flattened_results = [
    entry
    for player_entries in total_results.values()
    for entry in player_entries
]

In [54]:
# newline='' hands line-ending control to the csv module (without it, blank
# rows appear between records on Windows). The explicit UTF-8 encoding keeps
# the Polish diacritics in player names writable whatever the platform default.
with open('results.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['name', 'partner_name', 'place', 'date', 'url']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    # One CSV row per entry dict, in the order of flattened_results.
    writer.writerows(flattened_results)