In [7]:
# Magic: The Gathering constants.
# Card type names; each one is appended to 'sorted-by-' to recognise the
# per-type sections of a deck, and the list order is the order in which
# those sections are read.
CardTypes = [
    'creature',
    'planeswalker',
    'sorcery',
    'instant',
    'enchantment',
    'land',
    'artifact',
]

In [8]:
from bs4 import BeautifulSoup
import requests

def get_soup(request_url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup document.

    Parameters
    ----------
    request_url : str
        Address of the page to fetch with an HTTP GET request.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        server would block the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were the requested content.
    requests.RequestException
        For connection problems and timeouts.
    """
    response = requests.get(request_url, timeout=timeout)
    # Fail loudly on HTTP errors rather than scraping an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, features="lxml")

In [9]:
# Takes a content bean holding a single deck and parses it into its list of cards.
def read_deck(cb, include_name = False):
    """Parse one deck container into a list of ``(card_name, card_count)`` pairs.

    Parameters
    ----------
    cb : BeautifulSoup element
        Container of a single deck. It is searched for ``<div>`` elements
        whose class list contains ``'sorted-by-<type>'`` for each type in
        ``CardTypes``; inside those, ``<span>`` elements with class
        ``'card-count'`` and ``'card-name'`` provide the data.
    include_name : bool, optional
        When true, also return the first child of the container's first
        ``<h4>`` element as the deck name.

    Returns
    -------
    list or tuple
        ``deck`` when ``include_name`` is false, otherwise ``(name, deck)``.
        ``deck`` lists the pairs type by type, following the order of
        ``CardTypes``. Counts are returned as found in the page (not
        converted to ``int``). ``name`` is ``None`` when the container has
        no ``<h4>`` or the ``<h4>`` is empty.
    """
    # Map each card type to the <div> that holds its cards. If several divs
    # match the same type, the last one found wins.
    type_divs = dict()
    for div in cb.find_all('div'):
        div_classes = div.get('class') or []
        for ctype in CardTypes:
            if ('sorted-by-' + ctype) in div_classes:
                type_divs[ctype] = div

    def cards_of_type(ctype):
        """Return the (name, count) pairs listed under one card type."""
        if ctype not in type_divs:
            return []
        spans = type_divs[ctype].find_all('span')
        # .get() instead of span['class']: a <span> without a class attribute
        # must be skipped, not raise KeyError.
        number_of_cards = [span.contents[0] for span in spans
                           if 'card-count' in (span.get('class') or [])]
        # The card name is the first child of the span's first child.
        name_of_cards = [span.contents[0].contents[0] for span in spans
                         if 'card-name' in (span.get('class') or [])]
        # Names and counts are paired by position.
        return list(zip(name_of_cards, number_of_cards))

    deck = []
    for ctype in CardTypes:
        deck.extend(cards_of_type(ctype))

    if include_name:
        # Only look for the heading when the caller asked for it, so a deck
        # without an <h4> can still be parsed.
        heading = cb.find('h4')
        if heading is not None and heading.contents:
            name = heading.contents[0]
        else:
            name = None
        return (name, deck)
    return deck

In [10]:
def decks_in_url(url):
    """Fetch ``url`` and parse every deck container found on the page.

    A deck container is a ``<div>`` whose class list is exactly
    ``['content', 'beanSpacing']`` (same entries, same order). Divs without
    a class attribute are ignored.

    Returns a list with one ``read_deck`` result per container, in document
    order.
    """
    page = get_soup(url)
    wanted_classes = ['content', 'beanSpacing']

    # .get() yields None for divs lacking a class attribute, which never
    # equals the wanted list, so those divs are simply filtered out.
    deck_divs = [div for div in page.find_all('div')
                 if div.get('class') == wanted_classes]

    return [read_deck(deck_div) for deck_div in deck_divs]

In [11]:
def create_deck_db(deck_list):
    """Turn decks given as (card, quantity) pairs into count vectors.

    Parameters
    ----------
    deck_list : list
        Decks, each a list of entries whose item ``0`` is the card and item
        ``1`` is its quantity (anything ``int()`` accepts).

    Returns
    -------
    tuple
        ``(cards_index, vectors)``. ``cards_index`` lists every distinct
        card once, in order of first appearance across ``deck_list``, so the
        result is the same on every run. ``vectors[i][j]`` is the quantity
        of ``cards_index[j]`` in ``deck_list[i]``, or ``0`` when absent.
    """
    # dict.fromkeys removes duplicates while keeping first-appearance order
    # (dicts preserve insertion order); a set would give an order that can
    # change from one interpreter run to the next.
    cards_index = list(dict.fromkeys(entry[0] for deck in deck_list for entry in deck))

    # Constant-time card -> column lookup instead of a list.index() scan.
    column_of = {card: column for column, card in enumerate(cards_index)}

    # Takes a deck given as a list of pairs (card, quantity) and returns an
    # array indexed like cards_index.
    def deck_in_pairs_into_array(deck):
        counts = [0] * len(cards_index)
        for entry in deck:
            # If a card is listed more than once, the last quantity wins.
            counts[column_of[entry[0]]] = int(entry[1])
        return counts

    return cards_index, [deck_in_pairs_into_array(deck) for deck in deck_list]

In [12]:
import time
import json 

# Page whose 'Modern Decks' links point to the decklist pages to scrape.
mythic_url = 'https://magic.wizards.com/en/events/coverage/2019MC2/all-mythic-championship-ii-modern-decklists-2019-04-26'
mythic_soup = get_soup(mythic_url)

# Keep the href of every anchor whose first child contains 'Modern Decks'.
# Anchors with no children or no href are skipped: indexing them directly
# would raise IndexError / KeyError and abort the whole scrape.
decklists_urls = [
    link['href']
    for link in mythic_soup.find_all('a')
    if link.contents and link.get('href') and 'Modern Decks' in link.contents[0]
]

# Seconds to wait before each decklist request.
REQUEST_DELAY_SECONDS = 10

mythic_decks = []
for url in decklists_urls:
    time.sleep(REQUEST_DELAY_SECONDS)
    mythic_decks.extend(decks_in_url(url))

# Build the card index and the per-deck count vectors, then save both as JSON.
cards_index, deck_list = create_deck_db(mythic_decks)
db = { 'cards_index' : cards_index, 'deck_list' : deck_list }

with open('london_db.json', 'w') as file:
    json.dump(db, file)