# Web Scrape MtG Decks and Store Them in a DataFrame Using Beautiful Soup

| data             | % of total | Description |
|------------------|:----------:|:---------|
| training         | 60         | Data used to tune model parameters $w$ and $b$ in training or fitting |
| cross-validation | 20         | Data used to tune other model parameters like degree of polynomial, regularization or the architecture of a neural network.|
| test             | 20         | Data used to test the model after tuning to gauge performance on new data |

In [38]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [3]:
# Base URL of mtgtop8.com (note the trailing slash); the relative page
# paths built elsewhere in this notebook are appended directly to it.
mtg_top_8 = 'https://www.mtgtop8.com/'

In [4]:
# Lookup table of Magic: The Gathering formats, keyed by a short format code.
# Each entry holds:
#   'name'   - human-readable format name
#   'meta'   - 'meta=<id>' query fragment, or None when no meta id is listed
#              (currently only cEDH)
#   'format' - 'format?f=<code>' page path that is appended to the base URL
formats = {
    'ST': {'name': 'standard', 'meta': 'meta=58', 'format': 'format?f=ST'},
    'PI': {'name': 'pioneer', 'meta': 'meta=191', 'format': 'format?f=PI'},
    'MO': {'name': 'modern', 'meta': 'meta=44', 'format': 'format?f=MO'},
    'LE': {'name': 'legacy', 'meta': 'meta=16', 'format': 'format?f=LE'},
    'HI': {'name': 'historic', 'meta': 'meta=215', 'format': 'format?f=HI'},
    'EXP': {'name': 'explorer', 'meta': 'meta=243', 'format': 'format?f=EXP'},
    'VI': {'name': 'vintage', 'meta': 'meta=14', 'format': 'format?f=VI'},
    'PAU': {'name': 'pauper', 'meta': 'meta=110', 'format': 'format?f=PAU'},
    'cEDH': {'name': 'competetive edh', 'meta': None, 'format': 'format?f=cEDH'},
}

In [5]:
def gen_url(format_keys):
    """Build the mtgtop8 format-page URL for each requested format key.

    Args:
        format_keys: Iterable of keys into the module-level ``formats``
            dict (e.g. ``['ST', 'MO']``).

    Returns:
        A list of URL strings, one per recognised key, in input order.
        Keys that are not present in ``formats`` are skipped with a
        printed notice.
    """
    urls = []

    for f in format_keys:
        entry = formats.get(f)
        if entry is None:
            # Unknown key: skip it instead of silently re-appending the URL
            # left over from the previous iteration.
            print(f'Skipping unknown format key: {f!r}')
            continue

        url = mtg_top_8 + entry['format']
        # Entries such as cEDH carry meta=None; leave the meta fragment off
        # rather than failing on str + None.
        if entry.get('meta') is not None:
            url += '&' + entry['meta']
        urls.append(url)
    return urls

def get_archetypes(url):
    """Fetch a format page and collect its archetype links.

    Args:
        url: URL of the page to download.

    Returns:
        A list of the ``href`` values (in document order) of every ``<a>``
        tag whose href contains the substring ``'archetype?'``.
    """
    page = requests.get(url)
    parsed = bs(page.content, 'html.parser')

    # Only anchors that actually carry an href are considered; keep those
    # pointing at an archetype page.
    return [
        anchor.get('href')
        for anchor in parsed.find_all('a', href=True)
        if 'archetype?' in anchor.get('href')
    ]

def get_decks(archetypes, url):
    """Collect deck links from every archetype page of a format.

    Args:
        archetypes: Iterable of archetype hrefs as returned by
            ``get_archetypes`` (resolved against ``url``).
        url: URL of the format page the archetype hrefs were scraped from.

    Returns:
        A list of the ``href`` values of every ``<a>`` tag, across all
        archetype pages, whose href contains the substring ``'d='``.
    """
    from urllib.parse import urljoin

    decks = []
    # Once we have all the archetypes in a format, we can get a link to all
    # the decks in that archetype.
    for a in archetypes:
        # Resolve each href against the format page. Use a fresh variable so
        # the URL does not keep growing from one iteration to the next.
        archetype_url = urljoin(url, a)

        response = requests.get(archetype_url)
        soup = bs(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if 'd=' in href:
                decks.append(href)

    return decks

def get_decklist(decks):
    """Download each deck page and extract its decklist text.

    Args:
        decks: Iterable of deck hrefs; each is appended to the module-level
            ``mtg_top_8`` base URL to form the page address.

    Returns:
        A list with the text content of the ``<div id="MTGAdecklist">``
        element of every page that has one. Pages without that element are
        skipped.
    """
    decklists = []

    for d in decks:
        response = requests.get(mtg_top_8 + d)
        soup = bs(response.content, 'html.parser')

        # find() returns None when the div is absent; check for that
        # explicitly instead of hiding every possible error behind a bare
        # except.
        container = soup.find('div', {'id': 'MTGAdecklist'})
        if container is None:
            continue
        decklists.append(container.get_text())
    return decklists

def get_deck_from_webpage(url):
    """Open a deck page in Chrome, click the MTGA button and read the deck.

    Args:
        url: URL of the deck page to load.

    Returns:
        The text of the last ``<textarea>`` whose markup contains
        ``'MTGAdecklistEN'``, or ``None`` if no such textarea is present
        after the click.

    Raises:
        selenium.common.exceptions.TimeoutException: If the
            ``MTGA_button_div`` element does not become clickable within
            10 seconds.
    """
    # Create a new instance of the Chrome driver
    driver = webdriver.Chrome()
    try:
        # Navigate to the desired URL
        driver.get(url)

        # Wait for the div that contains the link, then click the link
        wait = WebDriverWait(driver, timeout=10, poll_frequency=0.5)
        div = wait.until(EC.element_to_be_clickable((By.ID, 'MTGA_button_div')))
        div.find_element(By.TAG_NAME, 'a').click()

        # Grab the HTML after the click so dynamically added content is included
        html = driver.page_source
    finally:
        # Always shut the browser down, even when the wait or click fails,
        # so a failed scrape does not leave a Chrome process behind.
        driver.quit()

    # Parse the HTML content with BeautifulSoup
    soup = bs(html, 'html.parser')

    # Default to None so a page without the textarea yields a defined result
    # instead of an UnboundLocalError.
    deck = None
    for area in soup.find_all('textarea'):
        if 'MTGAdecklistEN' in str(area):
            deck = area.text

    return deck



In [31]:
# urls = gen_url(formats.keys())
urls = gen_url(['ST'])

# enumerate supplies the 1-based progress counter; resetting a counter
# inside the loop body would print "1/N" for every URL.
for run, url in enumerate(urls, start=1):
    print('Scraping decklists from: ' + url + f' [{run}/{len(urls)}]')

    archetype_links = get_archetypes(url)
    deck_links = get_decks(archetype_links, url)
    # NOTE(review): decklists is rebound on every iteration, so after the
    # loop it only holds the results for the last URL - confirm intended.
    decklists = get_decklist(deck_links)

# event?e=41705&d=509725&f=ST

Scraping decklists from: https://www.mtgtop8.com/format?f=ST&meta=58 [1/1]
