# Web Scrape MtG Decks and Store them in Dataframe using Beautiful Soup

| data             | % of total | Description |
|------------------|:----------:|:---------|
| training         | 60         | Data used to tune model parameters $w$ and $b$ in training or fitting |
| cross-validation | 20         | Data used to tune other model parameters like degree of polynomial, regularization or the architecture of a neural network.|
| test             | 20         | Data used to test the model after tuning to gauge performance on new data |

In [12]:
import pandas as pd
import numpy as np

import logging

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pyarrow as pa
import pyarrow.feather as feather

import os
import glob

import time

In [2]:
# Module-level logger; only records of level ERROR and above are emitted
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

# Chrome launch options handed to webdriver.Chrome when a driver is created
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--start-minimized')

In [3]:
# Base URL of mtgtop8.com (note the trailing slash); the relative page links
# scraped below are appended directly to this string to build full URLs.
mtg_top_8 = 'https://www.mtgtop8.com/'

In [4]:
# Mapping of format code -> URL fragments for that Magic: The Gathering format.
# Each entry holds a display name, the 'meta=<n>' query fragment (None when no
# number is listed for the format) and the 'format?f=<code>' path fragment.
formats = {
    code: {
        'name': name,
        'meta': None if meta_number is None else f'meta={meta_number}',
        'format': f'format?f={code}',
    }
    for code, name, meta_number in (
        ('ST', 'standard', 58),
        ('PI', 'pioneer', 191),
        ('MO', 'modern', 44),
        ('LE', 'legacy', 16),
        ('HI', 'historic', 215),
        ('EXP', 'explorer', 243),
        ('VI', 'vintage', 14),
        ('PAU', 'pauper', 110),
        ('cEDH', 'competetive edh', None),
    )
}

In [5]:
def printc(text, color):
    """Print *text* to stdout wrapped in the ANSI escape sequence for *color*.

    Args:
        text: Value to print; it is formatted with ``str.format`` semantics.
        color: One of 'black', 'red', 'green', 'yellow', 'blue', 'magenta',
            'cyan' or 'white'.

    Raises:
        ValueError: If *color* is not one of the supported names.
    """
    color_names = (
        'black', 'red', 'green', 'yellow',
        'blue', 'magenta', 'cyan', 'white',
    )
    # Foreground colour codes run consecutively from 30 (black) to 37 (white).
    escape_for = {
        name: f'\033[{30 + offset}m' for offset, name in enumerate(color_names)
    }

    prefix = escape_for.get(color)
    if prefix is None:
        raise ValueError(f'Invalid color: {color}')

    # '\033[0m' resets the terminal attributes after the text.
    print(f'{prefix}{text}\033[0m')

In [6]:
def gen_url(format_keys):
    """Build the mtgtop8 format-page URL for each requested format key.

    Args:
        format_keys: Iterable of keys into the module-level ``formats`` dict
            (e.g. ``['ST', 'MO']``).

    Returns:
        list[str]: One URL per recognised key, in input order. Keys that are
        not present in ``formats`` are logged and skipped.
    """
    urls = []

    for key in format_keys:
        entry = formats.get(key)
        if entry is None:
            # Previously a bare `except: pass` hid this case and then appended
            # a stale (or undefined) `url`; skip explicitly instead.
            logger.error('Unknown format key %r; no URL generated', key)
            continue

        url = mtg_top_8 + entry['format']
        # Some formats (e.g. cEDH) have no 'meta' fragment; concatenating None
        # used to raise TypeError, so the fragment is only added when present.
        if entry.get('meta') is not None:
            url += '&' + entry['meta']
        urls.append(url)

    return urls


def get_archetypes(url):
    """Collect every archetype link found on the page at *url*.

    Downloads the page with ``requests``, parses it with BeautifulSoup and
    keeps the ``href`` of each anchor whose target contains ``'archetype?'``.

    Args:
        url: Address of the page to scan.

    Returns:
        list[str]: The matching ``href`` values in document order (duplicates
        are not removed). The list is also printed to stdout.
    """
    page = requests.get(url)
    soup = bs(page.content, 'html.parser')

    # Only anchors that actually carry an href attribute are considered
    hrefs = (anchor.get('href') for anchor in soup.find_all('a', href=True))
    archetypes = [target for target in hrefs if 'archetype?' in target]

    print(f'ARCHETYPES: \n{archetypes}')
    return archetypes


def get_decks(archetypes):
    """Collect deck links from every archetype page.

    Each entry of *archetypes* is appended to the site base URL, the resulting
    page is fetched and parsed, and every anchor ``href`` containing ``'d='``
    is recorded.

    Args:
        archetypes: Iterable of relative archetype links.

    Returns:
        list[str]: Matching ``href`` values across all pages, in visit order
        (duplicates are not removed). The list is also printed to stdout.
    """
    decks = []

    for archetype in archetypes:
        page = requests.get(mtg_top_8 + archetype)
        soup = bs(page.content, 'html.parser')

        hrefs = (anchor.get('href') for anchor in soup.find_all('a', href=True))
        decks.extend(target for target in hrefs if 'd=' in target)

    print('DECKS: \n')
    print(decks)
    return decks


def get_decklists(decks):
    """Retrieve the decklist text for every deck link with a Chrome webdriver.

    For each link the MTGA ("standard") export is tried first, since that is
    the text this function has always stored. Only when that export fails is
    the MTGO download used as a fallback, so a fallback entry is in MTGO
    export text rather than MTGA text. Previously the MTGO deck was fetched
    for every link and then unconditionally overwritten.

    Progress is checkpointed to ``decklists.feather`` every 250 stored decks
    and once more after the loop, so the trailing decks are not lost.

    Args:
        decks: Iterable of relative deck links (appended to ``mtg_top_8``).

    Returns:
        list[str]: The decklist texts that could be retrieved, in visit order.
    """
    checkpoint_every = 250
    checkpoint_path = 'decklists.feather'

    def write_checkpoint(rows):
        # The file is rewritten from scratch with everything collected so far
        feather.write_feather(pa.Table.from_pydict({'decks': rows}), checkpoint_path)

    decklists = []

    # Create a new instance of the Chrome driver
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.minimize_window()

        for d in decks:
            url = mtg_top_8 + d
            deck = None

            try:
                deck = get_standard_deck_from_webpage(url, driver)
            except Exception as e:
                logger.error(f'An error occurred while retrieving the standard deck: {e}')

            # The standard retrieval may also signal failure by returning None
            if deck is None:
                try:
                    deck = get_mtgo_deck_from_webpage(url, driver)
                except Exception as e:
                    logger.error(f'An error occured while retrieving the mtgo deck: {e}')
                    continue

            if deck is None:
                continue

            decklists.append(deck)

            # Write a feather file for every `checkpoint_every` decks stored
            if len(decklists) % checkpoint_every == 0:
                write_checkpoint(decklists)

        # Persist the remainder that did not land on a checkpoint boundary
        if decklists and len(decklists) % checkpoint_every != 0:
            write_checkpoint(decklists)
    finally:
        # Always close the Selenium driver, even if the loop is interrupted
        driver.quit()

    return decklists


def get_standard_deck_from_webpage(url, driver):
    """Return the MTGA export text of the deck shown at *url*.

    Opens the page, clicks the link inside the ``MTGA_button_div`` element,
    waits for the ``MTGAdecklistEN`` textarea to contain text and returns that
    text.

    Args:
        url: Full address of the deck page.
        driver: Selenium webdriver used to load and interact with the page.

    Returns:
        str | None: The decklist text, or ``None`` (after printing an error)
        when the textarea cannot be found in the loaded page.

    Raises:
        selenium.common.exceptions.TimeoutException: If the button or the
            populated textarea does not appear within 10 seconds.
    """
    # Navigate to the desired URL
    driver.get(url)

    # Find the div that holds the export link and click the link inside it
    wait = WebDriverWait(driver, timeout=10, poll_frequency=0.1)
    div = wait.until(EC.element_to_be_clickable((By.ID, 'MTGA_button_div')))
    div.find_element(By.TAG_NAME, 'a').click()

    def decklist_has_text(drv):
        # EC.text_to_be_present_in_element requires a specific text argument
        # (omitting it raised TypeError on every call), and here any non-empty
        # content counts, so a custom condition is used instead.
        # WebDriverWait ignores NoSuchElementException while polling.
        textarea = drv.find_element(By.ID, 'MTGAdecklistEN')
        return bool((textarea.get_attribute('textContent') or '').strip())

    # Wait for the dynamic content to load into the textarea
    wait.until(decklist_has_text)

    # Parse the page as it stands after the dynamic content has loaded
    soup = bs(driver.page_source, 'html.parser')
    textarea = soup.find('textarea', {'id': 'MTGAdecklistEN'}, string=True)

    if textarea is None:
        logger.error(f'Failed to retrieve text data from the page {url}')
        printc("ERROR: FAILED TO GET DECK!!", 'red')
        return None

    print(f'DECK RETRIEVED FROM: {url}')
    return textarea.text


def get_mtgo_deck_from_webpage(url, driver, downloads='C:/Users/Peter/Downloads',
                               download_timeout=30.0):
    """Download the MTGO export of the deck at *url* and return its text.

    Clicks the page's ``mtgo?d=`` link, waits for a new file to appear in the
    downloads directory, reads it, deletes it and returns the contents.

    Args:
        url: Full address of the deck page.
        driver: Selenium webdriver used to load and interact with the page.
        downloads: Directory the browser saves downloads to. Defaults to the
            path that was previously hard-coded.
        download_timeout: Seconds to wait for the downloaded file to appear.

    Returns:
        str: Contents of the downloaded deck file.

    Raises:
        selenium.common.exceptions.TimeoutException: If the download link does
            not become clickable within 10 seconds.
        TimeoutError: If no new file shows up within *download_timeout*.
    """
    # Navigate to the desired URL
    driver.get(url)

    # The attribute value contains '?' and '=', so it must be quoted to form a
    # valid CSS selector (the unquoted form is rejected by the browser).
    wait = WebDriverWait(driver, timeout=10, poll_frequency=0.5)
    element = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "a[href*='mtgo?d=']"))
    )

    # Snapshot the directory BEFORE clicking; taking it afterwards races with
    # a fast download and could leave the wait below spinning forever.
    files_before = set(os.listdir(downloads))

    # Click on the element to start the download
    element.click()

    # Chrome keeps in-progress downloads under a temporary suffix; ignore them
    partial_suffixes = ('.crdownload', '.tmp')
    deadline = time.monotonic() + download_timeout

    while True:
        new_files = [
            name
            for name in set(os.listdir(downloads)) - files_before
            if not name.endswith(partial_suffixes)
        ]
        if new_files:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f'No downloaded deck file appeared in {downloads} '
                f'within {download_timeout} seconds for {url}'
            )
        # Wait briefly before checking again
        time.sleep(0.1)

    # Only files created since the click are candidates; take the newest
    deck_path = max(
        (os.path.join(downloads, name) for name in new_files),
        key=os.path.getmtime,
    )

    # Read the file's contents into a string
    with open(deck_path, 'r') as f:
        deck = f.read()

    # Delete the file so the downloads directory does not fill up
    os.remove(deck_path)

    print(f'DECK RETRIEVED FROM: {url}')
    return deck



In [None]:
# Build the format URLs for every format except the first one in `formats`.
# dict.keys() returns a view that cannot be sliced, so materialise the keys
# as a list first.
format_urls = gen_url(list(formats)[1:])
# format_urls = gen_url(['ST'])

# enumerate keeps the progress counter advancing; assigning `run = 1` inside
# the loop reset it on every iteration, so it always showed [1/N].
for run, url in enumerate(format_urls, start=1):
    print('Scraping decklists from: ' + url + f' [{run}/{len(format_urls)}]')

    # De-duplicate the links at each stage before following them
    archetype_links = list(set(get_archetypes(url)))
    deck_links = list(set(get_decks(archetype_links)))
    # NOTE(review): `decklists` is overwritten on every iteration, so only the
    # last format's decks remain in this variable after the loop -- confirm
    # whether results should be accumulated across formats.
    decklists = list(set(get_decklists(deck_links)))

In [10]:
# Load the decklists checkpointed to 'decklists.feather' by get_decklists
# back into a DataFrame; the bare `df` expression makes the notebook render it.
df = feather.read_feather('decklists.feather')
df

Unnamed: 0,decks
0,"1 Shivan Devastator (DMU) 143\n2 Squee, Dubiou..."
1,"2 Squee, Dubious Monarch (DMU) 146\n3 Feldon, ..."
2,"1 Shivan Devastator (DMU) 143\n2 Squee, Dubiou..."
3,"1 Shivan Devastator (DMU) 143\n3 Feldon, Ronom..."
4,"1 Chandra, Dressed to Kill (VOW) 149\n4 Mechan..."
...,...
2245,1 Opt (XLN) 65\n1 Metallic Rebuke (KLR) 56\n2 ...
2246,1 Metallic Rebuke (KLR) 56\n2 Mox Amber (DAR) ...
2247,1 Metallic Rebuke (KLR) 56\n1 Opt (XLN) 65\n2 ...
2248,12 Navigator's Compass (DAR) 225\n2 Aetherflux...
