# Web Scrape MtG Decks and Store them in Dataframe using Beautiful Soup

In [1]:
import pandas as pd
import numpy as np

import logging
import sys
from IPython.display import display, HTML

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pyarrow as pa
import pyarrow.feather as feather

import os
import glob

import time

In [None]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--start-minimized')

# Define a custom log handler that writes messages to the notebook output
class NotebookLogHandler(logging.Handler):
    def emit(self, record):
        message = self.format(record)
        display(HTML(f'<p style="color: {record.levelname.lower()}">{message}</p>'))

# Create a logger and set its level to INFO
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Create a formatter and add it to the logger
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = NotebookLogHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)

In [3]:
# Set the URL of mtgtop8.com to a variable
mtg_top_8 = 'https://www.mtgtop8.com/'

In [4]:
# Define a list of Magic: The Gathering deck archetype formats
formats = {
    'ST': {'name': 'standard', 'meta': 'meta=58', 'format': 'format?f=ST'},
    'PI': {'name': 'pioneer', 'meta': 'meta=191', 'format': 'format?f=PI'},
    'MO': {'name': 'modern', 'meta': 'meta=44', 'format': 'format?f=MO'},
    'LE': {'name': 'legacy', 'meta': 'meta=16', 'format': 'format?f=LE'},
    'HI': {'name': 'historic', 'meta': 'meta=215', 'format': 'format?f=HI'},
    'EXP': {'name': 'explorer', 'meta': 'meta=243', 'format': 'format?f=EXP'},
    'VI': {'name': 'vintage', 'meta': 'meta=14', 'format': 'format?f=VI'},
    'PAU': {'name': 'pauper', 'meta': 'meta=110', 'format': 'format?f=PAU'},
    'cEDH': {'name': 'competetive edh', 'meta': None, 'format': 'format?f=cEDH'},
}

In [5]:
def printc(text, color):
    colors = {
        'black': '\033[30m',
        'red': '\033[31m',
        'green': '\033[32m',
        'yellow': '\033[33m',
        'blue': '\033[34m',
        'magenta': '\033[35m',
        'cyan': '\033[36m',
        'white': '\033[37m',
    }
    if color not in colors:
        raise ValueError('Invalid color: {}'.format(color))
    print('{}{}{}'.format(colors[color], text, '\033[0m'))

In [6]:
def gen_url(format_keys):
    urls = []

    for f in format_keys:
        try:
            url = mtg_top_8 + formats[f]['format'] + '&' + formats[f]['meta']
            urls.append(url)
        except:
            pass       
    return urls


def get_archetypes(url):
    archetypes = []

    response = requests.get(url)

    # Parse the HTML content of the response and create a BeautifulSoup object
    soup = bs(response.content, 'html.parser')

    # Find all hyperlinks in the parsed HTML that contain an 'href' attribute
    for link in soup.find_all('a', href=True):

        # Retrieve the value of the 'href' attribute for each hyperlink
        href = link.get('href')
        
        # Check if the string 'archetype?' is present in the value of the 'href' attribute
        if 'archetype?' in href:
            archetypes.append(href)

    print(f'ARCHETYPES: \n{archetypes}')
    return archetypes


def get_decks(archetypes):
    decks = []
    # Once we have all the archetypes in a format, we can get a link to all the decks in that archetype
    for a in archetypes:
        url = mtg_top_8 + a

        response = requests.get(url)

        soup = bs(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):

            href = link.get('href')

            if 'd=' in href:
                decks.append(href)

    print('DECKS: \n'); print(decks)
    return decks


def get_decklists(decks):
    # Create a new instance of the Chrome driver
    driver = webdriver.Chrome(options=chrome_options)
    driver.minimize_window()

    decklists = []
    for d in decks:
        url = mtg_top_8 + d
        try:
            logger.info(f"Processing {url}")
            deck = get_mtgo_deck_from_webpage(url, driver)
            decklists.append(deck)
        except Exception as e:
            logger.error(f"Failed to load deck from {url}: {e}")
            continue

        logger.info(f"Download complete: {url}")

    # Write a feather file of decklists
    feather.write_feather(pa.Table.from_pydict({'decks': decklists}), f'decklist_vintage.feather')
        
    # Close the Selenium driver
    driver.quit()

    logger.info("Scraping complete")

    return decklists


def get_mtgo_deck_from_webpage(url, driver):
    # Navigate to the desired URL
    driver.get(url)

    # Find the element by its href attribute containing "mtgo?d="
    wait = WebDriverWait(driver, timeout=10, poll_frequency=0.5)
    element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="mtgo?d="]')))

    # Navigate to the directory where the files are stored
    downloads = 'C:/Users/Peter/Downloads'

    # Get the initial number of files in the directory
    init_num_files = len(os.listdir(downloads))

    # Click on the element triggering the download of the text file
    element.click()

    # Start an infinite loop
    while True:
        # Check the current number of files in the directory
        num_files = len(os.listdir(downloads))
        
        # If a new file has been added, break the loop
        if num_files > init_num_files:
            break
        
        # Wait for a certain amount of time before checking again
        time.sleep(0.05)
    
    time.sleep(1)

    # Use glob to get a list of file paths in the directory sorted by modification time
    file_paths = sorted(glob.glob(os.path.join(downloads, '*')), key=os.path.getmtime)

    # Get the path of the most recently modified file
    most_recent_file_path = file_paths[-1]
    logger.info(f'{most_recent_file_path}')
    
    while True:
        try:
            # Try to open the file in "append" mode
            with open(most_recent_file_path, "a"):
                pass  # If the open operation succeeds, the file is ready
        except:
            # If the open operation fails, the file is still being written to
            time.sleep(0.05)  # Wait for a short time before checking again
        else:
            break  # If the open operation succeeds, break out of the loop

    # Open the file and read its contents into a string
    with open(most_recent_file_path, 'r') as f:
        deck = f.read()

    #Delete the file
    try:
        os.remove(most_recent_file_path)
    except FileNotFoundError:
        logger.error(f"File {most_recent_file_path} not found")

    # Wait for the file to be deleted
    while os.path.exists(most_recent_file_path):
        time.sleep(0.05)

    logger.info(f'DECK RETRIEVED FROM: {url}')
    try:
        return deck
    except:
        printc("ERROR: FAILED TO GET DECK!!", 'red')



In [None]:
# format_urls, format_keys = gen_url(list(formats.keys())[1:]), list(formats.keys())    'cEDH'
format_urls = gen_url(['PAU',])

for url in format_urls:
    print('Scraping decklists from: ' + url)

    archetype_links = list(set(get_archetypes(url)))
    deck_links = list(set(get_decks(archetype_links)))
    decklists = list(set(get_decklists(deck_links)))

In [None]:
# Test the function for a given url
driver = webdriver.Chrome()
url = 'https://www.mtgtop8.com/event?e=37982&d=484143&f=MO'

deck = get_mtgo_deck_from_webpage(url, driver)
len(deck)

## Once the decklists have all been created, join them together 

###### The list of decks needs to be cleaned:
###### '\n' needs to be replaced with ', '

In [None]:
# Get the absolute path of the current working directory
current_dir = os.path.abspath(os.getcwd())

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Read our card database
file_path = os.path.join(parent_dir, 'data', 'decklists')

# Get a list of all feather files in the directory
feather_files = [os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.feather')]

# Load each feather file into a Pandas DataFrame and concatenate them together
dfs = []
for file in feather_files:
    df = pd.read_feather(file)
    dfs.append(df)
decks_df = pd.concat(dfs)

def clean_deck(row):
    row = str(row).split('Sideboard')[0].split('\n')
    return row

decks_df = decks_df.copy()
decks_df['decks'] = decks_df['decks'].apply(lambda row: clean_deck(row))

# Save the combined DataFrame as a feather file
feather.write_feather(decks_df, os.path.join(parent_dir, 'data', 'decks.feather'))

decks_df.iloc[2][0]

['3 Atraxa, Grand Unifier',
 '4 Fable of the Mirror-Breaker',
 "2 Courier's Briefcase",
 "4 Esika's Chariot",
 '1 Tear Asunder',
 "2 Sheoldred's Edict",
 '4 Fatal Push',
 '4 Thoughtseize',
 '4 Transmogrify',
 '4 Careful Cultivation',
 "2 Liliana, Death's Majesty",
 '2 Ob Nixilis, the Adversary',
 '1 Cragcrown Pathway',
 '1 Boseiju, Who Endures',
 '1 Forest',
 '1 Stomping Ground',
 '1 Sokenzan, Crucible of Defiance',
 '1 Swamp',
 '2 Blightstep Pathway',
 '2 Overgrown Tomb',
 '3 Blackcleave Cliffs',
 '3 Blood Crypt',
 '4 Blooming Marsh',
 "4 Ziatora's Proving Ground",
 '']