In [1]:
import random
import re
import time
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Maps each supported format name (lower case) to the path of that format's
# page on mtgtop8.com; the path is appended to the site root to build the URL.
supported_formats = dict(
    standard='format?f=ST',
    pioneer='format?f=PI',
    modern='format?f=MO',
    legacy='format?f=LE',
    historic='format?f=HI',
    vintage='format?f=VI',  # low amount of data
    pauper='format?f=PAU',  # literally 1 tier list, mostly for test purposes
)

In [3]:
# Root of the site; relative paths scraped from its pages are appended to it.
url = 'https://www.mtgtop8.com/'

# Lift pandas' display limits so printed DataFrames are not truncated
# (None removes the cap on columns, rows and per-cell text width).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 0)
pd.set_option('display.max_colwidth', None)

In [4]:
def get_tournament_links(mtg_format: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Collect the tournament (event) links found on the mtgtop8.com page of a format.

    :param str mtg_format: one of the keys of ``supported_formats`` (case-insensitive)
    :param float timeout: seconds to wait for the server before the request is aborted
    :return: pd.DataFrame with a single column 'link to tournament', one unique
        link per row, in the order the links first appear on the page
    :raises ValueError: if ``mtg_format`` is not a supported format
    :raises requests.HTTPError: if the format page answers with an error status
    """
    mtg_format = mtg_format.lower()  # make input lower case
    if mtg_format not in supported_formats:
        raise ValueError(f'Format not on the list of formats: {mtg_format}')

    full_url = url + supported_formats[mtg_format]
    # Random 0-3 s pause before the request (presumably to avoid hammering the site).
    time.sleep(random.randint(0, 3))
    page = requests.get(full_url, timeout=timeout)
    # Fail loudly instead of returning an empty table scraped from an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Every anchor whose href contains "event" is treated as a tournament link.
    event_anchors = soup.find_all('a', attrs={'href': re.compile("event")})

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would shuffle the rows).
    unique_links = list(dict.fromkeys(url + anchor['href'] for anchor in event_anchors))

    return pd.DataFrame({'link to tournament': unique_links})

In [5]:
# get_tournament_links("modern")

In [6]:
def get_deck_links_tournament(tournament_url: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Given the URL of a tournament, scrape the links to all decks of that tournament.

    :param str tournament_url: link to a tournament (event) page
    :param float timeout: seconds to wait for the server before the request is aborted
    :return: pd.DataFrame with a single column 'Deck link', one unique link per
        row, in the order the links first appear on the page
    """
    # Random 0-3 s pause before the request (presumably to avoid hammering the site).
    time.sleep(random.randint(0, 3))
    page = requests.get(tournament_url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    # The deck list is located through its inline style; this is expected to
    # match a single element, but every match is processed.
    containers = soup.find_all(style='margin:0px 4px 0px 4px;')

    deck_links = []
    for container in containers:
        # href=True skips anchors without an href attribute.
        for anchor in container.find_all('a', href=True):
            # Read the attribute directly: BeautifulSoup already decodes
            # "&amp;" to "&", so no regex or entity clean-up is needed.
            href = anchor['href']
            if '?e' in href:
                # Deck hrefs are query strings ("?e=...&d=..."); prefix the
                # site root and the "event" page name to make them absolute.
                deck_links.append(url + 'event' + href)

    # De-duplicate while keeping first-seen order so results are reproducible.
    return pd.DataFrame({'Deck link': list(dict.fromkeys(deck_links))})


In [7]:
# get_deck_links_tournament("https://www.mtgtop8.com/event?e=39400&f=MO")

In [8]:
def get_deck_links_all(mtg_format: str) -> pd.DataFrame:
    """
    Get the links to the decks of every tournament listed for a given format.

    :param str mtg_format: one of the keys of ``supported_formats`` (case-insensitive)
    :return: pd.DataFrame with a single column 'Deck link'; links are listed
        tournament by tournament and are not de-duplicated across tournaments
    :raises ValueError: if ``mtg_format`` is not a supported format
    """
    mtg_format = mtg_format.lower()  # make input lower case
    if mtg_format not in supported_formats:
        raise ValueError(f'Format not on the list of formats: {mtg_format}')

    tournaments = get_tournament_links(mtg_format)

    deck_links = []
    # One request per tournament page; collect every deck link it yields.
    for tournament_url in tournaments['link to tournament']:
        deck_links.extend(get_deck_links_tournament(tournament_url)['Deck link'])

    return pd.DataFrame({'Deck link': deck_links})



In [9]:
# get_deck_links_all("legacy")

In [10]:
# def get_deck_links_all(mtg_format: str) -> pd.DataFrame:
#     """
#     Get all decks from "decks to beat" for a given format.
#     :param str mtg_format: one of the supported formats
#     :return: pd.DataFrame with the month, tier, link to the deck and other data
#     """

#     mtg_format = mtg_format.lower()  # make input lower case
#     if mtg_format not in supported_formats:
#         raise Exception('Format not on the list of formats: ', mtg_format)
#     tournament_links = get_tournament_links('link to tournament')
# #     all_decks_from_format = pd.DataFrame(columns=
# #                                          ['Year', 'Month', 'rank', 'Name', 'Author name', 'Deck link'])
# #     year = []
# #     month = []
# #     rank = []
# #     name = []
# #     author_name = []
# #     deck_link = []
#     all_decks_from_format = pd.DataFrame(columns=
#                                          ['Deck link'])
#     deck_link = []
#     for index, row in tournament_lists.iterrows():
# #         print('Adding', ': ', row['Month'], '.', row['Year'], sep='')
# #             for index2, row2 in deck_from_month.iterrows():
# #             year.append(row['Year'])
# #             month.append(row['Month'])
# #             tier.append(row2['rank'])
# #             name.append(row2['Name'])
# #             author_name.append(row2['Author name'])
#         deck_link.append(['Deck link'])

# #     all_decks_from_format['Year'] = year
# #     all_decks_from_format['Month'] = month
# #     all_decks_from_format['Rank'] = rank
# #     all_decks_from_format['Name'] = name
# #     all_decks_from_format['Author name'] = author_name
#     all_decks_from_format['Deck link'] = deck_link
#     print(all_decks_from_format)
#     return all_decks_from_format


In [11]:
def get_deck_list(deck_link: str, timeout: float = 30.0) -> Optional[str]:
    """
    Scrape the deck list from the provided deck link.

    The deck page is searched for anchors whose href contains "mtgo"; the
    target of the selected anchor is downloaded and returned as text.

    :param str deck_link: link to a deck page on mtgtop8.com
    :param float timeout: seconds to wait for the server on each request
    :return: deck list as string, or None when the page has no "mtgo" link
        (a message is printed in that case)
    """
    # Random 0-3 s pause before each request (presumably to avoid hammering the site).
    time.sleep(random.randint(0, 3))
    page = requests.get(deck_link, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    mtgo_anchors = soup.find_all('a', {'href': re.compile(r'mtgo')})

    if not mtgo_anchors:
        print(deck_link + " gave an error")
        return None

    # With exactly two matches the second anchor is used, otherwise the first.
    # NOTE(review): presumably the first of two is not the export link - confirm.
    chosen = mtgo_anchors[1] if len(mtgo_anchors) == 2 else mtgo_anchors[0]

    full_url = url + chosen['href']
    print(full_url)
    time.sleep(random.randint(0, 3))
    # The linked resource is the *.txt file containing the deck list in MTGO format.
    export = requests.get(full_url, timeout=timeout)
    return export.text

In [12]:
# get_deck_list("https://www.mtgtop8.com/event?e=39428&d=493595&f=LE")

In [13]:
def get_all_decklists(mtg_format: str) -> pd.DataFrame:
    """
    Build a table of every deck found for a format, together with its deck list.

    Progress is reported on stdout: the total number of decks, a running count
    after each download, and finally the complete table.

    :param str mtg_format: one of the supported formats
    :return: pd.DataFrame with the columns 'Deck link' and 'Deck list'; a
        'Deck list' entry is None when no list could be fetched for that deck
    """
    decks = get_deck_links_all(mtg_format)
    print("\nTotal decklists found:", len(decks))

    fetched_lists = []
    for done, link in enumerate(decks['Deck link'], start=1):
        # download the deck list text behind this deck's web page
        fetched_lists.append(get_deck_list(link))
        print(str(done) + ' decks done')

    decks['Deck list'] = fetched_lists
    print(decks)

    return decks

In [20]:
# get_deck_list('https://www.mtgtop8.com/event?e=38037&d=484549&f=MO')
# Scrape every deck list of one format and store the result as
# "<format> decks.csv" in the current working directory.
format_to_scrape = 'vintage'
print(f'getting all {format_to_scrape} decks')
df_to_save = get_all_decklists(format_to_scrape)
df_to_save.to_csv(f'{format_to_scrape} decks.csv', index=False)


getting all vintage decks

Total decklists found: 164
https://www.mtgtop8.com/mtgo?d=490465&f=Vintage_Dimir_Control_by_Daniele_Masi
1 decks done
https://www.mtgtop8.com/mtgo?d=490463&f=Vintage_Paradoxical_by_Davide_Rosadini
2 decks done
https://www.mtgtop8.com/mtgo?d=490464&f=Vintage_Mentor_Paradoxical_by_David_Cioffini
3 decks done
https://www.mtgtop8.com/mtgo?d=490466&f=Vintage_Golos_MUD_by_Tommaso_Pacini
4 decks done
https://www.mtgtop8.com/mtgo?d=490384&f=Vintage_Selesnya_Hatebear_by_Wesal
5 decks done
https://www.mtgtop8.com/mtgo?d=490388&f=Vintage_Grixis_Control_by_discoverN
6 decks done
https://www.mtgtop8.com/mtgo?d=490382&f=Vintage_Grixis_Control_by_handsomePPZ
7 decks done
https://www.mtgtop8.com/mtgo?d=490383&f=Vintage_Merfolk_by_Mogged
8 decks done
https://www.mtgtop8.com/mtgo?d=490386&f=Vintage_Grixis_Control_by_KingHairy
9 decks done
https://www.mtgtop8.com/mtgo?d=490385&f=Vintage_Paradoxical_Storm_by_RespectTheCat
10 decks done
https://www.mtgtop8.com/mtgo?d=490381&f=Vin

91 decks done
https://www.mtgtop8.com/mtgo?d=491125&f=Vintage_Bazaar_Aggro_by_Firetruck
92 decks done
https://www.mtgtop8.com/mtgo?d=491132&f=Vintage_Grixis_Control_by_mosh110
93 decks done
https://www.mtgtop8.com/mtgo?d=491127&f=Vintage_Bazaar_Aggro_by_Hitogoroshi80
94 decks done
https://www.mtgtop8.com/mtgo?d=491128&f=Vintage_Dredge_by_TheSleepingEye
95 decks done
https://www.mtgtop8.com/mtgo?d=491130&f=Vintage_Hatebear_by_wizardwand
96 decks done
https://www.mtgtop8.com/mtgo?d=491126&f=Vintage_Jund_by_Absorbentthree
97 decks done
https://www.mtgtop8.com/mtgo?d=491129&f=Vintage_Oath_of_Druids_by_IfHeDiesHeDies
98 decks done
https://www.mtgtop8.com/mtgo?d=490302&f=Vintage_Bazaar_Aggro_by_IamActuallyLvL1
99 decks done
https://www.mtgtop8.com/mtgo?d=490301&f=Vintage_Underworld_Breach_by_Phill_Hellmuth
100 decks done
https://www.mtgtop8.com/mtgo?d=490303&f=Vintage_Underworld_Breach_by_Kinarus
101 decks done
https://www.mtgtop8.com/mtgo?d=490300&f=Vintage_Landless_Spy_by_sandydogmtg
102 d