In [1]:
def get_fighter_pages(URL = "https://www.bjjheroes.com/a-z-bjj-fighters-list") -> list:
    """
    Return a list of unique page IDs that represent each fighter's profile page
    Args:
        URL: str - This is defaulted to the 'A-Z BJJ Fighters List' page on BJJ Heroes
    """    
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    fighter_pages = []

    for link in soup.tbody.find_all('a', href=True):
        if link not in fighter_pages:
            fighter_pages.append(link['href'])

    # Our raw list has links to irrelevant webpages, parts of the desired link that need cleaning up, and duplicate entries (multiple columns within
    # the tbody object have separate links to the same page). We'll fix all of these with a few list comprehensions. 
    fighter_pages = [x for x in fighter_pages if "/?p=" in x]

    fighter_pages = [x.replace('/?p=', '') for x in fighter_pages]
    #find some way to unpack this into a dict
    fighter_pages = dict.fromkeys(fighter_pages)
    #print(fighter_pages)
    #print(type(fighter_pages))
    fighter_pages = list(fighter_pages.keys())
    print(fighter_pages)

    return fighter_pages

def get_fighter_name(soup) -> str:
    # Gets the webpage's title and returns the fight's name from the meta tag
    title = soup.find("meta", property="og:title")
    fighter = title["content"]
    fighter = remove_suffix(fighter, " | BJJ Heroes")
    return fighter


def get_table_rows(soup):
    # Returns an object containing all HTML table rows in the 'soup' input
    return soup.tbody.find_all('tr')

def convert_tr_to_list(tr) -> list:
    # Iterate over the 'tr' object, remove HTML tags, strip and separate with a comma, then convert it to a list of elements.
    # Returns a list of lists representing each row of data points.
    l = []
    index = 0
    for iter in tr:
        x = tr[index].get_text(",", strip=True)
        x = x.split(",")
        l.append(x)
        index += 1
    return l

def clean_rows(input, fighter_name):
    for i in input:
        # In some cases, the 'Opponent' column has a reference link to that person's page. The link is returned as it's on 'td' value and should be removed.
        if i[1] == i[2]:
            i.pop(1)

        # The first column comes back with some unknown id that can be removed.
        i.pop(0)

        # The figher's name that is being parsed needs to be added to the 'Fighter' column.
        i.insert(0, fighter_name)

        # Finally, the 'Method' column sometimes contains a string ' Adv' if both fighters tied and results were determined by an advantage.
        # This causes issues with the comma as a separate so let's take it out for now.
        if ' Adv' in i:
            i.remove(' Adv')
    return input

def wait_time(start_time, wait=4) -> float:
    if time.time() - start_time > wait:
        return 0
    else:
        return wait - (time.time() - start_time)
        
def save_to_csv(df, filename) -> None:
    """
    Save a pandas dataframe as a CSV file with the given filename.
    Args:
        df: pandas.DataFrame - The dataframe to be saved.
        filename: str - The name of the CSV file to be saved.
    """
    df.to_csv(filename, index=False)

In [2]:
## remove suffix is deprecated

def remove_suffix(input_string, suffix):
    if suffix and input_string.endswith(suffix):
        return input_string[:-len(suffix)]
    return input_string

In [5]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd



df = pd.DataFrame(columns=['Fighter', 'Opponent', 'W/L', 'Method', 'Competition', 'Weight', 'Stage', 'Year'])

error_count = {
    'Append Row Error': 0, 
    'No Table Body Found': 0,
    'Response Error': 0}

ID = get_fighter_pages()


for i in ID:
    URL = "https://www.bjjheroes.com/?p=" + str(i)
    ts1 = time.time()
    response = requests.get(URL)

    if response.status_code == 200:
        print("Successful connection to: " + URL)
        soup = BeautifulSoup(response.content, 'html.parser')
    else:
        print("Status Code Error: " + str(response.status_code))
        error_count["Response Error"] += 1
        print("Waiting " + str(wait_time(ts1)) + " seconds")
        time.sleep(wait_time(ts1))
        continue

    try:
        tr = get_table_rows(soup) 
    except:
        print("No TR found in the table body. Breaking loop for ID: " + str(i))
        error_count['No Table Body Found'] += 1
        print("Waiting " + str(wait_time(ts1)) + " seconds")
        time.sleep(wait_time(ts1))
        continue

    fighter_name = get_fighter_name(soup) # Get the fighter's name from the webpage meta title

    print(tr)
    li = convert_tr_to_list(tr)
    print(li) # Convert the 'tr' object to a list of lists, then clean bad data from the rows
    li = clean_rows(li, fighter_name)
    print(li)

    for i in li: # Add each list element as a new row in the dataframe
        try:
            df.loc[len(df)] = i
        except:
            print("Error while appending")
            error_count['Append Row Error'] += 1

    print("Waiting " + str(wait_time(ts1)) + " seconds")
    time.sleep(wait_time(ts1))

save_to_csv(df, "data.csv")
print(error_count)

['8141', '9246', '8494', '390', '3083', '8814', '8131', '1133', '7478', '6339', '8968', '9541', '699', '1116', '909', '4328', '2189', '289', '2583', '12235', '696', '2611', '9876', '7660', '11746', '13084', '1131', '1757', '1567', '9166', '9312', '449', '474', '727', '9793', '9407', '666', '311', '1671', '84', '1517', '8525', '12569', '891', '9377', '8884', '10004', '826', '2052', '1553', '9058', '7012', '12064', '13078', '8301', '486', '1391', '12639', '8888', '6979', '2357', '276', '11685', '249', '1101', '9615', '1716', '144', '11646', '7747', '1640', '78', '11384', '1414', '314', '8519', '11808', '11878', '12703', '8293', '10638', '10764', '540', '1104', '957', '1236', '1536', '911', '9130', '1865', '12', '387', '584', '9931', '260', '11703', '381', '9401', '1167', '188', '2606', '12203', '1023', '1119', '1998', '425', '1433', '1158', '471', '898', '9385', '9048', '10862', '7535', '253', '2358', '422', '8298', '9640', '11354', '12599', '1659', '222', '7568', '637', '684', '1707', '

KeyboardInterrupt: 

In [4]:
df.head()

Unnamed: 0,Fighter,Opponent,W/L,Method,Competition,Weight,Stage,Year
0,"Aaron ""Tex"" Johnson",Quentin Rosensweig,L,Inside heel hook,Kakuto 5,ABS,SPF,2015
1,"Aaron ""Tex"" Johnson",Neiman Gracie,L,RNC,NoGi Pan Ams,94KG,SF,2015
2,"Aaron ""Tex"" Johnson",Richie Martinez,L,Heel hook,Kakuto Challenge,ABS,SF,2015
3,"Aaron ""Tex"" Johnson",Leo Nogueira,L,Points,Atlanta W. Open,94KG,SF,2016
4,"Aaron ""Tex"" Johnson",Romulo Azevedo,L,,UAEJJF NYC Pro,94KG,SF,2016
