In [13]:
def get_fighter_pages(URL = "https://www.bjjheroes.com/a-z-bjj-fighters-list") -> list:
    """
    Return a list of unique page IDs that represent each fighter's profile page
    Args:
        URL: str - This is defaulted to the 'A-Z BJJ Fighters List' page on BJJ Heroes
    """    
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    fighter_pages = []

    for link in soup.tbody.find_all('a', href=True):
        if link not in fighter_pages:
            fighter_pages.append(link['href'])

    # Our raw list has links to irrelevant webpages, parts of the desired link that need cleaning up, and duplicate entries (multiple columns within
    # the tbody object have separate links to the same page). We'll fix all of these with a few list comprehensions. 
    fighter_pages = [x for x in fighter_pages if "/?p=" in x]

    fighter_pages = [x.replace('/?p=', '') for x in fighter_pages]
    #find some way to unpack this into a dict
    fighter_pages = list(dict.fromkeys(fighter_pages))

    return fighter_pages

def get_fighter_name(soup) -> str:
    # Gets the webpage's title and returns the fight's name from the meta tag
    title = soup.find("meta", property="og:title")
    fighter = title["content"]
    fighter = remove_suffix(fighter, " | BJJ Heroes")
    return fighter


def get_table_rows(soup):
    # Returns an object containing all HTML table rows in the 'soup' input
    return soup.tbody.find_all('tr')

def convert_tr_to_list(tr) -> list:
    # Iterate over the 'tr' object, remove HTML tags, strip and separate with a comma, then convert it to a list of elements.
    # Returns a list of lists representing each row of data points.
    list = []
    index = 0
    for iter in tr:
        x = tr[index].get_text(",", strip=True)
        x = x.split(",")
        list.append(x)
        index += 1
    return list

def clean_rows(input, fighter_name):
    for i in input:
        # In some cases, the 'Opponent' column has a reference link to that person's page. The link is returned as it's on 'td' value and should be removed.
        if i[1] == i[2]:
            i.pop(1)

        # The first column comes back with some unknown id that can be removed.
        i.pop(0)

        # The figher's name that is being parsed needs to be added to the 'Fighter' column.
        i.insert(0, fighter_name)

        # Finally, the 'Method' column sometimes contains a string ' Adv' if both fighters tied and results were determined by an advantage.
        # This causes issues with the comma as a separate so let's take it out for now.
        if ' Adv' in i:
            i.remove(' Adv')
    return input

def wait_time(start_time, wait=4) -> float:
    if time.time() - start_time > wait:
        return 0
    else:
        return wait - (time.time() - start_time)
        
def save_to_csv(df, filename) -> None:
    """
    Save a pandas dataframe as a CSV file with the given filename.
    Args:
        df: pandas.DataFrame - The dataframe to be saved.
        filename: str - The name of the CSV file to be saved.
    """
    df.to_csv(filename, index=False)

In [14]:
## remove suffix is deprecated

def remove_suffix(input_string, suffix):
    if suffix and input_string.endswith(suffix):
        return input_string[:-len(suffix)]
    return input_string

In [15]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd



df = pd.DataFrame(columns=['Fighter', 'Opponent', 'W/L', 'Method', 'Competition', 'Weight', 'Stage', 'Year'])

error_count = {
    'Append Row Error': 0, 
    'No Table Body Found': 0,
    'Response Error': 0}

ID = get_fighter_pages()


for i in ID:
    URL = "https://www.bjjheroes.com/?p=" + str(i)
    ts1 = time.time()
    response = requests.get(URL)

    if response.status_code == 200:
        print("Successful connection to: " + URL)
        soup = BeautifulSoup(response.content, 'html.parser')
    else:
        print("Status Code Error: " + str(response.status_code))
        error_count["Response Error"] += 1
        print("Waiting " + str(wait_time(ts1)) + " seconds")
        time.sleep(wait_time(ts1))
        continue

    try:
        tr = get_table_rows(soup) 
    except:
        print("No TR found in the table body. Breaking loop for ID: " + str(i))
        error_count['No Table Body Found'] += 1
        print("Waiting " + str(wait_time(ts1)) + " seconds")
        time.sleep(wait_time(ts1))
        continue

    fighter_name = get_fighter_name(soup) # Get the fighter's name from the webpage meta title

    list = convert_tr_to_list(tr) # Convert the 'tr' object to a list of lists, then clean bad data from the rows
    list = clean_rows(list, fighter_name)

    for i in list: # Add each list element as a new row in the dataframe
        try:
            df.loc[len(df)] = i
        except:
            print("Error while appending")
            error_count['Append Row Error'] += 1

    print("Waiting " + str(wait_time(ts1)) + " seconds")
    time.sleep(wait_time(ts1))

save_to_csv(df, "data.csv")
print(error_count)

TypeError: 'list' object is not callable