In [2]:
# import libraries
# any way to make this always run?
import pandas as pd
import numpy as np
import requests
import re
import os
from bs4 import BeautifulSoup
import time

# make sure we're in the right directory
# NOTE: the data paths used later in this file are built from os.getcwd(),
# so this hard-coded, machine-specific directory has to exist before the
# rest of the file is run
os.chdir('/Users/ryan-saloma/Python Projects/football_financials')

In [3]:
# collect the table names from a parsed page
# input: BeautifulSoup object
# output: list of h2 texts (whitespace-stripped), keeping only the ones
#         that start with four digits
# assuming table naming structure is consistent
def get_h2s(soup):
    starts_with_four_digits = re.compile(r'\d{4}')
    table_names = []
    for tag in soup.find_all('h2'):
        label = tag.text.strip()
        # match() anchors at the start of the string
        if starts_with_four_digits.match(label):
            table_names.append(label)
    return table_names

# download a team page and pull out its name, tables and table headings
# input: url, timeout (seconds to wait for the server; defaults to 30)
# output: team name, list of tables (DataFrames), list of h2s (table names)
# raises: whatever requests raises on a timeout / connection problem,
#         requests.HTTPError on a 4xx/5xx response,
#         ValueError from pd.read_html when the page has no tables
def get_team_tables(url, timeout=30):
    # only needed here, so keep the import local to the function
    from io import BytesIO

    # wait a random amount of time (1 to 4 seconds) before making the request
    # TODO: make this react to the website response
    wait_time = np.random.randint(1, 5)
    time.sleep(wait_time)

    # without a timeout a stalled server would block the scrape forever
    page = requests.get(url, timeout=timeout)
    # fail loudly on an error page instead of parsing it as if it were data
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    team_name = get_team_name(soup)
    # read_html already returns a list of DataFrames; wrapping the bytes in a
    # file-like object avoids the deprecated "literal html" input path
    tables = pd.read_html(BytesIO(page.content))
    h2s = get_h2s(soup)
    return team_name, tables, h2s

# turn a table heading (h2 text) into a string suitable for a file name:
#   1. make everything lowercase
#   2. replace every space with an underscore
#   3. replace every / with +
#   4. collapse runs of underscores into a single underscore
def h2_to_str(h2):
    slug = h2.lower().replace(' ', '_').replace('/', '+')
    return re.sub(r'_+', '_', slug)

# get the team name from the first h1 on the page
# input: BeautifulSoup object (soup.find('h1') must return a tag)
# output: the h1 text with its ' <four digits> Cap Table' part removed
def get_team_name(soup):
    heading = soup.find('h1')
    return re.sub(r' \d{4} Cap Table', '', heading.text)

# get the available years from team url
# input: url, timeout (seconds to wait for the server; defaults to 30)
# output: team name, list of the option texts found in the year selector
# raises: whatever requests raises on a timeout / connection problem,
#         requests.HTTPError on a 4xx/5xx response,
#         AttributeError when the page has no <select name="year">
def get_available_years(url, timeout=30):
    # without a timeout a stalled server would block the call forever
    page = requests.get(url, timeout=timeout)
    # fail loudly on an error page instead of parsing it as if it were data
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    team_name = get_team_name(soup)
    # format: <select name="year" class="form-select form-select-sm" tabindex="0" control-id="ControlID-15">
    select = soup.find('select', {'name': 'year'})
    years = [option.text for option in select.find_all('option')]
    return team_name, years

# handle changed team names
# former team name -> name the franchise is filed under
_TEAM_RENAMES = {
    'Oakland Raiders': 'Las Vegas Raiders',
    'San Diego Chargers': 'Los Angeles Chargers',
    'St. Louis Rams': 'Los Angeles Rams',
    'Washington Football Team': 'Washington Commanders',
    'Washington Redskins': 'Washington Commanders',
}


# input: team name
# output: the replacement name for a listed former name,
#         otherwise the name that was passed in
def handle_team_name_changes(team_name):
    return _TEAM_RENAMES.get(team_name, team_name)


In [None]:
# scrape every team page listed in cap_space_all_teams.csv, walking back one
# year at a time until 2011, and save each table on the page as a raw csv
# under data/teams/<team name>/<year>/raw/

# the input files live in ./data relative to the current working directory
data_dir = os.path.join(os.getcwd(), 'data')
team_urls = pd.read_csv(os.path.join(data_dir, 'cap_space_all_teams.csv'))
team_codes = pd.read_csv(os.path.join(data_dir, 'team_codes.csv'))
# turn the team_codes table into a dict: team_name -> team_code
team_codes = team_codes.set_index('team_name')
team_codes = team_codes.to_dict()['team_code']

for url in team_urls['team_url']:

    # the starting year is the first 4-digit number in the url
    year = re.search(r'(\d{4})', url).group(1)
    year = int(year)

    # go through each year and get page until 2011
    while year > 2010:

        team_name, tables, h2s = get_team_tables(url)
        team_name = handle_team_name_changes(team_name)
        print(f'Retrieved data for {team_name} for the year {year}')

        # makedirs also creates the missing <team name>/<year> parents
        # (os.mkdir would fail on them) and is a no-op when the directory
        # is already there
        raw_dir = os.path.join(data_dir, 'teams', team_name, str(year), 'raw')
        os.makedirs(raw_dir, exist_ok=True)

        table_names = [h2_to_str(h2) for h2 in h2s]

        # NOTE: the page is always downloaded, even when all of its files
        # already exist; only the write is skipped for existing files
        # tables and table_names are paired by position
        for i, table in enumerate(tables):
            csv_name = team_codes[team_name] + '_' + table_names[i] + '_raw.csv'
            csv_path = os.path.join(raw_dir, csv_name)
            # check if file exists
            if os.path.exists(csv_path):
                continue
            table.to_csv(csv_path, index=False)

        # step back one year and point the url at that year
        year = year - 1
        url = re.sub(r'\d{4}', str(year), url)