# Scraping player salaries and Madden ratings

## Data sources:

### https://overthecap.com --> player salaries as of June 1, 2025
    used for webscraping to obtain dataframe
### https://www.spotrac.com/nfl --> contract type (Rookie, Undrafted, other) as of June 5, 2025
    used for reference to manually fill contract type - did not scrape
### https://www.ea.com/en/games/madden-nfl --> 2024 Madden Ratings
    used for webscraping to obtain dataframe

In [20]:
import requests
import bs4
import numpy as np
import pandas as pd

In [10]:
def read_data(url, timeout=30):
    """
    Download a web page and parse it into a BeautifulSoup tree.
    ---------------
    Parameters:
    ---------------

    Inputs:
        url (string): the url of interest
        timeout (number): seconds to wait for the server before giving up
            (default 30)

    Outputs:
        resulted (bs4.BeautifulSoup): the parsed HTML to search through

    Raises:
        requests.exceptions.Timeout: the server did not answer within
            `timeout` seconds
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status code

    """
    # Without a timeout, requests can wait indefinitely on a stalled server.
    r = requests.get(url, timeout=timeout)
    # Echo the response object so the status code is visible while scraping.
    print(r)
    # Stop here on an error status instead of parsing an error page as data.
    r.raise_for_status()
    # 'html' lets BeautifulSoup choose among the HTML parsers it has installed.
    resulted = bs4.BeautifulSoup(r.text, 'html')
    return resulted

In [118]:
# Position names used to build the overthecap.com URLs that are scraped,
# one page per entry.
positions = [
    'quarterback',
    'running-back',
    'fullback',
    'wide-receiver',
    'tight-end',
    'left-tackle',
    'left-guard',
    'center',
    'right-guard',
    'right-tackle',
    'interior-defensive-line',
    'edge-rusher',
    'linebacker',
    'safety',
    'cornerback',
    'kicker',
    'punter',
    'long-snapper',
]



def compile_data():
    """
    Scrape the contract table for every entry in `positions` from
    overthecap.com and stack the results into one dataframe.
    ---------------
    Parameters:
    ---------------

    Inputs:
        None (reads the module-level `positions` list and calls `read_data`
        once per position)

    Outputs:
        contract_dataset (DataFrame): one row per scraped table row, with
            columns 'Player', 'Team', 'Age', 'Total Value', 'APY',
            'Total Guaranteed', 'Fully Guaranteed', 'Free Agency' and
            'position'. Cell values are kept as text with '$' and ','
            removed; the index is a fresh 0..n-1 range over all positions.

    Raises:
        ValueError: a page's number of <td> cells is not a multiple of 8,
            i.e. the cells cannot be arranged into the expected columns

    """
    table_columns = ['Player', 'Team', 'Age', 'Total Value', 'APY', 'Total Guaranteed', 'Fully Guaranteed', 'Free Agency']
    n_columns = len(table_columns)

    # Collect one frame per position and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every time.
    position_frames = []

    for position in positions:
        url = f'https://overthecap.com/position/{position}'
        webpage_html = read_data(url)

        # Read each cell's text through BeautifulSoup rather than slicing
        # the tag's string form, then drop currency formatting.
        cells = [
            td.get_text().strip().replace('$', '').replace(',', '')
            for td in webpage_html.find_all('td')
        ]

        if len(cells) % n_columns != 0:
            raise ValueError(
                f'{url}: found {len(cells)} <td> cells, which is not a '
                f'multiple of the {n_columns} expected columns'
            )

        position_contract_df = pd.DataFrame(
            data=np.array(cells).reshape(-1, n_columns),
            columns=table_columns,
        )
        position_contract_df['position'] = position

        position_frames.append(position_contract_df)

    if not position_frames:
        # pd.concat rejects an empty list; return the empty schema instead.
        return pd.DataFrame(columns=table_columns + ['position'])

    # ignore_index avoids every position restarting its row labels at 0.
    return pd.concat(position_frames, ignore_index=True)

In [104]:
# Build the salary dataframe; compile_data() fetches one overthecap.com page
# per entry in `positions`, so this cell performs network requests.
data = compile_data()

In [112]:
#data.to_csv('../data/player_salaries.csv', index=0)

In [465]:

# Team names accepted when filtering the image alt texts scraped from the
# Madden ratings pages.
teams_on_madden = [
    'Miami Dolphins',
    'Philadelphia Eagles',
    'Los Angeles Chargers',
    'NY Giants',
    'Kansas City Chiefs',
    'Minnesota Vikings',
    'Arizona Cardinals',
    'New England Patriots',
    'Denver Broncos',
    'Houston Texans',
    'Dallas Cowboys',
    'Las Vegas Raiders',
    'Pittsburgh Steelers',
    'Atlanta Falcons',
    'San Francisco 49ers',
    'Washington Commanders',
    'Tennessee Titans',
    'Chicago Bears',
    'Cincinnati Bengals',
    'Cleveland Browns',
    'Seattle Seahawks',
    'Carolina Panthers',
    'Green Bay Packers',
    'NY Jets',
    'Baltimore Ravens',
    'Tampa Bay Buccaneers',
    'Buffalo Bills',
    'Indianapolis Colts',
    'New Orleans Saints',
    'Los Angeles Rams',
    'Jacksonville Jaguars',
    'Detroit Lions',
]


def _madden_rating_text(stat_tag):
    """Return the text of a stat cell that precedes any nested aria-hidden span."""
    return str(stat_tag).split('<span aria-hidden="true"')[0].replace('</span>', '').split('>')[-1]


def compile_madden_data(num_pages=21):
    """
    Scrape the Madden ratings pages on ea.com and stack them into one
    dataframe.
    ---------------
    Parameters:
    ---------------

    Inputs:
        num_pages (int): how many ratings pages to fetch, starting at
            page 1 (default 21)

    Outputs:
        madden_dataset (DataFrame): one row per player with columns
            'Player', 'Team', 'Position', 'OVR', 'SPD', 'STR', 'AGI',
            'COD', 'INJ', 'AWR'. Values are kept as text; the index is a
            fresh 0..n-1 range over all pages. Empty (columns only) when
            num_pages is less than 1.

    Raises:
        ValueError: (from pandas) the per-column lists scraped from a page
            do not all have the same length

    """
    columns = ['Player', 'Team', 'Position', 'OVR', 'SPD', 'STR', 'AGI', 'COD', 'INJ', 'AWR']
    if num_pages < 1:
        return pd.DataFrame(columns=columns)

    rating_names = columns[3:]
    # The stat cells are read with a stride of 14 per player, keeping only
    # the first seven of each group (presumably 14 stat columns per table
    # row -- confirm against the live page if the layout changes).
    cells_per_player = 14

    # Build the lookup set once; membership is tested for every image.
    valid_teams = set(teams_on_madden)

    # Collect one frame per page and concatenate once at the end.
    page_frames = []

    for page_num in range(1, num_pages + 1):
        url = f'https://www.ea.com/en/games/madden-nfl/ratings?page={page_num}'
        webpage_html = read_data(url)

        player_names_raw = webpage_html.find_all('div', class_='Table_profileContent__0t2_u')
        positions_raw = webpage_html.find_all('span', class_='Table_tag__vKZKn generated_utility20sm__ZX2Hf generated_utility19md__XKkU_')
        # Images share one class; skip the ones whose markup mentions X-Factor.
        teams_raw = [i for i in webpage_html.find_all('img', class_='Picture_image__L8suG', style="width:100%;height:100%") if 'X-Factor' not in str(i)]
        ratings_raw = webpage_html.find_all('span', class_='Table_statCellValue__zn5Cx')

        players = [str(i).split('__0t2_u">')[-1].replace('</div>', '') for i in player_names_raw]
        teams_reduced = [str(i).split('alt="')[1].split('" class')[0] for i in teams_raw]
        # Keep only alt texts that are known team names, one per player.
        teams = [i for i in teams_reduced if i in valid_teams][:len(players)]
        # Named player_positions so the module-level `positions` list is
        # not shadowed inside this function.
        player_positions = [str(i).split('XKkU_">')[-1].replace('</span>', '') for i in positions_raw]

        page_columns = {
            'Player': players,
            'Team': teams,
            'Position': player_positions,
        }
        # The k-th rating column is every 14th stat cell starting at offset k.
        for offset, rating_name in enumerate(rating_names):
            page_columns[rating_name] = [
                _madden_rating_text(tag)
                for tag in ratings_raw[offset::cells_per_player]
            ]

        page_frames.append(pd.DataFrame(page_columns))

    # ignore_index avoids every page restarting its row labels at 0.
    return pd.concat(page_frames, ignore_index=True)

In [467]:
# Build the ratings dataframe; compile_madden_data() fetches the ea.com
# ratings pages through read_data(), so this cell performs network requests.
madden_data = compile_madden_data()

In [479]:
#madden_data.to_csv('../data/madden_ratings.csv', index=0)