# Functions that scrape the On3 website

In [6]:
def get_player_ratings_and_school(url):
    '''
    Gets player ratings + school name for every player listed on one On3 page
    Input: url of the On3 page to scrape
    Output: dataframe with one row per player entry found on that page
    '''

    #requests.get sends an HTTP GET request and returns a requests.Response object
    #NOTE: the status code is not checked, so an error page just yields no players
    page = requests.get(url)

    #response text is the raw HTML; 'html.parser' is the parser BeautifulSoup uses on it
    parsed_page = BeautifulSoup(page.text, 'html.parser')

    #finds every <li> element carrying the player-row CSS class
    #find_all returns a ResultSet (iterable like a list) of Tag objects to search further
    player_items = parsed_page.find_all('li', class_ = "IndustryComparisonList_industryComparisonItemContainer__QNFjk")

    #create_player needs each player's position on the page, because it derives
    #the per-service link class names from that index
    rows = [create_player(item, position) for position, item in enumerate(player_items)]

    return pd.DataFrame(rows)


def _find_service_link(player, jss_number):
    '''
    Finds the link element that holds one recruiting service's rating
    Input: beautiful soup object with player, number used in the link's "jss<N>" class
    Output: matching Tag, or None when the player has no such link
    '''
    link_class = (
        "MuiTypography-root MuiLink-root MuiLink-underlineNone MuiBox-root "
        f"jss{jss_number} "
        "IndustryComparisonFourServicesItem_serviceItemContainer__Vxx5H "
        "IndustryComparisonConditionalLink_conditionalLink__C6QoW "
        "MuiTypography-colorPrimary"
    )
    return player.find('a', class_=link_class)


def create_player(player, i):
    '''
    Creates a player for the dataframe
    Input: beautiful soup object with player, index of current player on the page
    Output: dict with player's information (name, four ratings, school name, school location)
    '''

    new_obs = {}

    #each player owns four consecutive "jss<N>" link classes, starting at 27 + 4 * i:
    #On3 (+0), 247 (+1), ESPN (+2), Rivals (+3)
    first_jss = 27 + i * 4
    on3_a = _find_service_link(player, first_jss)
    sports247_a = _find_service_link(player, first_jss + 1)
    ESPN_a = _find_service_link(player, first_jss + 2)
    rivals_a = _find_service_link(player, first_jss + 3)

    new_obs["Player_Name"] = player.find('a', class_='MuiTypography-root MuiLink-root MuiLink-underlineNone MuiTypography-h5 MuiTypography-colorPrimary').text
    new_obs["On3_rating"] = notRivals_rating_assign(on3_a)
    new_obs["ESPN_rating"] = notRivals_rating_assign(ESPN_a)
    new_obs["Rivals_rating"] = rivals_rating_assign(rivals_a)
    new_obs["247_rating"] = notRivals_rating_assign(sports247_a)

    #get_school_name returns None when the school or city is missing; unpacking
    #None would raise TypeError, so store empty values for that player instead
    school = get_school_name(player)
    if school is None:
        school = (None, None)
    new_obs["On3SchN"], new_obs["On3SchLoc"] = school

    return new_obs
        


def notRivals_rating_assign(soupObj, default=40.9):
    '''
    Assign a non-Rivals rating (On3 / ESPN / 247) scraped from the on3 website
    Input: beautiful soup object that may contain rating (may be None),
           default: value returned when no rating is present
    Output: rating text exactly as shown on the page (a string), or default
    '''
    if soupObj is None:
        return default

    #search once and reuse the result instead of repeating the same lookup
    rating_span = soupObj.find('span', class_ = "StarRating_overallRating__MTh52 StarRating_gray__xYvHF")
    if rating_span is None:
        return default

    return rating_span.text


def rivals_rating_assign(soupObj, default=2.7):
    '''
    Assign a Rivals rating scraped from the on3 website
    Input: beautiful soup object that may contain rating (may be None),
           default: value returned when no rating is present
    Output: rating text exactly as shown on the page (a string), or default
    '''
    if soupObj is None:
        return default

    #search once and reuse the result instead of repeating the same lookup
    rating_span = soupObj.find('span', class_ = "StarRating_overallRating__MTh52 StarRating_gray__xYvHF")
    if rating_span is None:
        return default

    return rating_span.text


def get_school_name(player):
    '''
    Gets the player's ON3 school name, will be important to get other data later
    Input: beautiful soup object for the player's entry on an On3 web page
    Output: tuple (school name, " (" + city + ")"), or None when either piece is missing
    '''
    school_block = player.find('p', class_='MuiTypography-root IndustryComparisonPlayerItem_hometownContainer__Qvs_0 IndustryComparisonPlayerItem_mobile__ROVeH MuiTypography-body1 MuiTypography-colorTextPrimary')
    #the hometown paragraph itself can be absent; only look inside it when it exists
    school_name = school_block.find('span') if school_block is not None else None
    school_city = player.find('span', class_='IndustryComparisonPlayerItem_homeTown__8IcYx')

    if school_name is None or school_city is None:
        return None

    return (school_name.text, " (" + school_city.text + ")")

# Functions that get MaxPreps positions

In [15]:
def get_first_google_result(s):
    '''
    Gets first google result and returns as beautiful soup object
    Input: search string
    Output: beautiful soup object for the first result's website,
            or None when the results page has no usable first result
    '''
    #imported here so the function does not depend on an edit to the import cell
    from urllib.parse import quote_plus

    #collapse runs of whitespace, then URL-encode so characters such as
    #& # ' in a player or school name cannot break the query string
    query = quote_plus(' '.join(s.split()))
    url = f"https://www.google.com/search?q={query}"
    soup = set_up_soup(url)
    first_result = soup.find('div', class_='yuRUbf')

    if first_result is None or first_result.a is None:
        return None

    href = first_result.a.get('href')
    if not href:
        return None

    return set_up_soup(href)
    


def find_pos_on_maxpreps(name, soup, pos):
    '''
    Gets the positions listed for a player's football entry on maxpreps
    Input: player name (not used, kept so existing calls still work),
           maxpreps beautiful soup object, On3 position
    Output: 1-D numpy array of position strings; when no football entry is found,
            the On3 position translated by input_position_from_on3
    '''
    jersey_pos = soup.find_all('div', class_ = 'jersey-pos')
    sports = soup.find_all('div', class_ = 'sport-name')

    #sport names and jersey/position lines are paired up in page order;
    #take the jersey/position line that belongs to the first football entry
    football_entry = next(
        (jersey for sport, jersey in zip(sports, jersey_pos) if 'Football' in str(sport)),
        None,
    )

    if football_entry is None:
        #same 1-D shape as the normal path, so callers can iterate over plain strings
        return np.array(input_position_from_on3(pos)).flatten()

    #the text after the last bullet is the comma-separated position list
    listed_positions = football_entry.text.split('• ')[-1]
    return np.array(listed_positions.split(', ')).flatten()
    

def input_position_from_on3(pos):
    '''
    Differentiates between On3 position and Maxpreps positions
    Input: On3 position (ex. 'qb')
    Output: one-element list with the corresponding maxpreps position (ex. qb should match with QB)
    Raises: ValueError when the On3 position has no maxpreps counterpart (ex. 'ath')
    '''
    #explicit mapping instead of two index-aligned lists padded with placeholder entries
    on3_to_maxpreps = {
        'qb': 'QB',
        'rb': 'RB',
        'wr': 'WR',
        'te': 'TE',
        'ot': 'T',
        'iol': 'G',
        'edge': 'DE',
        'cb': 'CB',
        's': 'FS',
        'dl': 'DT',
        'lb': 'MLB',
    }

    try:
        return [on3_to_maxpreps[pos]]
    except KeyError:
        raise ValueError(f"No maxpreps position mapped for On3 position {pos!r}") from None
   

def assign_pos(jersey_pos, num, poss_positions):
    '''
    Picks the num-th listed position that belongs to a given group
    Inputs: array of the player's positions, which match to return (1 = first),
            positions that count as a match (ex. the offensive or defensive list)
    Outputs: the matching position, or "--" when there are fewer than num matches
    '''
    eligible = (entry for entry in jersey_pos if entry in poss_positions)

    for rank, entry in enumerate(eligible, start=1):
        if rank == num:
            return entry

    return "--"

def get_mp_confName(soup, season='22-23'):
    '''
    Gets a player's athletic conference name, useful for finding all-conference data
    Input: maxpreps soup object for the player,
           season: season segment of the standings URL (defaults to '22-23')
    Output: Conference name of player, or 'Not Found' when the standings page has no heading
    Raises: TypeError when the player page has no school link (None is subscripted)
    '''
    team_link = soup.find('a', class_ = 'sc-333a63d7-0 eWSjMq school')['href']

    #strip any trailing slash first so the joined URL never contains "//"
    standings_url = team_link.rstrip('/') + f'/football/{season}/standings/'
    heading = set_up_soup(standings_url).find("h2", class_ = 'sc-f584fccb-0 hTQrEh heading_125_bold')

    return heading.text if heading is not None else 'Not Found'

# Functions that get team captain and team ratings

In [17]:
def get_team_captain(soup):
    '''
    Finds whether a player is a team captain or not
    Input: maxpreps website beautiful soup object
    Output: 1 if the first football entry mentions 'Captain', 0 if it does not,
            or the string 'IndexError' when the page has no football entry
    '''
    sport_arr = soup.find_all('div', class_ = 'sport')
    football = [sport for sport in sport_arr if 'Football' in str(sport)]

    if not football:
        print('Couldn\'t find team captain for player above')
        return 'IndexError'

    return 1 if 'Captain' in str(football[0]) else 0

def get_team_rating(soup, season='22-23'):
    '''
    Gets a player's team rating from maxpreps
    Input: maxpreps soup object for the player,
           season: season segment of the rankings/schedule URLs (defaults to '22-23')
    Output: text of the last cell in the rankings row that links to the team's
            schedule, or 'Not_found' when no such row exists
    Raises: TypeError when the player page has no school link (None is subscripted)
    '''
    team_link = soup.find('a', class_ = 'sc-333a63d7-0 eWSjMq school')['href']

    #normalise the trailing slash once so both URLs below are built the same way
    team_base = team_link.rstrip('/')
    soup_team = set_up_soup(team_base + f'/football/{season}/rankings/')

    #the team's own row is recognised by the link to its schedule page
    schedule_link = team_base + f'/football/{season}/schedule/'
    for tr in soup_team.find_all("tr"):
        if schedule_link in str(tr):
            cells = tr.find_all("td")
            if cells:
                return cells[-1].text

    return 'Not_found'

def get_mp_potg(soup):
    '''
    Counts a player's "Player of the Game" awards on maxpreps
    Input: soup object of the player's maxpreps page
    Output: (awards mentioning 2022, awards mentioning 2019-2021 plus the 2022 count);
            (0, 0) when the page has no awards link
    '''
    #the awards page is reached through the first class-less <li> that mentions "awards"
    awards_item = next(
        (item for item in soup.find_all('li', class_ = '') if "awards" in str(item)),
        None,
    )
    if awards_item is None:
        return 0, 0

    awards_page = set_up_soup(awards_item.a['href'])

    won_2022 = 0
    won_earlier = 0
    for button in awards_page.find_all('button'):
        label = str(button)
        if "Player of the Game" not in label:
            continue
        if "2022" in label:
            won_2022 += 1
        if any(year in label for year in ("2021", "2020", "2019")):
            won_earlier += 1

    return won_2022, won_earlier + won_2022

def set_up_soup(link, timeout=None):
    '''
    Sets up a beautiful soup object for a website
    Input: Link for parseable website,
           timeout: seconds to wait for the server, passed to requests
                    (None, the default, waits indefinitely as before)
    Output: Soup object ready for parsing
    Raises: requests.exceptions.HTTPError when the server answers with an error status
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }  # headers to approve user agent

    #retry failed connections up to 4 times with a growing pause between attempts
    retry = Retry(connect=4, backoff_factor=2.5)

    #the with-block closes the session (and its pooled connections) after every call
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retry))
        response = session.get(link, headers=headers, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

def fill_df_position_inplace(df):
    '''
    Writes the current player's first three offensive and defensive positions
    Input: Df to write into
    Output: None (df is modified in place)
    Note: the row index i, the player's jersey_pos array and the
          off_positions / def_positions lists are read from module-level names
    '''
    offense_columns = ("MP_OPos1", "MP_OPos2", "MP_OPos3")
    defense_columns = ("MP_DPos1", "MP_DPos2", "MP_DPos3")

    for rank, column in enumerate(offense_columns, start=1):
        df.at[i, column] = assign_pos(jersey_pos, rank, off_positions)

    for rank, column in enumerate(defense_columns, start=1):
        df.at[i, column] = assign_pos(jersey_pos, rank, def_positions)
    
def fill_df_error_inplace(position_dfs, pos, error_type):
    '''
    Blanks out the maxpreps columns of the current player after a failed lookup
    Input: dict of dataframes keyed by On3 position, current On3 position,
           error_type: 'TypeError' (player page found, extra info missing) or
                       'HTTPError' (player page could not be loaded)
    Output: None (the dataframe is modified in place; any other error_type does nothing)
    Note: the row written is the module-level loop index i
    '''
    if error_type not in ('TypeError', 'HTTPError'):
        return

    df = position_dfs[pos]

    if error_type == 'HTTPError':
        #nothing was scraped for this player, so the position columns are filled too
        df.at[i, "On3_position"] = pos
        df.at[i, "MP_OPos1"] = 'HTTPErr'
        for column in ("MP_OPos2", "MP_OPos3", "MP_DPos1", "MP_DPos2", "MP_DPos3"):
            df.at[i, column] = ''

    #columns blanked for both error types
    for column in ("Team_Captain", "MPTmRtg", "MP_potg_count", "MP_potg_count_total", "conference_name"):
        df.at[i, column] = ''


# The code below is the "main" script: the cells I run to execute the data collection

In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests.exceptions import HTTPError
import numpy as np
import re
import time
import sys
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [19]:
# Scratch list for the per-page dataframes; the collection loop rebuilds it for every position
dfs = []

# On3 position codes, used as the "position" query parameter of the rankings URL
positions = [
    'qb',
    'rb',
    'wr',
    'te',
    'ot',
    'iol',
    'edge',
    'dl',
    'lb',
    'cb',
    's',
]

In [20]:
# Maxpreps position abbreviations, grouped by side of the ball
off_positions = 'QB RB WR TE T C G'.split()
def_positions = 'DE CB FS SS DT MLB OLB'.split()

# Every maxpreps abbreviation in one array, offense first
maxpreps_positions = np.array([*off_positions, *def_positions])

# Class years passed into the On3 rankings URL
years = list(range(2023, 2027))

# Collects one CSV per (class year, On3 position): On3 ratings first, then the
# maxpreps details of every player found.
for year in years:
    for pos in positions:
        position_dfs = {}

        #pages 1-5 of the On3 ranking for this class year and position
        dfs = [
            get_player_ratings_and_school(
                f"https://www.on3.com/db/rankings/industry-comparison/football/{year}/?position={pos}&page={page}"
            )
            for page in range(1, 6)
        ]

        position_dfs[pos] = pd.concat(dfs)
        position_dfs[pos].reset_index(drop=True, inplace=True)
        #set up dataframe

        if "Player_Name" not in position_dfs[pos].columns:
            #every page came back without players, so there is nothing to look up or save
            print(f"No players found for {year} {pos}")
            continue

        names = list(position_dfs[pos]["Player_Name"])

        #NOTE: i and jersey_pos have to stay module-level names with exactly these
        #spellings: fill_df_position_inplace and fill_df_error_inplace read them as globals
        for i, name in enumerate(names):
            print(name, i) #shows the player collecting data for
            try:
                time.sleep(2.5 + 0.02 * i)

                #the On3 school can be missing; search by name alone in that case
                on3_school = position_dfs[pos].at[i, 'On3SchN']
                if isinstance(on3_school, str):
                    search_terms = name + " " + on3_school + " maxpreps"
                else:
                    search_terms = name + " maxpreps"

                soup = get_first_google_result(search_terms)
                if soup is None:
                    #no search result means no maxpreps page was loaded, which is
                    #recorded the same way as a page that failed to load
                    print('No search result found')
                    fill_df_error_inplace(position_dfs, pos, 'HTTPError')
                else:
                    school_name = soup.find('div', class_ = 'school-name')
                    school_loc = soup.find('div', class_ = "location")
                    if school_name and school_loc:
                        position_dfs[pos].at[i, "MPSchN"] = school_name.text + school_loc.text
                    #assign school name

                    jersey_pos = find_pos_on_maxpreps(name, soup, pos)
                    position_dfs[pos].at[i, "On3_position"] = pos
                    fill_df_position_inplace(position_dfs[pos])
                    #assign player position
                    try:
                        position_dfs[pos].at[i, "Team_Captain"] = get_team_captain(soup)
                        position_dfs[pos].at[i, "MPTmRtg"] = get_team_rating(soup)
                        position_dfs[pos].at[i, "MP_potg_count"], position_dfs[pos].at[i, "MP_potg_count_total"] = get_mp_potg(soup)
                        position_dfs[pos].at[i, "conference_name"] = get_mp_confName(soup)
                        #assign more player info
                    except TypeError:
                        #raised when an expected link is missing from the player's page
                        fill_df_error_inplace(position_dfs, pos, 'TypeError')

            except HTTPError:
                print('Error encountered')
                fill_df_error_inplace(position_dfs, pos, 'HTTPError')

            #longer pause on every 40th player (including the first) to ease off the servers
            if i % 40 == 0:
                time.sleep(10 + 0.05 * i)

        position_dfs[pos].to_csv(f"new_{year}_{pos}_ratings.csv", index=False)

Arch Manning 0
[]
Couldn't find team captain for player above
Dante Moore 1
[<div class="sport"><div><a class="sc-a2e14072-0 hCsilC" href="https://www.maxpreps.com/mi/detroit/king-crusaders/football/22-23/schedule/" style="--data-color:#004ACE;--data-hover-color:#002cb0">King Varsity Football</a></div><div>#5 • QB</div></div>]
Nico Iamaleava 2
[<div class="sport"><div><a class="sc-a2e14072-0 hCsilC" href="https://www.maxpreps.com/ca/downey/warren-bears/football/21-22/schedule/" style="--data-color:#004ACE;--data-hover-color:#002cb0">Warren Varsity Football</a></div><div>#8 • QB</div></div>]
Jackson Arnold 3
[<div class="sport"><div><a class="sc-a2e14072-0 hCsilC" href="https://www.maxpreps.com/tx/denton/guyer-wildcats/football/22-23/schedule/" style="--data-color:#004ACE;--data-hover-color:#002cb0">Guyer Varsity Football</a></div><div>#11 • QB</div></div>]
Malachi Nelson 4
[<div class="sport"><div><a class="sc-a2e14072-0 hCsilC" href="https://www.maxpreps.com/ca/los-alamitos/los-alamit

KeyboardInterrupt: 