Find the relevant tight ends to web scrape

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 100)

# Weekly superflex rankings. Forward slashes keep this relative path valid on
# every operating system (backslashes are only path separators on Windows) and
# match the forward-slash path used for the fantasy-points data.
df_superflex_rankings = pd.read_parquet('../../data/fantasypros_in_season_rankings/superflex_rankings.parquet')

# rows without a player name cannot be matched to anything, so drop them
df_superflex_rankings = df_superflex_rankings.loc[df_superflex_rankings['Player Name'].notna(), :].reset_index(drop=True)

# make names easier to match
df_superflex_rankings['Player Name'] = (
    df_superflex_rankings['Player Name']
    .str.split().str[:2].str.join(' ')  # make names just the first two words
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)  # remove non-alphanumeric characters
    .str.lower()  # lowercase only (helps with matching)
)

df_superflex_rankings

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK
0,2018,1,1,todd gurley,RB,1
1,2018,1,2,david johnson,RB,2
2,2018,1,3,alvin kamara,RB,3
3,2018,1,4,melvin gordon,RB,4
4,2018,1,5,ezekiel elliott,RB,5
...,...,...,...,...,...,...
41050,2023,17,461,steven sims,WR,183
41051,2023,17,462,jashaun corbin,RB,112
41052,2023,17,463,dee eskridge,WR,184
41053,2023,17,464,clayton tune,QB,65


In [2]:
# Keep only tight ends from the 2020 season onward (for now), ordered so that
# each week's best-ranked TE comes first. Filtering never disturbs row order,
# so one boolean mask followed by a single sort is enough.
is_recent_te = (df_superflex_rankings['POS'] == 'TE') & (df_superflex_rankings['Year'] >= 2020)

df_te = (
    df_superflex_rankings
    .loc[is_recent_te, :]
    .sort_values(['Year', 'Week', 'POS RK'], ignore_index=True)
)

df_te

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK
0,2020,1,43,george kittle,TE,1
1,2020,1,48,travis kelce,TE,2
2,2020,1,70,mark andrews,TE,3
3,2020,1,76,zach ertz,TE,4
4,2020,1,87,darren waller,TE,5
...,...,...,...,...,...,...
6738,2023,17,427,rodney williams,TE,100
6739,2023,17,439,hunter long,TE,101
6740,2023,17,446,albert okwuegbunam,TE,102
6741,2023,17,451,david wells,TE,103


Filter to relevant players

In [3]:
# Weekly player/team records: read the needed columns, clean the names the same
# way as the rankings (first two words, alphanumerics only, lowercase), keep TEs.
team_columns = ['Season', 'Week', 'Name', 'Team', 'Position']
df_teams = pd.read_parquet('../../data/fantasy_points/footballguys_half_ppr.parquet')[team_columns]

first_two_words = df_teams['Name'].str.split().str[:2].str.join(' ')
alphanumeric_only = first_two_words.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df_teams['Name'] = alphanumeric_only.str.lower()

df_teams = df_teams[df_teams['Position'] == 'TE'].reset_index(drop=True)

df_teams

Unnamed: 0,Season,Week,Name,Team,Position
0,2018,1,jared cook,OAK,TE
1,2018,1,rob gronkowski,NE,TE
2,2018,1,will dissly,SEA,TE
3,2018,1,eric ebron,IND,TE
4,2018,1,jordan reed,WAS,TE
...,...,...,...,...,...
8806,2023,17,david wells,TB,TE
8807,2023,17,mitchell wilcox,CIN,TE
8808,2023,17,brayden willis,SF,TE
8809,2023,17,rodney williams,PIT,TE


In [4]:
# Give the team table the same key column names as the rankings, then attach
# each tight end's team for that season and week (unmatched rows keep NaN).
merge_keys = ['Year', 'Week', 'Player Name', 'POS']
teams_for_merge = df_teams.rename(columns={'Season': 'Year', 'Name': 'Player Name', 'Position': 'POS'})

df_te = df_te.merge(teams_for_merge, how='left', on=merge_keys)

# top-25 tight ends that did not receive a team from the merge
no_team = df_te['Team'].isna()
in_top_25 = df_te['POS RK'] <= 25
df_te.loc[no_team & in_top_25, :]

# 2022 Week 17 Bills at Bengals was cancelled

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK,Team
2754,2021,12,156,taysom hill,TE,21,
3042,2021,15,179,brevin jordan,TE,25,
4986,2022,17,125,dawson knox,TE,12,
5634,2023,6,175,dalton kincaid,TE,25,


Fix players with difficult names

In [5]:
# df_teams.loc[df_teams['Name'] == 'will fuller', 'Name'] = 'william fuller'

In [6]:
# Among tight ends ranked in the top 32, count how often each player failed to
# pick up a team in the merge and show the ten most frequent names.
unmatched_top_32 = df_te.loc[df_te['Team'].isna() & (df_te['POS RK'] <= 32), :]
(
    unmatched_top_32
    .groupby(['Player Name'])['POS']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

Player Name
taysom hill       2
brevin jordan     1
dalton kincaid    1
dawson knox       1
john bates        1
juwan johnson     1
Name: POS, dtype: int64

In [7]:
# discard ranking rows that never received a team from the merge
has_team = df_te['Team'].notna()
df_te = df_te[has_team].reset_index(drop=True)

df_te

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK,Team
0,2020,1,43,george kittle,TE,1,SF
1,2020,1,48,travis kelce,TE,2,KC
2,2020,1,70,mark andrews,TE,3,BAL
3,2020,1,76,zach ertz,TE,4,PHI
4,2020,1,87,darren waller,TE,5,LV
...,...,...,...,...,...,...,...
5923,2023,17,397,matt sokol,TE,92,NE
5924,2023,17,408,travis vokolek,TE,96,ARI
5925,2023,17,425,brayden willis,TE,99,SF
5926,2023,17,427,rodney williams,TE,100,PIT


In [8]:
# Keep one row per (Year, Week, Team): the best-ranked tight end on each team.
# The explicit sort makes "first row" mean "lowest POS RK" no matter how the
# frame was ordered before this cell.
# drop_duplicates is used rather than groupby(...).nth(0) because nth changed
# meaning in pandas 2.0: before that release it returned a frame indexed by the
# group keys, so reset_index(drop=True) would discard Year/Week/Team.
df_te = (
    df_te
    .sort_values(['Year', 'Week', 'POS RK'])
    .drop_duplicates(subset=['Year', 'Week', 'Team'], keep='first')
    .reset_index(drop=True)
)

df_te

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK,Team
0,2020,1,43,george kittle,TE,1,SF
1,2020,1,48,travis kelce,TE,2,KC
2,2020,1,70,mark andrews,TE,3,BAL
3,2020,1,76,zach ertz,TE,4,PHI
4,2020,1,87,darren waller,TE,5,LV
...,...,...,...,...,...,...,...
2010,2023,17,197,mike gesicki,TE,31,NE
2011,2023,17,200,josh oliver,TE,32,MIN
2012,2023,17,217,austin hooper,TE,34,LV
2013,2023,17,225,adam trautman,TE,35,DEN


In [9]:
# default URL slug: the cleaned name with spaces replaced by hyphens
df_te['Player Name URL'] = df_te['Player Name'].str.replace(' ', '-')

# players who are known to need a different slug than the default
url_slug_overrides = {
    'chris herndon': 'chris-herndon-iv',
    'irv smith': 'irv-smith-jr',
    'drew ogletree': 'andrew-ogletree',
    'mo aliecox': 'mo-alie-cox',
    'ricky sealsjones': 'ricky-seals-jones',
    'dalton keene': 'dalton-keene-te',
}
for cleaned_name, url_slug in url_slug_overrides.items():
    df_te.loc[df_te['Player Name'] == cleaned_name, 'Player Name URL'] = url_slug

df_te

Unnamed: 0,Year,Week,RK,Player Name,POS,POS RK,Team,Player Name URL
0,2020,1,43,george kittle,TE,1,SF,george-kittle
1,2020,1,48,travis kelce,TE,2,KC,travis-kelce
2,2020,1,70,mark andrews,TE,3,BAL,mark-andrews
3,2020,1,76,zach ertz,TE,4,PHI,zach-ertz
4,2020,1,87,darren waller,TE,5,LV,darren-waller
...,...,...,...,...,...,...,...,...
2010,2023,17,197,mike gesicki,TE,31,NE,mike-gesicki
2011,2023,17,200,josh oliver,TE,32,MIN,josh-oliver
2012,2023,17,217,austin hooper,TE,34,LV,austin-hooper
2013,2023,17,225,adam trautman,TE,35,DEN,adam-trautman


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Chrome launch options; the headless flag below is commented out, so Chrome
# starts with its default settings.
options = Options()
# options.add_argument('--headless')

# Browser session shared by the rest of the notebook: scrape_props drives it
# and a later cell shuts it down with driver.quit().
driver = webdriver.Chrome(options=options)

In [11]:
import time
from selenium.webdriver.common.by import By

def scrape_props(player_name, player_name_url, year, week, team):
    """
    Scrape one player's prop lines for a single week from bettingpros.com.

    Uses the module-level Selenium ``driver``: opens the player's props page
    for the given season and week, scrolls so that lazily loaded rows appear,
    then reads the text of every prop row found on the page.

    Args:
        player_name: Cleaned player name; stored in the result and shown in the warning.
        player_name_url: Name slug inserted into the page URL.
        year: Season, sent as the ``season`` query parameter.
        week: Week, sent as the ``week`` query parameter.
        team: Team code, copied into the result unchanged.

    Returns:
        dict: 'Year', 'Week', 'Player Name' and 'Team', plus the raw text of
        each prop found on the page (receiving yards and receptions
        projection/over/under, anytime touchdown line). Props that are not on
        the page are simply absent from the dict.
    """
    row_class = "grouped-items-with-sticky-footer__content"

    def loaded_rows():
        # every prop row currently present in the DOM
        return driver.find_elements(by=By.CLASS_NAME, value=row_class)

    driver.get(url=f"https://www.bettingpros.com/nfl/odds/player-props/{player_name_url}/?season={year}&week={week}")
    time.sleep(5)

    # The page loads rows lazily: scrolling to the last row that is already
    # there should trigger loading of any rows after it. Repeat a few times.
    for _ in range(3):
        visible = loaded_rows()
        if visible:
            driver.execute_script(f"window.scrollTo(0, {visible[-1].location['y']});")
        time.sleep(3)

    prop_rows = loaded_rows()
    if not prop_rows:
        print(f"Warning: No rows for {player_name} {year} week {week}")

    data = {
        'Year': year,
        'Week': week,
        'Player Name': player_name,
        'Team': team,
    }

    # row title -> (result key, index into the row's text lines) pairs
    fields_by_title = {
        'Receiving Yards Over/Under': (
            ('Receiving Yards Projection', -4),
            ('Receiving Yards Over', -3),
            ('Receiving Yards Under', -1),
        ),
        'Receptions Over/Under': (
            ('Receptions Projection', -4),
            ('Receptions Over', -3),
            ('Receptions Under', -1),
        ),
        'Anytime Touchdown Scorer': (
            ('Anytime Touchdown Line', -1),
        ),
    }
    for prop_row in prop_rows:
        lines = prop_row.text.split('\n')
        # rows with any other title are ignored
        for key, position in fields_by_title.get(lines[0], ()):
            data[key] = lines[position]

    return data

In [12]:
# tqdm.auto picks the notebook or console progress bar just like
# tqdm.autonotebook does, but without emitting TqdmExperimentalWarning.
from tqdm.auto import tqdm

# one dict of scraped props per (player, week) row of df_te
d = []
# Iterate the rows lazily instead of copying them all into a list first;
# `total` still lets the progress bar show how many rows there are.
for _index, row in tqdm(df_te.iterrows(), total=len(df_te)):
    try:
        d.append(scrape_props(row['Player Name'], row['Player Name URL'], row['Year'], row['Week'], row['Team']))
    except Exception as e:
        # best effort: report the failed player/week and continue with the next row
        print(f"Error with {row['Player Name'], row['Player Name URL'], row['Year'], row['Week'], row['Team']}: {e}")

  from tqdm.autonotebook import tqdm


  0%|          | 0/2015 [00:00<?, ?it/s]



In [13]:
# scraping is finished: shut down the Selenium browser session created earlier
driver.quit()

In [14]:
# preview a few scraped records rather than echoing the entire list into the notebook
d[:3]

[{'Year': 2020,
  'Week': 1,
  'Player Name': 'george kittle',
  'Team': 'SF',
  'Anytime Touchdown Line': 'EVEN',
  'Receiving Yards Projection': 'O 71.5',
  'Receiving Yards Over': '(-110)',
  'Receiving Yards Under': '(-110)',
  'Receptions Projection': 'O 6.5',
  'Receptions Over': '(+102)',
  'Receptions Under': '(-130)'},
 {'Year': 2020,
  'Week': 1,
  'Player Name': 'travis kelce',
  'Team': 'KC',
  'Receptions Projection': 'O 5.5',
  'Receptions Over': '(-142)',
  'Receptions Under': '(+112)',
  'Anytime Touchdown Line': '-120',
  'Receiving Yards Projection': 'O 69.5',
  'Receiving Yards Over': '(-122)',
  'Receiving Yards Under': '(EVEN)'},
 {'Year': 2020,
  'Week': 1,
  'Player Name': 'mark andrews',
  'Team': 'BAL',
  'Receiving Yards Projection': 'O 48.5',
  'Receiving Yards Over': '(-110)',
  'Receiving Yards Under': '(-110)',
  'Receptions Projection': 'O 4.5',
  'Receptions Over': '(+115)',
  'Receptions Under': '(-145)',
  'Anytime Touchdown Line': '+138'},
 {'Year': 2

In [15]:
import pandas as pd

# Turn the list of scraped dicts into a table: one row per record, one column
# per key. A key that a record lacks shows up as NaN in that row.
df = pd.json_normalize(d)

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under
0,2020,1,george kittle,SF,EVEN,O 71.5,(-110),(-110),O 6.5,(+102),(-130)
1,2020,1,travis kelce,KC,-120,O 69.5,(-122),(EVEN),O 5.5,(-142),(+112)
2,2020,1,mark andrews,BAL,+138,O 48.5,(-110),(-110),O 4.5,(+115),(-145)
3,2020,1,zach ertz,PHI,+163,O 54.5,(-110),(-110),O 5.5,(-104),(-145)
4,2020,1,darren waller,LV,+138,O 52.5,(-110),(-110),O 4.5,(-130),(+110)
...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,+500,O 17.5,(-115),(-110),O 2.5,(+116),(-150)
2011,2023,17,josh oliver,MIN,+450,O 10.5,(-115),(-115),,,
2012,2023,17,austin hooper,LV,+450,O 20.5,(-113),(-110),O 2.5,(EVEN),(-135)
2013,2023,17,adam trautman,DEN,+400,O 5.5,(-110),(-110),O 0.5,(-174),(+130)


In [16]:
# Scraped projections look like 'O 71.5': keep just the decimal number and
# store it as a float (text without a decimal number becomes NaN).
projection_pattern = r'(\d+\.\d+)'
for column in ('Receiving Yards Projection', 'Receptions Projection'):
    number_text = df[column].str.extract(projection_pattern, expand=False)
    df[column] = number_text.astype(float)

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under
0,2020,1,george kittle,SF,EVEN,71.5,(-110),(-110),6.5,(+102),(-130)
1,2020,1,travis kelce,KC,-120,69.5,(-122),(EVEN),5.5,(-142),(+112)
2,2020,1,mark andrews,BAL,+138,48.5,(-110),(-110),4.5,(+115),(-145)
3,2020,1,zach ertz,PHI,+163,54.5,(-110),(-110),5.5,(-104),(-145)
4,2020,1,darren waller,LV,+138,52.5,(-110),(-110),4.5,(-130),(+110)
...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,+500,17.5,(-115),(-110),2.5,(+116),(-150)
2011,2023,17,josh oliver,MIN,+450,10.5,(-115),(-115),,,
2012,2023,17,austin hooper,LV,+450,20.5,(-113),(-110),2.5,(EVEN),(-135)
2013,2023,17,adam trautman,DEN,+400,5.5,(-110),(-110),0.5,(-174),(+130)


In [17]:
# Convert the over/under odds text, e.g. '(-110)' or '(EVEN)', to floats:
# EVEN becomes 100, the '--' placeholder becomes NaN, parentheses are stripped.
over_under_columns = ('Receiving Yards Over', 'Receiving Yards Under', 'Receptions Over', 'Receptions Under')
for column in over_under_columns:
    raw_odds = df[column]
    is_even = raw_odds.str.contains('EVEN', regex=False, na=False)
    odds_text = raw_odds.mask(is_even, '100')
    odds_text = odds_text.mask(odds_text == '--', 'nan')
    df[column] = odds_text.str.replace(r'\(|\)', '', regex=True).astype(float)

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under
0,2020,1,george kittle,SF,EVEN,71.5,-110.0,-110.0,6.5,102.0,-130.0
1,2020,1,travis kelce,KC,-120,69.5,-122.0,100.0,5.5,-142.0,112.0
2,2020,1,mark andrews,BAL,+138,48.5,-110.0,-110.0,4.5,115.0,-145.0
3,2020,1,zach ertz,PHI,+163,54.5,-110.0,-110.0,5.5,-104.0,-145.0
4,2020,1,darren waller,LV,+138,52.5,-110.0,-110.0,4.5,-130.0,110.0
...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,+500,17.5,-115.0,-110.0,2.5,116.0,-150.0
2011,2023,17,josh oliver,MIN,+450,10.5,-115.0,-115.0,,,
2012,2023,17,austin hooper,LV,+450,20.5,-113.0,-110.0,2.5,100.0,-135.0
2013,2023,17,adam trautman,DEN,+400,5.5,-110.0,-110.0,0.5,-174.0,130.0


In [18]:
# The anytime-touchdown line has no parentheses to strip: EVEN becomes 100,
# the '--' placeholder becomes NaN, and the rest is parsed as a float.
touchdown_text = df['Anytime Touchdown Line']
touchdown_text = touchdown_text.mask(touchdown_text.str.contains('EVEN', regex=False, na=False), '100')
touchdown_text = touchdown_text.mask(touchdown_text == '--', 'nan')
df['Anytime Touchdown Line'] = touchdown_text.astype(float)

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under
0,2020,1,george kittle,SF,100.0,71.5,-110.0,-110.0,6.5,102.0,-130.0
1,2020,1,travis kelce,KC,-120.0,69.5,-122.0,100.0,5.5,-142.0,112.0
2,2020,1,mark andrews,BAL,138.0,48.5,-110.0,-110.0,4.5,115.0,-145.0
3,2020,1,zach ertz,PHI,163.0,54.5,-110.0,-110.0,5.5,-104.0,-145.0
4,2020,1,darren waller,LV,138.0,52.5,-110.0,-110.0,4.5,-130.0,110.0
...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,500.0,17.5,-115.0,-110.0,2.5,116.0,-150.0
2011,2023,17,josh oliver,MIN,450.0,10.5,-115.0,-115.0,,,
2012,2023,17,austin hooper,LV,450.0,20.5,-113.0,-110.0,2.5,100.0,-135.0
2013,2023,17,adam trautman,DEN,400.0,5.5,-110.0,-110.0,0.5,-174.0,130.0


In [19]:
def american_odds_to_probability(odds):
    """
    Translate an American moneyline into the probability it implies.

    Args:
        odds (int or float): American odds, such as -150 or +200.

    Returns:
        float: Implied probability as a fraction of 1 (-150 gives 0.60).
        A NaN input yields NaN.
    """
    if odds > 0:
        # underdog price: a 100 stake wins `odds`
        return 100 / (odds + 100)

    # favourite price (also reached for 0 and NaN): stake `-odds` to win 100
    stake = -odds
    return stake / (stake + 100)

Adjust projections based on over/under lines

In [20]:
# Shift each posted line by half the difference between the implied "over"
# probability and the implied "under" probability.
for stat in ('Receiving Yards', 'Receptions'):
    posted_line = df[f'{stat} Projection']
    over_probability = df[f'{stat} Over'].apply(american_odds_to_probability)
    under_probability = df[f'{stat} Under'].apply(american_odds_to_probability)
    df[f'Adjusted {stat} Projection'] = posted_line + 0.5 * over_probability - 0.5 * under_probability

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under,Adjusted Receiving Yards Projection,Adjusted Receptions Projection
0,2020,1,george kittle,SF,100.0,71.5,-110.0,-110.0,6.5,102.0,-130.0,71.500000,6.464916
1,2020,1,travis kelce,KC,-120.0,69.5,-122.0,100.0,5.5,-142.0,112.0,69.524775,5.557539
2,2020,1,mark andrews,BAL,138.0,48.5,-110.0,-110.0,4.5,115.0,-145.0,48.500000,4.436640
3,2020,1,zach ertz,PHI,163.0,54.5,-110.0,-110.0,5.5,-104.0,-145.0,54.500000,5.458984
4,2020,1,darren waller,LV,138.0,52.5,-110.0,-110.0,4.5,-130.0,110.0,52.500000,4.544513
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,500.0,17.5,-115.0,-110.0,2.5,116.0,-150.0,17.505537,2.431481
2011,2023,17,josh oliver,MIN,450.0,10.5,-115.0,-115.0,,,,10.500000,
2012,2023,17,austin hooper,LV,450.0,20.5,-113.0,-110.0,2.5,100.0,-135.0,20.503353,2.462766
2013,2023,17,adam trautman,DEN,400.0,5.5,-110.0,-110.0,0.5,-174.0,130.0,5.500000,0.600127


In [21]:
# implied probability of a touchdown, derived from the anytime-touchdown line
touchdown_line = df['Anytime Touchdown Line']
df['Anytime Touchdown Probability'] = touchdown_line.map(american_odds_to_probability)

df

Unnamed: 0,Year,Week,Player Name,Team,Anytime Touchdown Line,Receiving Yards Projection,Receiving Yards Over,Receiving Yards Under,Receptions Projection,Receptions Over,Receptions Under,Adjusted Receiving Yards Projection,Adjusted Receptions Projection,Anytime Touchdown Probability
0,2020,1,george kittle,SF,100.0,71.5,-110.0,-110.0,6.5,102.0,-130.0,71.500000,6.464916,0.500000
1,2020,1,travis kelce,KC,-120.0,69.5,-122.0,100.0,5.5,-142.0,112.0,69.524775,5.557539,0.545455
2,2020,1,mark andrews,BAL,138.0,48.5,-110.0,-110.0,4.5,115.0,-145.0,48.500000,4.436640,0.420168
3,2020,1,zach ertz,PHI,163.0,54.5,-110.0,-110.0,5.5,-104.0,-145.0,54.500000,5.458984,0.380228
4,2020,1,darren waller,LV,138.0,52.5,-110.0,-110.0,4.5,-130.0,110.0,52.500000,4.544513,0.420168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010,2023,17,mike gesicki,NE,500.0,17.5,-115.0,-110.0,2.5,116.0,-150.0,17.505537,2.431481,0.166667
2011,2023,17,josh oliver,MIN,450.0,10.5,-115.0,-115.0,,,,10.500000,,0.181818
2012,2023,17,austin hooper,LV,450.0,20.5,-113.0,-110.0,2.5,100.0,-135.0,20.503353,2.462766,0.181818
2013,2023,17,adam trautman,DEN,400.0,5.5,-110.0,-110.0,0.5,-174.0,130.0,5.500000,0.600127,0.200000


In [22]:
# Save the cleaned props table. Forward slashes keep the relative output path
# portable: with backslashes the string is a directory path only on Windows
# (on other systems it would be treated as one unusual file name).
df.to_parquet('../../data/betting_lines/te1_props.parquet')

print('Done')

Done
