Challenge – Multi-page Tables Scrape

Please scrape all free agent signing from 2020-2024.

https://www.baseball-reference.com/leagues/majors/2020-free-agents.shtml 

In [35]:
import pandas as pd

# randomize time to scape each line- from the random library, import random and uniform
from random import randint, uniform 

import time

In [19]:
## headers - this tells the server what kind of browser is coming to it
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "           
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/124.0.0.0 Safari/537.36"}

In [27]:
# first try scraping the first page
url = "https://www.baseball-reference.com/leagues/majors/2020-free-agents.shtml"

response = pd.read_html(url)

In [29]:
response

[      Rk                   Name        Date               To Team From Team  \
 0      1              Ben Gamel  2021-05-09    Pittsburgh Pirates       MIL   
 1      2          Brian Goodwin  2021-05-05     Chicago White Sox       CIN   
 2      3  Christian Bethancourt  2021-05-01    Pittsburgh Pirates       PHI   
 3      4        Aaron Wilkerson  2021-05-01   Los Angeles Dodgers       MIL   
 4      5            Tim Adleman  2021-04-30       Cincinnati Reds       DET   
 ..   ...                    ...         ...                   ...       ...   
 375  376        Abraham Almonte  2020-10-30        Atlanta Braves       SDP   
 376  377       Kendall Graveman  2020-10-29      Seattle Mariners       SEA   
 377  378       Stevie Wilkerson  2020-10-29     Baltimore Orioles       BAL   
 378  379           Justin Smoak  2020-09-09  San Francisco Giants       MIL   
 379  380               AJ Ramos  2020-09-05      Colorado Rockies       CHC   
 
      Age  WAR3  Yrs       G      AB  

In [49]:
# now try scraping all the pages that have 2020-2024 
base_url = "https://www.baseball-reference.com/leagues/majors/{year}-free-agents.shtml"

df_list = [] #to help concat all the tables from the diff pages
broken_links = [] #to keep track of diff broken pages etc

for i, year in enumerate(range (2020,2025), start = 1):
    url = base_url.format(year=year)
    print(f"scraping page {i}, url: {url}")
    try:
        df = pd.read_html(url)[0]
        df["source_url"] = url
        df_list.append(df)

    except Exception as e:
        print(f"oh no...encountered an issue: {e} at {url}")
        broken_links.append(url)

    finally:
        snoozer = uniform(5,20)
        print(f"snoozing for {snoozer} seconds before next scrape")
        time.sleep(snoozer)

print("done scraping all urls")

scraping page 1, url: https://www.baseball-reference.com/leagues/majors/2020-free-agents.shtml
snoozing for 10.674089324486795 seconds before next scrape
scraping page 2, url: https://www.baseball-reference.com/leagues/majors/2021-free-agents.shtml
snoozing for 15.340083974488024 seconds before next scrape
scraping page 3, url: https://www.baseball-reference.com/leagues/majors/2022-free-agents.shtml
snoozing for 7.607781136101068 seconds before next scrape
scraping page 4, url: https://www.baseball-reference.com/leagues/majors/2023-free-agents.shtml
snoozing for 15.9317236311016 seconds before next scrape
scraping page 5, url: https://www.baseball-reference.com/leagues/majors/2024-free-agents.shtml
snoozing for 17.2234619352938 seconds before next scrape
done scraping all urls


In [53]:
# concat the list into a single dataframe
df = pd.concat (df_list, ignore_index= True)
df

Unnamed: 0,Rk,Name,Date,To Team,From Team,Age,WAR3,Yrs,G,AB,...,WHIP,G.1,GS,SV,IP,H.1,HR.1,BB.1,SO,source_url
0,1,Ben Gamel,2021-05-09,Pittsburgh Pirates,MIL,29,0.8,5,442.0,1239.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
1,2,Brian Goodwin,2021-05-05,Chicago White Sox,CIN,30,1.9,5,357.0,1009.0,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
2,3,Christian Bethancourt,2021-05-01,Pittsburgh Pirates,PHI,29,,5,161.0,469.0,...,3.375,6.0,0.0,0.0,5.1,7.0,1.0,11.0,3.0,https://www.baseball-reference.com/leagues/maj...
3,4,Aaron Wilkerson,2021-05-01,Los Angeles Dodgers,MIL,32,-0.5,3,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
4,5,Tim Adleman,2021-04-30,Cincinnati Reds,DET,33,,4,,,...,,,,,,,,,,https://www.baseball-reference.com/leagues/maj...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022,396,Chris Devenski,2024-10-31,New York Mets,SEA,34,-1.0,9,25.0,1.0,...,1.135,303.0,7.0,8.0,400.0,343.0,58.0,111.0,415.0,https://www.baseball-reference.com/leagues/maj...
2023,397,Geoff Hartlieb,2024-10-18,New York Yankees,COL,31,-0.3,5,34.0,0.0,...,1.866,64.0,0.0,0.0,79.1,93.0,11.0,55.0,76.0,https://www.baseball-reference.com/leagues/maj...
2024,398,Yohan Ramírez,2024-10-13,Pittsburgh Pirates,BOS,30,-0.9,5,4.0,0.0,...,1.379,140.0,0.0,7.0,169.0,147.0,20.0,86.0,173.0,https://www.baseball-reference.com/leagues/maj...
2025,399,Zach Logue,2024-09-12,Los Angeles Dodgers,ATL,29,-1.5,3,0.0,0.0,...,1.557,19.0,10.0,0.0,70.0,87.0,17.0,22.0,56.0,https://www.baseball-reference.com/leagues/maj...
