In [16]:
import requests
import json
import bs4
from datetime import date
from bs4 import BeautifulSoup as bs

# Web Scraping From ESPN.com

The data we need is published on the ESPN website. Since the site offers no way to export it, we will scrape the MLB free-agent pages and compile the results into a JSON file (`players.json`).

In [27]:
# Parameters of which pages to scrape: the first and last year to request.
# Both bounds are inclusive (the scraping loop uses range(min_year, max_year + 1)).
min_year = 2006
max_year = 2019

In [28]:
def save_to_json(name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars,
                 path='players.json'):
    """Append one player record to a JSON file unless it is already stored.

    The file holds a JSON list of dicts. A record is considered a duplicate
    when an existing entry has the same ``url`` and the same ``year``; in that
    case the file is left untouched.

    Args:
        name, year, url, pos, age, status, prev_team, new_team, years_signed,
        dollars: Values stored in the new record under keys of the same name.
        path: JSON file to read and update. Defaults to ``'players.json'`` in
            the current working directory. If the file does not exist yet it
            is created, starting from an empty list.

    Returns:
        None. Prints ``Saved: <name>`` or ``Already Exists: <name> ``.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON (it is not overwritten in that case).
    """
    # A missing file just means nothing has been saved yet.
    try:
        with open(path) as in_file:
            data = json.load(in_file)
    except FileNotFoundError:
        data = []

    # The same player (url) can appear in several years, so both must match.
    if any(player['url'] == url and player['year'] == year for player in data):
        print("Already Exists: {} ".format(name))
        return

    data.append({
        'name': name,
        'year': year,
        'url': url,
        'pos': pos,
        'age': age,
        'status': status,
        'prev_team': prev_team,
        'new_team': new_team,
        'years_signed': years_signed,
        'dollars': dollars,
    })

    # The read handle is already closed here, so the file is only open once.
    with open(path, 'w') as out_file:
        json.dump(data, out_file, indent=4)

    print("Saved: {}".format(name))

In [29]:
# Scrape one ESPN free-agent page per year in [min_year, max_year] and store
# every player who signed a major-league contract via save_to_json.

# Looked up once so every row in this run uses the same reference year.
current_year = date.today().year

for year in range(min_year, max_year + 1):
    print("PARSING YEAR", year)
    # Send a request to the ESPN page containing the data we need.
    # The timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(
        'http://www.espn.com/mlb/freeagents/_/year/' + str(year),
        timeout=30,
    )
    content = response.content
    print("Connection Status Code:", response.status_code)
    if not response.ok:
        # An error response has no free-agent table to parse; try the next year.
        continue

    # Parse the HTML to find the table rows
    parser = bs(content, 'html.parser')
    table_rows = parser.find_all('tr', class_=['oddrow', 'evenrow'])

    # Save the data in a JSON file
    for row in table_rows:
        cells = row.select('td')
        # Cells up to index 8 are read below; skip rows that are too short
        # instead of failing on them with an IndexError.
        if len(cells) < 9:
            continue

        name = cells[0].text
        pos = cells[1].text
        status = cells[3].text
        prev_team = cells[4].text
        new_team = cells[5].text
        years_signed = cells[6].text
        dollars = cells[8].text

        # some players have no urls, so we will just skip them
        links = cells[0].select('a')
        if not links:
            continue
        url = links[0]['href']

        # if a player has no age listed, we will also skip them
        # NOTE(review): subtracting the years elapsed since `year` assumes the
        # page lists the player's age as of today -- confirm against the site.
        try:
            age = int(cells[2].text) - (current_year - year)
        except ValueError:
            continue

        # also skip players who did not sign a contract during their FA period
        if years_signed == '' or new_team == '--' or dollars in ('--', 'Minor Lg'):
            continue

        save_to_json(name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars)

PARSING YEAR 2006
Connection Status Code: 200
Already Exists: Moises Alou 
Already Exists: Tony Armas 
Already Exists: Rich Aurilia 
Already Exists: Danys Baez 
Already Exists: Paul Bako 
Already Exists: Rod Barajas 
Already Exists: Miguel Batista 
Already Exists: Gary Bennett 
Already Exists: Henry Blanco 
Already Exists: Geoff Blum 
Already Exists: Barry Bonds 
Already Exists: Aaron Boone 
Already Exists: Joe Borowski 
Already Exists: Chad Bradford 
Already Exists: Doug Brocail 
Already Exists: Miguel Cairo 
Already Exists: Jamey Carroll 
Already Exists: Sean Casey 
Already Exists: Frank Catalanotto 
Already Exists: Jeff Cirillo 
Already Exists: Royce Clayton 
Already Exists: Alex Cora 
Already Exists: Craig Counsell 
Already Exists: Jose Cruz Jr. 
Already Exists: Dave Dellucci 
Already Exists: Mark DeRosa 
Already Exists: Octavio Dotel 
Already Exists: J.D. Drew 
Already Exists: Ray Durham 
Already Exists: Damion Easley 
Already Exists: Adam Eaton 
Already Exists: Jim Edmonds 
Alrea

Already Exists: Damaso Marte 
Already Exists: Jason Michaels 
Already Exists: Aaron Miles 
Already Exists: Trever Miller 
Already Exists: Guillermo Mota 
Already Exists: Jamie Moyer 
Already Exists: Joe Nelson 
Already Exists: Greg Norton 
Already Exists: Darren Oliver 
Already Exists: Chan Ho Park 
Already Exists: Carl Pavano 
Already Exists: Brad Penny 
Already Exists: Oliver Perez 
Already Exists: Andy Pettitte 
Already Exists: Scott Proctor 
Already Exists: Nick Punto 
Already Exists: Horacio Ramirez 
Already Exists: Manny Ramirez 
Already Exists: Tim Redding 
Already Exists: Edgar Renteria 
Already Exists: Dennys Reyes 
Already Exists: Arthur Rhodes 
Already Exists: Juan Rivera 
Already Exists: Francisco Rodriguez 
Already Exists: Iván Rodríguez 
Already Exists: David Ross 
Already Exists: CC Sabathia 
Already Exists: Takashi Saito 
Already Exists: Brian Shouse 
Already Exists: John Smoltz 
Already Exists: Russ Springer 
Already Exists: Willy Taveras 
Already Exists: Mark Teixeira

Saved: Ryan Theriot
Saved: Jim Thome
Saved: Matt Treanor
Saved: Tsuyoshi Wada
Saved: Chien-Ming Wang
Saved: Eli Whiteside
Saved: Josh Willingham
Saved: Dontrelle Willis
Saved: C.J. Wilson
Saved: Jack Wilson
Saved: Kerry Wood
Saved: Joel Zumaya
PARSING YEAR 2012
Connection Status Code: 200
Saved: Mike Adams
Saved: Jeremy Affeldt
Saved: Scott Baker
Saved: Lance Berkman
Saved: Joe Blanton
Saved: Michael Bourn
Saved: Jonathan Broxton
Saved: Sean Burnett
Saved: Melky Cabrera
Saved: Eric Chavez
Saved: Randy Choate
Saved: Bartolo Colon
Saved: Kevin Correia
Saved: Ryan Dempster
Saved: Stephen Drew
Saved: Scott Feldman
Saved: Jeff Francis
Saved: Jason Frasor
Saved: Kyuji Fujikawa
Saved: Jonny Gomes
Saved: Mike Gonzalez
Saved: Tom Gorzelanny
Saved: Zack Greinke
Saved: Jason Grilli
Saved: Jeremy Guthrie
Saved: Josh Hamilton
Saved: Jack Hannahan
Saved: Dan Haren
Saved: Roberto Hernandez
Saved: Eric Hinske
Saved: J.P. Howell
Saved: Torii Hunter
Saved: Raul Ibanez
Saved: Hisashi Iwakuma
Saved: Maice

Saved: Blake Treinen
Saved: Stephen Vogt
Saved: Michael Wacha
Saved: Adam Wainwright
Saved: Taijuan Walker
Saved: Zack Wheeler
Saved: Matt Wieters
Saved: Alex Wood
