In [319]:
import requests
import json
import bs4
from datetime import date
from bs4 import BeautifulSoup as bs

### NOTES

The scraper does not account for trade years; it only counts stats from the last instance of a year.

# Web Scraping From ESPN.com

The data we need is located on the ESPN website, but since there is no way to export this data, we will use web scraping to compile it into a JSON file.

In [357]:
# Inclusive range of free-agent tracker years to scrape (first year, last year).
min_year, max_year = 2017, 2019

This function takes a player's scraped data and appends it to a JSON file containing all data. For statistical data, we create one file for batters and one for pitchers.

In [343]:
def save_to_json(name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars, stat_names, stats):
    """Append one player's contract info and yearly stats to the matching JSON file.

    Players whose ``pos`` is 'SP', 'RP' or 'P' are stored in
    'pitchers_stats.json'; everyone else is stored in 'batters_stats.json'.
    The file holds a JSON list of player dicts. If it does not exist yet it is
    created, so the first run does not need a hand-made seed file.

    A player is only added when no existing entry has the same ``url`` and
    ``year``; otherwise the file is left untouched.

    Args:
        name, year, url, pos, age, status, prev_team, new_team, years_signed,
            dollars: stored as-is under keys of the same name.
        stat_names: list of stat labels used as keys of each yearly dict.
        stats: list of per-year value lists; ``stats[0]`` is stored under
            'stats_1yr_ago', ``stats[1]`` under 'stats_2yr_ago', and so on.
            Each inner list must have at least ``len(stat_names)`` items.

    Returns:
        None. Prints "Saved: <name>" or "Already Exists: <name> ".
    """
    # pick the correct JSON file based on pitcher/batter position
    if pos in ('SP', 'RP', 'P'):
        file_path = 'pitchers_stats.json'
    else:
        file_path = 'batters_stats.json'

    # a missing file simply means nothing has been saved yet
    try:
        with open(file_path) as in_file:
            data = json.load(in_file)
    except FileNotFoundError:
        data = []

    # check to make sure the player/year combo is not already added
    if any(player['url'] == url and player['year'] == year for player in data):
        print("Already Exists: {} ".format(name))
        return

    # general player and contract info
    new_player = {
        'name': name,
        'year': year,
        'url': url,
        'pos': pos,
        'age': age,
        'status': status,
        'prev_team': prev_team,
        'new_team': new_team,
        'years_signed': years_signed,
        'dollars': dollars,
    }

    # one dict of {stat name: value} per year of stats; the loop variable is
    # deliberately not called `year` so the parameter is never shadowed
    for years_back, yearly_values in enumerate(stats, start=1):
        new_player['stats_' + str(years_back) + 'yr_ago'] = {
            stat_name: yearly_values[i] for i, stat_name in enumerate(stat_names)
        }

    data.append(new_player)

    # write the updated list back to the JSON file
    with open(file_path, 'w') as out_file:
        json.dump(data, out_file, indent=4)

    print("Saved: {}". format(name))

This function is called to find a list of stats for a given player. Inputs are the player's ESPN URL, a list of years to find stats from, and a list of stats to look for.

In [344]:
# return format will be a list of yearly data lists (one per requested year); every value is a string
# ex. years = [2015,2016], stat_names = ['AB', 'WAR']
# stats = [['480','3.1'],['355','2.5']]
def find_stats(url, years, stat_names):
    """Scrape the requested stats for one player from their stats page.

    Args:
        url: player URL containing 'player/'; 'stats/' is inserted right after
            that segment to build the stats-page URL.
        years: years to look up. Each is compared (as a string) with the first
            cell of every row in the 'Table--fixed-left' table.
        stat_names: column header texts to extract from the scrolling table.

    Returns:
        A list with one inner list per entry of ``years`` (same order). Every
        inner list has exactly ``len(stat_names)`` strings, in the order of
        ``stat_names``. A value is the string 'None' when the player has no
        row for that year or when the column cannot be found. When a year has
        several rows (several teams) the values are summed into one row.

    Raises:
        ValueError: if ``url`` does not contain 'player/'.
        requests.HTTPError: if the stats page answers with an error status.
    """
    # send a request to the stats page for the player
    prefix, separator, suffix = url.partition('player/')
    if not separator:
        raise ValueError("Expected 'player/' in player URL: {}".format(url))
    stats_url = prefix + 'player/stats/' + suffix
    # a timeout keeps one stalled connection from hanging the whole scrape
    response = requests.get(stats_url, timeout=30)
    response.raise_for_status()

    # parse the HTML to find the relevant table of statistics
    parser = bs(response.content, 'html.parser')
    table = parser.find('section', class_='ResponsiveTable')

    # find the row id(s) in the table corresponding to the relevant year(s)
    table_left = table.find('table', class_='Table--fixed-left').find('tbody')
    labelled_rows = [(row, row.find('td')) for row in table_left.findAll('tr')]
    row_nums = []
    for year in years:
        wanted = str(year)
        row_nums.append([
            row['data-idx']
            for row, cell in labelled_rows
            if cell is not None and cell.text == wanted
        ])

    # find the column number in the table for each statistic; None marks a
    # stat with no matching header so later values stay aligned with stat_names
    table_body = table.find('div', class_='Table__Scroller')
    header_texts = [header.text for header in table_body.findAll('th')]
    col_nums = [
        header_texts.index(stat_name) if stat_name in header_texts else None
        for stat_name in stat_names
    ]

    table_data = table_body.find('tbody', class_='Table__TBODY')

    def read_row(data_idx):
        # cell texts of one data row, one per requested stat ('None' if absent)
        row = table_data.find('tr', class_='Table__TR', attrs={'data-idx': data_idx})
        cells = row.findAll('td')
        return [
            cells[col_num].text if col_num is not None and col_num < len(cells) else 'None'
            for col_num in col_nums
        ]

    stats = []
    for row_list in row_nums:
        if not row_list:
            # the player did not play in that year
            yearly_stats = ['None'] * len(col_nums)
        elif len(row_list) == 1:
            # the player had exactly one team in that year
            yearly_stats = read_row(row_list[0])
        else:
            # several teams in one year: combine all rows of data into one row
            per_team = [read_row(data_idx) for data_idx in row_list]
            yearly_stats = [
                _combine_team_values(stat_name, [team[i] for team in per_team])
                for i, stat_name in enumerate(stat_names)
            ]
        stats.append(yearly_stats)

    return stats


def _combine_team_values(stat_name, values):
    """Sum one stat's per-team cell texts for a multi-team year; returns a string."""
    from decimal import Decimal  # local import keeps this cell self-contained

    # a placeholder means the stat was not available for at least one row
    if 'None' in values:
        return 'None'

    if stat_name == 'IP':
        # Innings pitched use baseball notation where the digit after the point
        # counts outs (thirds of an inning), so 100.2 + 50.2 is 151.1, not 150.4.
        # NOTE(review): confirm ESPN's IP column follows this convention.
        outs = 0
        for value in values:
            whole, _, thirds = value.partition('.')
            outs += int(whole or 0) * 3 + int(thirds or 0)
        return '{}.{}'.format(outs // 3, outs % 3)

    if stat_name == 'WAR':
        # exact decimal addition avoids artifacts such as 0.30000000000000004
        return str(sum((Decimal(value) for value in values), Decimal('0.0')))

    # every other stat is an integer counting stat
    return str(sum(int(value) for value in values))

Here we run the actual web scraper, using data from ESPN's MLB Free Agent Tracker and sending requests to their player statistics pages for each player listed on the tracker.

In [358]:
# Values that never change between rows are defined once, outside the loops.
PITCHER_POSITIONS = ('SP', 'RP', 'P')
PITCHER_STAT_NAMES = ['GP','GS','W','L','IP','K','BB','H','R','ER','SV','HLD','BLSV','WAR']
BATTER_STAT_NAMES = ['GP','AB','R','H','2B','3B','HR','RBI','BB','HBP','SO','SB','CS','WAR']
# a few players whose ESPN profiles are bugged / have no data
SKIPPED_PLAYERS = ('José Molina','Brian Anderson','Hiroyuki Nakajima','Yaisel Sierra','Thomas Milone')
# only the first 140 table rows of each year's tracker page are processed
MAX_ROWS_PER_YEAR = 140
# the row reads below go up to cells[8]
MIN_CELLS_PER_ROW = 9

current_year = date.today().year

for year in range(min_year, max_year + 1):
    print("PARSING YEAR", year)
    # Send a request to the ESPN page containing the data we need
    # (the timeout keeps a stalled connection from hanging the notebook)
    response = requests.get('http://www.espn.com/mlb/freeagents/_/year/' + str(year), timeout=30)
    content = response.content
    print("Connection Status Code:", response.status_code)

    # Parse the HTML to find the table rows
    parser = bs(content, 'html.parser')
    table_rows = parser.find_all('tr', class_=['oddrow', 'evenrow'])

    # Save the data in a JSON file
    for row in table_rows[:MAX_ROWS_PER_YEAR]:
        cells = row.select('td')

        # rows without all the expected columns cannot be parsed, so skip them
        if len(cells) < MIN_CELLS_PER_ROW:
            continue

        name = cells[0].text
        pos = cells[1].text
        status = cells[3].text
        prev_team = cells[4].text
        new_team = cells[5].text
        years_signed = cells[6].text
        dollars = cells[8].text

        # some players have no urls, so we will just skip them
        links = cells[0].select('a')
        if not links:
            continue
        url = links[0]['href']

        # if a player has no age listed, we will also skip them;
        # the listed age is shifted back by the years elapsed since `year`
        try:
            age = int(cells[2].text) - (current_year - year)
        except ValueError:
            continue

        # also skip players who did not sign a contract during their FA period
        if years_signed == '' or new_team == '--' or dollars in ('--', 'Minor Lg'):
            continue

        if name in SKIPPED_PLAYERS:
            continue

        # given a list of years and stats to find, retrieve the relevant data for that player
        years = [year, year-1, year-2]
        if pos in PITCHER_POSITIONS:
            stat_names = PITCHER_STAT_NAMES
        else:
            stat_names = BATTER_STAT_NAMES

        stats = find_stats(url, years, stat_names)
        save_to_json(name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars, stat_names, stats)

print('Finished')

PARSING YEAR 2017
Connection Status Code: 200
Saved: Matt Adams
Saved: Matt Albers
Saved: Yonder Alonso
Saved: Jake Arrieta
Saved: Alex Avila
Saved: Erick Aybar
Saved: Tony Barnette
Saved: Joaquin Benoit
Saved: Peter Bourjos
Saved: Jay Bruce
Saved: Trevor Cahill
Saved: Lorenzo Cain
Saved: Andrew Cashner
Saved: Welington Castillo
Saved: Jhoulys Chacin
Saved: Tyler Chatwood
Saved: Jesse Chavez
Saved: Steve Cishek
Saved: Alex Cobb
Saved: Zack Cozart
Saved: Yu Darvish
Saved: Wade Davis
Saved: Lucas Duda
Saved: Brian Duensing
Saved: Zach Duke
Saved: Jarrod Dyson
Saved: Alcides Escobar
Saved: Mike Fiers
Saved: Doug Fister
Saved: Todd Frazier
Saved: Yovani Gallardo
Saved: Jaime Garcia
Saved: Carlos Gomez
Saved: Carlos Gonzalez
Saved: Miguel Gonzalez
Saved: Curtis Granderson
Saved: Luke Gregerson
Saved: David Hernandez
Saved: Eric Hosmer
Saved: Jared Hughes
Saved: Nick Hundley
Saved: Tommy Hunter
Saved: Chris Iannetta
Saved: Austin Jackson
Saved: Howie Kendrick
Saved: Brandon Kintzler
Saved: T