In [2]:
import numpy as np
import pandas as pd
import requests
import json
import bs4
from datetime import date
from bs4 import BeautifulSoup as bs

# Web Scraping From ESPN.com

The data we need (MLB free-agent listings, one page per year) is published on the ESPN website. Since ESPN offers no way to export it, we will scrape the pages and compile the results into a JSON file (`players.json`).

In [14]:
# Range of years to scrape, both ends inclusive (set them equal to scrape a single year).
min_year, max_year = 2006, 2006

In [15]:
def save_to_json(name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars,
                 path='players.json'):
    """Append one player record to a JSON file unless it is already stored.

    The file holds a JSON list of player dictionaries. A record counts as a
    duplicate when an existing entry has the same ``url`` AND the same
    ``year``; duplicates are reported and the file is left untouched.

    Parameters
    ----------
    name, year, url, pos, age, status, prev_team, new_team, years_signed, dollars
        Field values stored under dictionary keys of the same names.
    path : str, optional
        JSON file to read and update. Defaults to ``'players.json'`` in the
        current working directory.

    Returns
    -------
    None
        The outcome is reported with a printed "Saved: ..." or
        "Already Exists: ..." message.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON.
    """
    # Read the current contents and close the handle before any write, so the
    # file is never open for reading and writing at the same time.
    try:
        with open(path) as in_file:
            data = json.load(in_file)
    except FileNotFoundError:
        # First run: there is nothing stored yet, so start from an empty list.
        data = []

    if any(player['url'] == url and player['year'] == year for player in data):
        print("Already Exists: {} ".format(name))
        return

    data.append({
        'name': name,
        'year': year,
        'url': url,
        'pos': pos,
        'age': age,
        'status': status,
        'prev_team': prev_team,
        'new_team': new_team,
        'years_signed': years_signed,
        'dollars': dollars,
    })

    # The whole list is rewritten on every save.
    with open(path, 'w') as out_file:
        json.dump(data, out_file, indent=4)

    print("Saved: {}".format(name))

In [13]:
# Hoisted out of the loops: the current calendar year does not change per row.
current_year = date.today().year

for year in range(min_year, max_year + 1):
    print("PARSING YEAR", year)

    # Send a request to the ESPN page containing the data we need.
    # The timeout keeps a stalled connection from hanging the notebook forever.
    try:
        response = requests.get('http://www.espn.com/mlb/freeagents/_/year/' + str(year), timeout=30)
    except requests.RequestException as error:
        print("Request failed:", error)
        continue

    print("Connection Status Code:", response.status_code)
    if not response.ok:
        # An error response will not contain the table we want; skip this year.
        continue

    # Parse the HTML to find the table rows
    parser = bs(response.content, 'html.parser')
    # NOTE(review): only the first 5 matching rows per year are processed.
    # This looks like a debugging limit -- remove the slice to scrape every row.
    table_rows = parser.find_all('tr', class_=['oddrow', 'evenrow'])[:5]

    # Save the data in a JSON file
    for row in table_rows:
        cells = row.select('td')

        # Cell indices 0 through 8 are read below; skip rows that are too short
        # instead of failing with an IndexError.
        if len(cells) < 9:
            continue

        # some players have no urls, so we will just skip them
        links = cells[0].select('a')
        url = links[0].get('href') if links else None
        if not url:
            continue

        # if a player has no age listed, we will also skip them
        # (presumably the listed age is the player's current age, so the years
        # elapsed since `year` are subtracted -- TODO confirm against the site)
        try:
            age = int(cells[2].text) - (current_year - year)
        except ValueError:
            continue

        save_to_json(
            cells[0].text,   # name
            year,
            url,
            cells[1].text,   # pos
            age,
            cells[3].text,   # status
            cells[4].text,   # prev_team
            cells[5].text,   # new_team
            cells[6].text,   # years_signed
            cells[8].text,   # dollars
        )

PARSING YEAR 2006
Connection Status Code: 200
Already Exists: Matt Albers 
Already Exists: Sandy Alomar Jr. 
Already Exists: Moises Alou 
Already Exists: Rick Ankiel 
Already Exists: Tony Armas 
