# Using Scrapy to Scrape Historical Baseball Statistics

In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess

class BattingAvgSpider(scrapy.Spider):
    """Spider that turns each table row of the start page into one record.

    Every ``<tr>`` found in a response is emitted as a dictionary whose
    values are the serialized cell elements (markup included, since the
    XPath expressions select elements rather than ``text()`` nodes).
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "batting_avg"

    # URL(s) to start with.
    start_urls = [
        'http://www.espn.com/mlb/history/leaders',
    ]

    def parse(self, response):
        """Yield one dictionary per table row, then follow the extra link.

        Args:
            response: The Scrapy response for the page being parsed.

        Yields:
            dict: Keys ``rank`` and ``name`` hold the first matching cell
            (or ``None`` when the row has no such cell); the remaining keys
            hold the list of matching cells for that row.
            scrapy.Request: A request for the follow-up page, parsed with
            this same method, when the follow-up XPath matches.
        """
        # Iterate over every table row (<tr>) on the page.
        for player_entry in response.xpath('//tr'):

            # The cell lookups must be relative to the current row
            # ('./td[N]'). An expression starting with '//' searches the
            # whole document even when called on a row selector, which
            # would make every record repeat the same page-wide values.
            yield {
                'rank': player_entry.xpath('./td[1]').extract_first(),
                'name': player_entry.xpath('./td[2]/a').extract_first(),
                'years': player_entry.xpath('./td[3]').extract(),
                'games': player_entry.xpath('./td[4]').extract(),
                'at_bats': player_entry.xpath('./td[5]').extract(),
                'hits': player_entry.xpath('./td[7]').extract(),
                'rbi': player_entry.xpath('./td[11]').extract()
            }

        # Look up the value used as the address of the next page to crawl.
        # NOTE(review): this XPath selects a text() node rather than an
        # @href attribute, so the value joined below is link *text*, not a
        # URL -- confirm against the live page markup.
        single_season = response.xpath('//*[@id="content"]/div[2]/div/div/div[1]/text()[3]').extract_first()

        # Recursively call the spider to run on the next page, if it exists.
        if single_season is not None:
            # Resolve the value against the current page's URL.
            single_season = response.urljoin(single_season)
            # Request the next page and recursively parse it the same way we did above
            yield scrapy.Request(single_season, callback=self.parse)

# Crawler configuration. Besides the feed output options, the remaining
# entries cover scraping etiquette: honouring robots.txt, identifying the
# crawler, throttling requests and caching responses.
_crawler_settings = {
    'FEED_FORMAT': 'json',                 # Write scraped items as JSON.
    'FEED_URI': 'batting_avg.json',        # File the items are written to.
    'LOG_ENABLED': False,                  # Keep Scrapy's log output off.
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
}
process = CrawlerProcess(_crawler_settings)

# Register our spider with the process and run the crawl.
process.crawl(BattingAvgSpider)
process.start()

# Reached once process.start() has returned.
print('Success!')

Success!


In [5]:
import pandas as pd
# Load the JSON feed written by the crawler into a DataFrame; the file is
# read as a list of records (orient='records').
batting_avg_df = pd.read_json(
    'batting_avg.json',
    orient='records',
)