In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [6]:
#to connect to the website, we pass the URL string into requests.get()
#timeout: requests waits indefinitely by default, so a stalled connection
#would hang this cell forever
page = requests.get('http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2018/start/1', timeout=30)
#stop on a 4xx/5xx response instead of treating an error page as the data
page.raise_for_status()

In [7]:
#we can look at the source code behind the website using .text
#only preview the first 1000 characters: the full page source is very long
#and would flood the notebook output
page.text[:1000]

'\n<!DOCTYPE html>\n<html xmlns:fb="https://www.facebook.com/2008/fbml">\n<head>\n\n<script>\n(function redirectToHttpIfHttps() {\n   var win      = typeof window !== \'undefined\' && window,\n       location = win && win.location,\n       protocol = location && location.protocol;\n\n   if (protocol === \'https:\' && !true) {\n        location.href = location.href.replace(\'https://\', \'http://\');\n   }\n})();\n</script><meta charset="iso-8859-1">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<link rel="icon" sizes="any" mask href="https://a.espncdn.com/favicon.ico">\n<meta name="theme-color" content="#CC0000">\n<script type="text/javascript">\n    if(true && navigator && navigator.userAgent.toLowerCase().indexOf("teamstream") >= 0) {\n        window.location = \'http://m.espn.com/mobilecache/general/apps/sc\';\n    }\n</script><title>MLB Baseball Career Batting Leaders - Major League Baseball - ESPN</title>\n<meta name="google-site-verification" content="xuj1ODRlu

We tell BeautifulSoup to parse the source code (page.text) as HTML
by passing 'html.parser' as the second argument, and we save the
result to a variable, soup.

In [10]:
#pull in website's Source Code
url = 'http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2018/start/1'

#timeout: requests waits indefinitely by default, so a stalled connection
#would hang this cell forever
page = requests.get(url, timeout=30)
#stop on a 4xx/5xx response instead of parsing an error page as the table
page.raise_for_status()
#parse the downloaded HTML so it can be searched by tag and attribute
soup = BeautifulSoup(page.text, 'html.parser')

After inspecting the website, we found that the tag for the first player's row is 'tr' and that its attribute is 'class': 'oddrow player-10-33039'.

We use find_all() to extract the information in the first player's row.
- The first argument of find_all() is the tag of the element we want to scrape.
- The second argument of find_all() is the attrs argument.
    - The attrs argument is constructed like so:
    - attrs = {'attribute1_name': 'attribute1_value', 'attribute2_name': 'attribute2_value'}
- find_all() always returns a list.

In [11]:
#look up the first player's row by tag name and class attribute;
#find_all() hands back a list of every matching element
soup.find_all(
    name='tr',
    attrs={'class': 'oddrow player-10-33039'},
)

[<tr align="right" class="oddrow player-10-33039"><td align="left">1</td><td align="left"><a href="https://www.espn.com/mlb/player/_/id/33039/mookie-betts"><span class="bi">Mookie Betts</span></a></td><td align="left">7</td><td>136</td><td>520</td><td>129</td><td>180</td><td>47</td><td>5</td><td>32</td><td>80</td><td>81</td><td>91</td><td>30</td><td>6</td><td class="sortcell">.346</td></tr>]

If we just want to keep the text (for example the player's batting average, which is in the last 'td' tag) and remove all HTML notation, we can select just the text with BeautifulSoup's .get_text() method.

In [12]:
# print the plain text of every <td> cell in the first player's row
row = soup.find(name='tr', attrs={'class': 'oddrow player-10-33039'})
cell_texts = [cell.get_text() for cell in row.find_all('td')]
for text in cell_texts:
    print(text)

1
Mookie Betts
7
136
520
129
180
47
5
32
80
81
91
30
6
.346


### Create a final DataFrame using Pandas with the batting stats for all 331 players.

We want the column names to be the same column names used in the table found on ESPN, so we need to scrape the text from the table header.

After inspecting the website's source code, we find that the start tag for the table header is `<tr class="colhead" align="right">`.
Since we only want to scrape a single row, we use the .find() method.

In [13]:
#Build the column names from the table's header row
##Locate the header row (the <tr> whose class is 'colhead')
header = soup.find(name='tr', attrs={'class': 'colhead'})
##Take the text of every <td> cell in the header row as a column name
columns = []
for header_cell in header.find_all('td'):
    columns.append(header_cell.get_text())

#Start from an empty DataFrame that carries only those column names
final_df = pd.DataFrame(columns=columns)
final_df

Unnamed: 0,Unnamed: 1,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA


#### Scrape each baseball player's stats

Looking at the source code of each player's row, we find that all players' classes look like 'class="oddrow player-10-..."' or 'class="evenrow player-10-..."'.

Knowing this, we can scrape all player elements by using the compile function of the regular expression (re) module to pick up every element whose class value contains the string 'row player-10-'.

In [14]:
#collect every <tr> on the page whose class value matches the pattern
#'row player-10-' (these are the elements that hold the players' data)
player_row_pattern = re.compile('row player-10-')
players = soup.find_all(name='tr', attrs={'class': player_row_pattern})

In [15]:
#Loop over the result pages, 50 players per page (start = 1, 51, ..., 301)
#Each player's stats are collected in a plain list and the DataFrame is built
#once at the end: calling pd.concat() for every player copies the whole frame
#each time, and re-running the cell would append the same players again.
player_rows = []
for i in range(1,331,50):

    #pull in website source code
    url = 'http://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2018/start/{}'.format(i)
    #timeout: requests waits indefinitely by default, so a stalled connection
    #would hang the whole loop
    page = requests.get(url, timeout=30)
    #stop on a 4xx/5xx response instead of silently scraping an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    #identify player rows
    players = soup.find_all('tr', attrs = {'class': re.compile('row player-10-')})
    for player in players:

        #Get the stats for each player (the text of every <td> in the row)
        player_rows.append([stat.get_text() for stat in player.find_all('td')])

#Build the final DataFrame in one step, using the header text as column names
final_df = pd.DataFrame(player_rows, columns=columns)

final_df

Unnamed: 0,Unnamed: 1,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA
0,1,Mookie Betts,7,136,520,129,180,47,5,32,80,81,91,30,6,.346
1,2,J.D. Martinez,10,150,569,111,188,37,2,43,130,69,146,6,1,.330
2,3,Jeff McNeil,3,63,225,35,74,11,6,3,19,14,24,7,1,.329
3,4,Christian Yelich,8,147,574,118,187,34,7,36,110,68,135,22,4,.326
4,5,Jose Altuve,10,137,534,84,169,29,2,13,61,55,79,17,4,.316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,,Gary Sanchez,6,89,323,51,60,17,0,18,53,46,94,1,0,.186
327,328,Aaron Altherr,6,105,243,28,44,11,1,8,38,36,91,3,2,.181
328,329,Dexter Fowler,13,90,289,40,52,10,0,8,31,38,75,5,2,.180
329,330,Sandy Leon,9,89,265,30,47,12,0,5,22,15,75,1,0,.177


In [None]:
#export this DataFrame to a .csv file for analysis
#use a forward slash in the path: it works on Windows, macOS and Linux, whereas
#a backslash is only a path separator on Windows (elsewhere it would create a
#file literally named "Desktop\mlb_stats.csv" in the working directory)
final_df.to_csv("Desktop/mlb_stats.csv", index=False, sep=',', encoding='utf-8')