# Scrape NBA Player Real Plus-Minus (RPM) Stats from ESPN with lxml

In [237]:
import csv
import time
import string
import requests
import lxml.html as lh
import json
import numpy
import pandas as pd

## Extract `<tr>` elements
* Create a GET request to the site's url
* Store the webpage contents
* Extract the `<tr>` elements

In [238]:
url='http://www.espn.com/nba/statistics/rpm/_/sort/RPM'

# Create a GET request to the ESPN RPM site url
page = requests.get(url)

# Store the contents of the website
doc = lh.fromstring(page.content)

# Parse the data stored in all <tr> tags in the HTML
tr_elements = doc.xpath('//tr')

print('tr_elements: ', tr_elements)

tr_elements:  [<Element tr at 0x1166d5b38>, <Element tr at 0x1167a3688>, <Element tr at 0x1167a3638>, <Element tr at 0x1167a3368>, <Element tr at 0x1167a3138>, <Element tr at 0x1167a32c8>, <Element tr at 0x1167a3188>, <Element tr at 0x1167a36d8>, <Element tr at 0x1166a5c28>, <Element tr at 0x11685b9a8>, <Element tr at 0x11685b9f8>, <Element tr at 0x11685b548>, <Element tr at 0x11685b868>, <Element tr at 0x11685b688>, <Element tr at 0x11685b778>, <Element tr at 0x11685bb88>, <Element tr at 0x11685bbd8>, <Element tr at 0x11685bc28>, <Element tr at 0x11685b7c8>, <Element tr at 0x11685bc78>, <Element tr at 0x11685bcc8>, <Element tr at 0x11685b728>, <Element tr at 0x11685bb38>, <Element tr at 0x11685b818>, <Element tr at 0x11685ba48>, <Element tr at 0x11685bdb8>, <Element tr at 0x11685b598>, <Element tr at 0x11685bae8>, <Element tr at 0x11685be58>, <Element tr at 0x11685be08>, <Element tr at 0x11685bd18>, <Element tr at 0x11685bd68>, <Element tr at 0x11685bf98>, <Element tr at 0x11685bf48>,

## Check the length of each row of the table

In [239]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

## Store first element (header) and empty list for each row

In [240]:
# Create empty list
cols = []
i=0

# For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i += 1        
    column_name = t.text_content()
    print('%d:"%s"'%(i, column_name))
    cols.append((column_name,[]))

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"


## Extract links to all player pages

In [241]:
# Extract player page links
links = doc.xpath('//*[@id="my-players-table"]/div/div[2]/table//a/@href')
# First four links are not player pages
links = links[4:]
print('links: ', links)

links:  ['http://www.espn.com/nba/player/_/id/4251/paul-george', 'http://www.espn.com/nba/player/_/id/3992/james-harden', 'http://www.espn.com/nba/player/_/id/3975/stephen-curry', 'http://www.espn.com/nba/player/_/id/3032977/giannis-antetokounmpo', 'http://www.espn.com/nba/player/_/id/3112335/nikola-jokic', 'http://www.espn.com/nba/player/_/id/6583/anthony-davis', 'http://www.espn.com/nba/player/_/id/3059318/joel-embiid', 'http://www.espn.com/nba/player/_/id/6478/nikola-vucevic', 'http://www.espn.com/nba/player/_/id/1966/lebron-james', 'http://www.espn.com/nba/player/_/id/6606/damian-lillard', 'http://www.espn.com/nba/player/_/id/3202/kevin-durant', 'http://www.espn.com/nba/player/_/id/3988/danny-green', 'http://www.espn.com/nba/player/_/id/3012/kyle-lowry', 'http://www.espn.com/nba/player/_/id/3032976/rudy-gobert', 'http://www.espn.com/nba/player/_/id/2779/chris-paul', 'http://www.espn.com/nba/player/_/id/3995/jrue-holiday', 'http://www.espn.com/nba/player/_/id/3213/al-horford', 'http

## Store each `<tr>` element's data in a two-dimensional array
Each `<td>` element can be accessed using `iterchildren()`.

In [242]:
# Our first row is the header (columns), so data is stored in the second row onwards
for j in range(1, len(tr_elements)):
    
    # T is our j'th row
    T = tr_elements[j]
    
    # If row is not of size 9, the //tr data is not from our table 
    if len(T) != 9:
        break
    
    # i is the index of our column
    i = 0
    
    # Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        print('data: ', data)
        # Check if row is empty
        if i > 0:
            # Extract link to player page
            if i == 1:
                link = t.xpath('/a/@href')
                print('link: ', link)
            # Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i += 1

data:  1
data:  Paul George, SF
link:  []
data:  OKC
data:  77
data:  36.9
data:  4.49
data:  3.09
data:  7.58
data:  19.77
data:  2
data:  James Harden, PG
link:  []
data:  HOU
data:  78
data:  36.8
data:  7.38
data:  0.09
data:  7.47
data:  18.63
data:  3
data:  Stephen Curry, PG
link:  []
data:  GS
data:  69
data:  33.8
data:  5.86
data:  0.68
data:  6.54
data:  14.78
data:  4
data:  Giannis Antetokounmpo, PF
link:  []
data:  MIL
data:  72
data:  32.8
data:  3.13
data:  3.09
data:  6.22
data:  14.49
data:  5
data:  Nikola Jokic, C
link:  []
data:  DEN
data:  80
data:  31.3
data:  3.63
data:  2.56
data:  6.19
data:  14.46
data:  6
data:  Anthony Davis, PF
link:  []
data:  NO
data:  56
data:  33.0
data:  2.52
data:  3.20
data:  5.72
data:  10.79
data:  7
data:  Joel Embiid, C
link:  []
data:  PHI
data:  64
data:  33.7
data:  2.37
data:  3.12
data:  5.49
data:  11.67
data:  8
data:  Nikola Vucevic, C
link:  []
data:  ORL
data:  80
data:  31.4
data:  1.84
data:  3.60
data:  5.44
data:  

## Check the length of each column of data in the 2D array

In [243]:
[len(C) for (title, C) in col]

[891, 891, 891, 891, 891, 891, 891, 891, 891]

## Store the data in a new dataframe

In [244]:
Dict = {title:column for (title, column) in col}
df = pd.DataFrame(Dict)

## Inspect the dataframe

In [245]:
df.head()

Unnamed: 0,RK,NAME,TEAM,GP,MPG,ORPM,DRPM,RPM,WINS
0,1,"Paul George, SF",OKC,77,36.9,4.49,3.09,7.58,19.77
1,2,"James Harden, PG",HOU,78,36.8,7.38,0.09,7.47,18.63
2,3,"Stephen Curry, PG",GS,69,33.8,5.86,0.68,6.54,14.78
3,4,"Giannis Antetokounmpo, PF",MIL,72,32.8,3.13,3.09,6.22,14.49
4,5,"Nikola Jokic, C",DEN,80,31.3,3.63,2.56,6.19,14.46


We now have our dataframe populated with the RPM data. However, we can see that under the 'NAME' column, the player name is combined with the player position. We want to separate the player position into a new column. However, before doing this, we need to handle pagination and collect the rest of the player data, as it is stored across 13 separate pages.

## Find the link to the next page

In [246]:
# Extract the next page link
next_page_link = 'https:' + doc.xpath('//*[@id="my-players-table"]/div/div[2]/div/div[2]/a/@href')[0]
print(next_page_link)

https://www.espn.com/nba/statistics/rpm/_/page/2


Let's put all of the above in its own method, which we can call repeatedly inside a wrapper function. Additionally, let's add a cleaning function to handle the separation of the player name and player position into separate features. Our complete scraper with all its methods are shown below.

## Complete scraper with data cleaning method and wrapper

In [2]:
import csv
import string
import requests
import lxml.html as lh
import numpy
import pandas as pd

def clean_data(df):
    # Extract player positions into separate list
    positions = []
    for index, row in df.iterrows():
        positions.append(row['NAME'].split(', ')[1])
    
    print('total positions: ', len(positions))
    print('dataframe size: ', len(df.index))
    
    # Fix player name and add new column for player position
    df['NAME'] = df['NAME'].map(lambda x: x.split(', ')[0])
    df.insert(2, 'POS', positions)
    return df

# Main scraping method
def rpm_scraper_single_page(url, flag):
    # Create a get request to the site url
    page = requests.get(url)
    
    # Store the contents of the website
    doc = lh.fromstring(page.content)
    
    # Parse the data stored in all <tr> tags in the HTML
    tr_elements = doc.xpath('//tr')
    #print('tr_elements: ', tr_elements)
    
    #Check the length of the first 12 rows
    [len(T) for T in tr_elements[:12]]

    # Create empty list
    cols = []
    i=0

    # For each row, store each first element (header) and an empty list
    for t in tr_elements[0]:
        i += 1        
        column_name = t.text_content()
        print('%d:"%s"'%(i, column_name))
        cols.append((column_name,[]))

    # Extract player page links
    links = doc.xpath('//*[@id="my-players-table"]/div/div[2]/table//a/@href')
    # First four links are not player pages
    links = links[4:]
    # Add string 'gamelog/' to url
    links = ['gamelog/_'.join(link.split('_')) for link in links]
    print('links: ', links)
    print('total links: ', len(links))

    # Our first row is the header (columns), so data is stored in the second row onwards
    for j in range(1, len(tr_elements)):

        # T is our j'th row
        T = tr_elements[j]

        # If row is not of size 9, the //tr data is not from our table 
        if len(T) != 9:
            break

        # i is the index of our column
        i = 0

        # Iterate through each element of the row
        for t in T.iterchildren():
            data = t.text_content() 
            # Check if row is empty
            if i > 0:
                # Convert any numerical value to integers
                try:
                    data = int(data)
                except:
                    pass
            # Append the data to the empty list of the i'th column
            cols[i][1].append(data)
            # Increment i for the next column
            i += 1
        
    # Check the lengths
    [len(C) for (title, C) in cols]

    # Store the data in a dataframe
    Dict = {title:column for (title, column) in cols}
    rpm = pd.DataFrame(Dict)
    rpm.head()

    # Data cleaning
    rpm = clean_data(rpm)
    rpm.head()

    # Add links column to dataframe
    rpm['GAME LOG PAGE'] = links
        
    # Extract the next page link
    if flag:
        next_page_link = 'https:' + doc.xpath('//*[@id="my-players-table"]/div/div[2]/div/div[2]/a/@href')[0]
    else:      
        try:
            next_page_link = 'https:' + doc.xpath('//*[@id="my-players-table"]/div/div[2]/div/div[2]/a/@href')[1]
        except IndexError:
            next_page_link = None
    
    print(next_page_link)

    # Return a tuple with the dataframe and next page link
    return (rpm, next_page_link)

# Wrapper method
def rpm_scraper(base_url):
    url = None
    flag = False
    rpm_pages = []
    for i in range(0, 13):
        if i == 0:
            url = base_url
            flag = True
        else:
            flag = False
            
        if url:
            # Scrape all results on the current page
            rpm_tuple = rpm_scraper_single_page(url, flag)
            # Append the dataframe to list
            rpm_pages.append(rpm_tuple[0])
            # Update url to next page link
            url = rpm_tuple[1]

    return pd.concat(rpm_pages, ignore_index=False)

# Run scraper
base_url = 'http://www.espn.com/nba/statistics/rpm/_/sort/RPM'
rpm_data = rpm_scraper(base_url)

# Check results
rpm_data.head()

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"
links:  ['http://www.espn.com/nba/player/gamelog/_/id/4251/paul-george', 'http://www.espn.com/nba/player/gamelog/_/id/3992/james-harden', 'http://www.espn.com/nba/player/gamelog/_/id/3975/stephen-curry', 'http://www.espn.com/nba/player/gamelog/_/id/3032977/giannis-antetokounmpo', 'http://www.espn.com/nba/player/gamelog/_/id/3059318/joel-embiid', 'http://www.espn.com/nba/player/gamelog/_/id/3112335/nikola-jokic', 'http://www.espn.com/nba/player/gamelog/_/id/6583/anthony-davis', 'http://www.espn.com/nba/player/gamelog/_/id/6478/nikola-vucevic', 'http://www.espn.com/nba/player/gamelog/_/id/1966/lebron-james', 'http://www.espn.com/nba/player/gamelog/_/id/6606/damian-lillard', 'http://www.espn.com/nba/player/gamelog/_/id/3988/danny-green', 'http://www.espn.com/nba/player/gamelog/_/id/3202/kevin-durant', 'http://www.espn.com/nba/player/gamelog/_/id/3012/kyle-lowry', 'http://www.espn.com/nba/player/gamelog/_/id/3032976

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"
links:  ['http://www.espn.com/nba/player/gamelog/_/id/2797/marvin-williams', 'http://www.espn.com/nba/player/gamelog/_/id/984/tyson-chandler', 'http://www.espn.com/nba/player/gamelog/_/id/4249/gordon-hayward', 'http://www.espn.com/nba/player/gamelog/_/id/6619/terrence-ross', 'http://www.espn.com/nba/player/gamelog/_/id/3431/eric-gordon', 'http://www.espn.com/nba/player/gamelog/_/id/2991055/montrezl-harrell', 'http://www.espn.com/nba/player/gamelog/_/id/6507/brad-wanamaker', 'http://www.espn.com/nba/player/gamelog/_/id/3914044/landry-shamet', 'http://www.espn.com/nba/player/gamelog/_/id/2999547/gary-harris', 'http://www.espn.com/nba/player/gamelog/_/id/4261/ekpe-udoh', 'http://www.espn.com/nba/player/gamelog/_/id/2530572/langston-galloway', 'http://www.espn.com/nba/player/gamelog/_/id/3554/omri-casspi', 'http://www.espn.com/nba/player/gamelog/_/id/3978/demar-derozan', 'http://www.espn.com/nba/player/gamelog/_/id/

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"
links:  ['http://www.espn.com/nba/player/gamelog/_/id/3934662/tyler-lydon', 'http://www.espn.com/nba/player/gamelog/_/id/3917376/jaylen-brown', 'http://www.espn.com/nba/player/gamelog/_/id/136/vince-carter', 'http://www.espn.com/nba/player/gamelog/_/id/2608891/jakarr-sampson', 'http://www.espn.com/nba/player/gamelog/_/id/3056602/semi-ojeleye', 'http://www.espn.com/nba/player/gamelog/_/id/3136183/yante-maten', 'http://www.espn.com/nba/player/gamelog/_/id/4278077/jarred-vanderbilt', 'http://www.espn.com/nba/player/gamelog/_/id/3074765/isaiah-hicks', 'http://www.espn.com/nba/player/gamelog/_/id/4017838/ante-zizic', 'http://www.espn.com/nba/player/gamelog/_/id/3133603/kelly-oubre-jr.', 'http://www.espn.com/nba/player/gamelog/_/id/3948153/chris-boucher', 'http://www.espn.com/nba/player/gamelog/_/id/4065673/tony-bradley', 'http://www.espn.com/nba/player/gamelog/_/id/3418/michael-beasley', 'http://www.espn.com/nba/play

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"
links:  ['http://www.espn.com/nba/player/gamelog/_/id/4277842/trevon-duval', 'http://www.espn.com/nba/player/gamelog/_/id/6637/kent-bazemore', 'http://www.espn.com/nba/player/gamelog/_/id/2991283/alex-poythress', 'http://www.espn.com/nba/player/gamelog/_/id/3102528/dante-exum', 'http://www.espn.com/nba/player/gamelog/_/id/3913546/melvin-frazier-jr.', 'http://www.espn.com/nba/player/gamelog/_/id/1987/dwyane-wade', 'http://www.espn.com/nba/player/gamelog/_/id/3136194/tyler-ulis', 'http://www.espn.com/nba/player/gamelog/_/id/3137795/chris-chiozza', 'http://www.espn.com/nba/player/gamelog/_/id/3150844/moritz-wagner', 'http://www.espn.com/nba/player/gamelog/_/id/2326411/james-nunnally', 'http://www.espn.com/nba/player/gamelog/_/id/3157465/duncan-robinson', 'http://www.espn.com/nba/player/gamelog/_/id/2489530/troy-daniels', 'http://www.espn.com/nba/player/gamelog/_/id/2578213/ben-mclemore', 'http://www.espn.com/nba/pl

1:"RK"
2:"NAME"
3:"TEAM"
4:"GP"
5:"MPG"
6:"ORPM"
7:"DRPM"
8:"RPM"
9:"WINS"
links:  ['http://www.espn.com/nba/player/gamelog/_/id/2489785/ian-clark', 'http://www.espn.com/nba/player/gamelog/_/id/6448/brandon-knight', 'http://www.espn.com/nba/player/gamelog/_/id/6485/lance-thomas', 'http://www.espn.com/nba/player/gamelog/_/id/2488958/solomon-hill', "http://www.espn.com/nba/player/gamelog/_/id/3133601/devonte'-graham", 'http://www.espn.com/nba/player/gamelog/_/id/2566745/quinn-cook', 'http://www.espn.com/nba/player/gamelog/_/id/4011991/dragan-bender', 'http://www.espn.com/nba/player/gamelog/_/id/3907525/allonzo-trier', 'http://www.espn.com/nba/player/gamelog/_/id/4065836/omari-spellman', 'http://www.espn.com/nba/player/gamelog/_/id/3893016/cedi-osman', 'http://www.espn.com/nba/player/gamelog/_/id/4080610/hamidou-diallo', 'http://www.espn.com/nba/player/gamelog/_/id/2595435/abdel-nader', 'http://www.espn.com/nba/player/gamelog/_/id/4264/patrick-patterson', 'http://www.espn.com/nba/player/g

Unnamed: 0,RK,NAME,POS,TEAM,GP,MPG,ORPM,DRPM,RPM,WINS,GAME LOG PAGE
0,1,Paul George,SF,OKC,77,36.9,4.48,3.08,7.56,19.73,http://www.espn.com/nba/player/gamelog/_/id/42...
1,2,James Harden,PG,HOU,78,36.8,7.41,0.0,7.41,18.53,http://www.espn.com/nba/player/gamelog/_/id/39...
2,3,Stephen Curry,PG,GS,69,33.8,5.92,0.81,6.73,15.07,http://www.espn.com/nba/player/gamelog/_/id/39...
3,4,Giannis Antetokounmpo,PF,MIL,72,32.8,3.12,3.39,6.51,14.94,http://www.espn.com/nba/player/gamelog/_/id/30...
4,5,Joel Embiid,C,PHI,64,33.7,2.7,3.71,6.41,12.91,http://www.espn.com/nba/player/gamelog/_/id/30...


## Save data to csv file

In [3]:
rpm_data.to_csv('nba_rpm_data_2018.csv')