<a id="top"></a>

<center><h1>Source code for scraping NBA player stats from ESPN's web site</h1></center>

The scripts save the NBA data into a sqlite3 database, so you must first create the database and run the CREATE TABLE scripts before running the Python web scraping scripts.

**Step 1:** At the terminal, run: **sqlite3 nba.db**<br>
**Step 2:** Then run the CREATE TABLE sqlite3 scripts to create the empty tables.<br>
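**Step 3:** Run the Python web scraping scripts. Each script opens the database at the hard-coded path `/home/pybokeh/databases/nba.db`, so edit that path to point to your own `nba.db` first.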

### Quick Links

- [sqlite table definitions](#sqlite_tables)
- [source code for player game stats](#game_stats)
- [source code for player shooting stats](#shooting_stats)
- [source code for regular season averages](#season_avgs)
- [source code for regular season totals](#season_totals)
- [source code for regular season misc totals](#season_misc)

<a id="sqlite_tables"></a>


## sqlite table definitions

[[back to top]](#top)

In [None]:
-- One row per player, filled by the "player game stats" scraping script,
-- which supplies all 17 values positionally in the column order below.
-- "id" is the player id taken from the player's link on the team stats page;
-- because it is the primary key, a second insert for the same id is rejected.
-- "name_pos" receives the text of the first cell of the player's table row
-- (presumably "name, position" -- confirm against the scraped page).
-- "team_name" is the full team name looked up from the team code.
CREATE TABLE "player_game_stats" (
    "id" INTEGER PRIMARY KEY NOT NULL,
    "name_pos" TEXT NOT NULL,
    "team_name" TEXT NOT NULL,
    "GP" INTEGER NOT NULL,
    "GS" INTEGER NOT NULL,
    "MIN" REAL NOT NULL,
    "PPG" REAL NOT NULL,
    "OFFR" REAL NOT NULL,
    "DEFR" REAL NOT NULL,
    "RPG" REAL NOT NULL,
    "APG" REAL NOT NULL,
    "SPG" REAL NOT NULL,
    "BPG" REAL NOT NULL,
    "TPG" REAL NOT NULL,
    "FPG" REAL NOT NULL,
    "A2TO" REAL NOT NULL,
    "PER" REAL NOT NULL
);

-- One row per player, filled by the "player shooting stats" scraping script,
-- which supplies all 17 values positionally in the column order below.
-- "id" is the player id taken from the player's link on the team stats page;
-- because it is the primary key, a second insert for the same id is rejected.
-- "name_pos" receives the text of the first cell of the player's table row
-- (presumably "name, position" -- confirm against the scraped page).
-- Several column names start with a digit ("3PM", "2PA", ...), so they must
-- stay double-quoted whenever they are referenced in SQL.
CREATE TABLE "player_shooting_stats" (
    "id" INTEGER PRIMARY KEY NOT NULL,
    "name_pos" TEXT NOT NULL,
    "team_name" TEXT NOT NULL,
    "FGM" REAL NOT NULL,
    "FGA" REAL NOT NULL,
    "FG_Perc" REAL NOT NULL,
    "3PM" REAL NOT NULL,
    "3PA" REAL NOT NULL,
    "3P_Perc" REAL NOT NULL,
    "FTM" REAL NOT NULL,
    "FTA" REAL NOT NULL,
    "FT_Perc" REAL NOT NULL,
    "2PM" REAL NOT NULL,
    "2PA" REAL NOT NULL,
    "2P_Perc" REAL NOT NULL,
    "PPS" REAL NOT NULL,
    "AFG_Perc" REAL NOT NULL
);

-- Filled by the "regular season averages" scraping script with 22 positional
-- values per row: the player id, the player name, then the 20 cells of one
-- season row from the player's stats page.
-- "id" is not a primary key here because a player has one row per season and
-- team; the unique constraint rejects a repeated (id, season, team) instead.
-- The made-attempted columns ("FGM-A", "3PM-A", "FTM-A") are stored as TEXT.
-- "OR" and "TO" are SQL keywords and must stay double-quoted when referenced.
CREATE TABLE "regular_season_avgs" (
    "id" INTEGER NOT NULL,
    "player_name" TEXT NOT NULL,
    "season" TEXT NOT NULL,
    "team" TEXT NOT NULL,
    "GP" INTEGER NOT NULL,
    "GS" INTEGER NOT NULL,
    "MIN" REAL NOT NULL,
    "FGM-A" TEXT NOT NULL,
    "FG_Perc" REAL NOT NULL,
    "3PM-A" TEXT NOT NULL,
    "3P_Perc" REAL NOT NULL,
    "FTM-A" TEXT NOT NULL,
    "FT_Perc" REAL NOT NULL,
    "OR" REAL NOT NULL,
    "DR" REAL NOT NULL,
    "REB" REAL NOT NULL,
    "AST" REAL NOT NULL,
    "BLK" REAL NOT NULL,
    "STL" REAL NOT NULL,
    "PF" REAL NOT NULL,
    "TO" REAL NOT NULL,
    "PTS" REAL NOT NULL,
    unique ("id", "season","team")
);

-- Filled by the "regular season totals" scraping script with 19 positional
-- values per row: the player id, the player name, then the 17 cells of one
-- season row from the totals section of the player's stats page.
-- "id" is not a primary key here because a player has one row per season and
-- team; the unique constraint rejects a repeated (id, season, team) instead.
-- The made-attempted columns ("FGM-A", "3PM-A", "FTM-A") are stored as TEXT.
-- "OR" and "TO" are SQL keywords and must stay double-quoted when referenced.
CREATE TABLE "regular_season_totals" (
    "id" INTEGER NOT NULL,
    "player_name" TEXT NOT NULL,
    "season" TEXT NOT NULL,
    "team" TEXT NOT NULL,
    "FGM-A" TEXT NOT NULL,
    "FG_Perc" REAL NOT NULL,
    "3PM-A" TEXT NOT NULL,
    "3P_Perc" REAL NOT NULL,
    "FTM-A" TEXT NOT NULL,
    "FT_Perc" REAL NOT NULL,
    "OR" INTEGER NOT NULL,
    "DR" INTEGER NOT NULL,
    "REB" INTEGER NOT NULL,
    "AST" INTEGER NOT NULL,
    "BLK" INTEGER NOT NULL,
    "STL" INTEGER NOT NULL,
    "PF" INTEGER NOT NULL,
    "TO" INTEGER NOT NULL,
    "PTS" INTEGER NOT NULL,
    unique ("id","season","team")
);

-- Filled by the "regular season misc totals" scraping script with 15
-- positional values per row: the player id, the player name, then the 13
-- cells of one season row from the misc totals section of the player's page.
-- "id" is not a primary key here because a player has one row per season and
-- team; the unique constraint rejects a repeated (id, season, team) instead.
CREATE TABLE "regular_season_misc_totals" (
    "id" INTEGER NOT NULL,
    "player_name" TEXT NOT NULL,
    "season" TEXT NOT NULL,
    "team" TEXT NOT NULL,
    "DBLDBL" INTEGER NOT NULL,
    "TRIDBL" INTEGER NOT NULL,
    "DQ" INTEGER NOT NULL,
    "EJECT" INTEGER NOT NULL,
    "TECH" INTEGER NOT NULL,
    "FLAG" INTEGER NOT NULL,
    "AST2TO" REAL NOT NULL,
    "STL2TO" REAL NOT NULL,
    "RAT" REAL NOT NULL,
    "SCEFF" REAL NOT NULL,
    "SHEFF" REAL NOT NULL,
    unique("id","season","team")
);

<a id="game_stats"></a>

## Populating the player game stats table

[[back to top]](#top)

In [None]:
import requests # pip install requests
from bs4 import BeautifulSoup
import sqlite3
import re
from datetime import datetime

# Remember when the run started; the elapsed time is printed at the very end.
startTime = datetime.now()

# Prefix that is put in front of every team link scraped below.
base_url = 'http://espn.go.com'

teams_url = 'http://espn.go.com/nba/teams'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run with a clear HTTP error instead of
# quietly parsing an error page that contains no team links.
html_teams = requests.get(teams_url, timeout=30)
html_teams.raise_for_status()

soup_teams = BeautifulSoup(html_teams.text,'lxml')

# Every element whose href contains '/nba/teams/stats' is a team stats link.
urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))

team_urls = [base_url+url['href'] for url in urls]

# Maps the team code (the last three characters of a team stats URL) to the
# full team name that is stored in the database.
team_name_dict = {'bos':'Boston Celtics',
                  'bkn':'Brooklyn Nets',
                  'nyk':'New York Knicks',
                  'phi':'Philadelphia 76ers',
                  'tor':'Toronto Raptors',
                  'gsw':'Golden State Warriors',
                  'lac':'Los Angeles Clippers',
                  'lal':'Los Angeles Lakers',
                  'pho':'Phoenix Suns',
                  'sac':'Sacramento Kings',
                  'chi':'Chicago Bulls',
                  'cle':'Cleveland Cavaliers',
                  'det':'Detroit Pistons',
                  'ind':'Indiana Pacers',
                  'mil':'Milwaukee Bucks',
                  'dal':'Dallas Mavericks',
                  'hou':'Houston Rockets',
                  'mem':'Memphis Grizzlies',
                  'nor':'New Orleans Pelicans',
                  'sas':'San Antonio Spurs',
                  'atl':'Atlanta Hawks',
                  'cha':'Charlotte Hornets',
                  'mia':'Miami Heat',
                  'orl':'Orlando Magic',
                  'was':'Washington Wizards',
                  'den':'Denver Nuggets',
                  'min':'Minnesota Timberwolves',
                  'okc':'Oklahoma City Thunder',
                  'por':'Portland Trail Blazers',
                  'uth':'Utah Jazz'
                  }

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing.  The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)

# Values per player_game_stats row: the player id, the name cell, the team
# name and the stat cells that follow the name cell in a roster row.
GAME_STATS_ROW_LENGTH = 17

# Open the database once for the whole run (not once per team); the finally
# clause closes it even when a request or a parsing step raises.
conn = sqlite3.connect('/home/pybokeh/databases/nba.db')
try:
    c = conn.cursor()

    for team in team_urls:
        # The last three characters of the team URL are the key into
        # team_name_dict; an unknown code raises KeyError.
        team_code = team[-3:]
        team_name = team_name_dict[team_code]

        # The timeout keeps a stalled connection from hanging the whole run.
        html_team = requests.get(team, timeout=30)
        soup_team = BeautifulSoup(html_team.text, 'lxml')

        # The rows whose class contains 'player' are split in two halves; the
        # first half is treated as the game stats rows.
        roster = soup_team.find_all('tr', class_=re.compile('player'))
        roster_game_stats = roster[:len(roster) // 2]

        for player_row in roster_game_stats:
            cells = [cell.get_text() for cell in player_row]
            # The player id is the eighth '/'-separated piece of the row's link.
            player_id = player_row.a['href'].split('/')[7]

            # Build every record on its own (id, name cell, team name, stats)
            # so that one odd row cannot shift the values of the rows after it.
            record = [player_id] + cells[:1] + [team_name] + cells[1:]
            if len(record) != GAME_STATS_ROW_LENGTH:
                continue  # incomplete row: it would not match the table columns

            try:
                c.execute('INSERT INTO player_game_stats VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', record)
            except sqlite3.IntegrityError:
                # Constraint violation, e.g. the player id is already stored.
                # Skipped on purpose so that the script can be re-run.
                pass
            except sqlite3.Error as err:
                # Any other database problem is reported instead of hidden.
                print('Could not store player', player_id, '-', err)

        conn.commit()
finally:
    conn.close()

print(datetime.now() - startTime)

<a id="shooting_stats"></a>

## Populating the player shooting stats table

[[back to top]](#top)

In [None]:
import requests # pip install requests
from bs4 import BeautifulSoup
import sqlite3
import re
from datetime import datetime

# Remember when the run started; the elapsed time is printed at the very end.
startTime = datetime.now()

# Prefix that is put in front of every team link scraped below.
base_url = 'http://espn.go.com'

teams_url = 'http://espn.go.com/nba/teams'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run with a clear HTTP error instead of
# quietly parsing an error page that contains no team links.
html_teams = requests.get(teams_url, timeout=30)
html_teams.raise_for_status()

soup_teams = BeautifulSoup(html_teams.text,'lxml')

# Every element whose href contains '/nba/teams/stats' is a team stats link.
urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))

team_urls = [base_url+url['href'] for url in urls]

# Maps the team code (the last three characters of a team stats URL) to the
# full team name that is stored in the database.
team_name_dict = {'bos':'Boston Celtics',
                  'bkn':'Brooklyn Nets',
                  'nyk':'New York Knicks',
                  'phi':'Philadelphia 76ers',
                  'tor':'Toronto Raptors',
                  'gsw':'Golden State Warriors',
                  'lac':'Los Angeles Clippers',
                  'lal':'Los Angeles Lakers',
                  'pho':'Phoenix Suns',
                  'sac':'Sacramento Kings',
                  'chi':'Chicago Bulls',
                  'cle':'Cleveland Cavaliers',
                  'det':'Detroit Pistons',
                  'ind':'Indiana Pacers',
                  'mil':'Milwaukee Bucks',
                  'dal':'Dallas Mavericks',
                  'hou':'Houston Rockets',
                  'mem':'Memphis Grizzlies',
                  'nor':'New Orleans Pelicans',
                  'sas':'San Antonio Spurs',
                  'atl':'Atlanta Hawks',
                  'cha':'Charlotte Hornets',
                  'mia':'Miami Heat',
                  'orl':'Orlando Magic',
                  'was':'Washington Wizards',
                  'den':'Denver Nuggets',
                  'min':'Minnesota Timberwolves',
                  'okc':'Oklahoma City Thunder',
                  'por':'Portland Trail Blazers',
                  'uth':'Utah Jazz'
                  }

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing.  The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)

# Values per player_shooting_stats row: the player id, the name cell, the team
# name and the stat cells that follow the name cell in a roster row.
SHOOTING_STATS_ROW_LENGTH = 17

# Open the database once for the whole run (not once per team); the finally
# clause closes it even when a request or a parsing step raises.
conn = sqlite3.connect('/home/pybokeh/databases/nba.db')
try:
    c = conn.cursor()

    for team in team_urls:
        # The last three characters of the team URL are the key into
        # team_name_dict; an unknown code raises KeyError.
        team_code = team[-3:]
        team_name = team_name_dict[team_code]

        # The timeout keeps a stalled connection from hanging the whole run.
        html_team = requests.get(team, timeout=30)
        soup_team = BeautifulSoup(html_team.text, 'lxml')

        # The rows whose class contains 'player' are split in two halves; the
        # second half is treated as the shooting stats rows.
        roster = soup_team.find_all('tr', class_=re.compile('player'))
        half = len(roster) // 2
        # Slice from an explicit start index: roster[-half:] would return the
        # whole roster when half is 0, because -0 is the same as 0.
        roster_shooting_stats = roster[len(roster) - half:]

        for player_row in roster_shooting_stats:
            cells = [cell.get_text() for cell in player_row]
            # The player id is the eighth '/'-separated piece of the row's link.
            player_id = player_row.a['href'].split('/')[7]

            # Build every record on its own (id, name cell, team name, stats)
            # so that one odd row cannot shift the values of the rows after it.
            record = [player_id] + cells[:1] + [team_name] + cells[1:]
            if len(record) != SHOOTING_STATS_ROW_LENGTH:
                continue  # incomplete row: it would not match the table columns

            try:
                c.execute('INSERT INTO player_shooting_stats VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', record)
            except sqlite3.IntegrityError:
                # Constraint violation, e.g. the player id is already stored.
                # Skipped on purpose so that the script can be re-run.
                pass
            except sqlite3.Error as err:
                # Any other database problem is reported instead of hidden.
                print('Could not store player', player_id, '-', err)

        conn.commit()
finally:
    conn.close()

print(datetime.now() - startTime)

<a id="season_avgs"></a>

## Populating the regular season averages table

[[back to top]](#top)

In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
from datetime import datetime
import time

# Record the start time; the elapsed run time is printed when the script ends.
startTime = datetime.now()

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing.  The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)

# Prefix that is put in front of every team link scraped below.
base_url = 'http://espn.go.com'

teams_url = 'http://espn.go.com/nba/teams'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run with a clear HTTP error instead of
# quietly parsing an error page that contains no team links.
html_teams = requests.get(teams_url, timeout=30)
html_teams.raise_for_status()

soup_teams = BeautifulSoup(html_teams.text,'lxml')
urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))

team_urls = [base_url+url['href'] for url in urls]

# A season averages row is only accepted when it has this many cells; the
# player id and the player name are put in front of them (22 values in total).
SEASON_AVGS_CELLS = 20

# Open the database once for the whole run (not once per player); the finally
# clause closes it even when a request or a parsing step raises.
conn = sqlite3.connect('/home/pybokeh/databases/nba.db')
try:
    c = conn.cursor()

    for team in team_urls:
        html_team = requests.get(team, timeout=30)
        soup_team = BeautifulSoup(html_team.text,'lxml')
        html_rows = soup_team.find_all('tr', class_=re.compile('player'))

        # Turn each player link into the link of the player's stats page by
        # putting 'stats/' in front of the first '_' only, so that any later
        # underscore in the link is left alone.  The 'player' rows cover both
        # the game stats and the shooting stats table (the scripts for those
        # tables each use one half of them), so dict.fromkeys() removes the
        # repeated links and every player page is fetched just once.
        player_urls = list(dict.fromkeys(
            row.a['href'].replace('_', 'stats/_', 1) for row in html_rows))

        for player in player_urls:
            time.sleep(3)   # delay between requests; added because of timeout errors
            player_id   = player.split('/')[8]
            html_player = requests.get(player, timeout=30)
            soup_player = BeautifulSoup(html_player.text,'lxml')

            soup_name = soup_player.find('meta', property='og:title')
            player_name = soup_name['content']

            # The rows whose class contains 'row' are split into three equal
            # parts; the first part is used as the season averages.
            regular_season_stats = soup_player.find_all('tr', class_=re.compile('row'))
            size = len(regular_season_stats) // 3
            regular_season_avgs = regular_season_stats[:size]

            for season_row in regular_season_avgs:
                if len(season_row) != SEASON_AVGS_CELLS:
                    continue  # only accept a row that contains complete data

                # Build every record on its own, so that a skipped row cannot
                # leave stray id/name values behind or shift other records.
                record = [player_id, player_name]
                record.extend(cell.get_text() for cell in season_row)

                try:
                    c.execute('INSERT INTO regular_season_avgs VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', record)
                except sqlite3.IntegrityError:
                    # Constraint violation, e.g. this (id, season, team) is
                    # already stored.  Skipped so the script can be re-run.
                    pass
                except sqlite3.Error as err:
                    # Any other database problem is reported instead of hidden.
                    print('Could not store season row for player', player_id, '-', err)

            conn.commit()
finally:
    conn.close()

print(datetime.now() - startTime)

<a id="season_totals"></a>

## Populating the regular season totals table

[[back to top]](#top)

In [3]:
import requests # pip install requests
from bs4 import BeautifulSoup
import sqlite3
import re
from datetime import datetime

# Record the start time; the elapsed run time is printed when the script ends.
startTime = datetime.now()

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing.  The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)

# Prefix that is put in front of every team link scraped below.
base_url = 'http://espn.go.com'

teams_url = 'http://espn.go.com/nba/teams'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run with a clear HTTP error instead of
# quietly parsing an error page that contains no team links.
html_teams = requests.get(teams_url, timeout=30)
html_teams.raise_for_status()

soup_teams = BeautifulSoup(html_teams.text,'lxml')
urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))

team_urls = [base_url+url['href'] for url in urls]

# A season totals row is only accepted when it has this many cells; the
# player id and the player name are put in front of them (19 values in total).
SEASON_TOTALS_CELLS = 17

# Open the database once for the whole run (not once per player); the finally
# clause closes it even when a request or a parsing step raises.
conn = sqlite3.connect('/home/pybokeh/databases/nba.db')
try:
    c = conn.cursor()

    for team in team_urls:
        html_team = requests.get(team, timeout=30)
        soup_team = BeautifulSoup(html_team.text,'lxml')
        html_rows = soup_team.find_all('tr', class_=re.compile('player'))

        # Turn each player link into the link of the player's stats page by
        # putting 'stats/' in front of the first '_' only, so that any later
        # underscore in the link is left alone.  The 'player' rows cover both
        # the game stats and the shooting stats table (the scripts for those
        # tables each use one half of them), so dict.fromkeys() removes the
        # repeated links and every player page is fetched just once.
        player_urls = list(dict.fromkeys(
            row.a['href'].replace('_', 'stats/_', 1) for row in html_rows))

        for player in player_urls:
            player_id   = player.split('/')[8]
            html_player = requests.get(player, timeout=30)
            soup_player = BeautifulSoup(html_player.text,'lxml')

            soup_name = soup_player.find('meta', property='og:title')
            player_name = soup_name['content']

            # The rows whose class contains 'row' are split into three equal
            # parts; the second part is used as the season totals.
            regular_season_stats = soup_player.find_all('tr', class_=re.compile('row'))
            size = len(regular_season_stats) // 3
            regular_season_totals = regular_season_stats[size:size * 2]

            for season_row in regular_season_totals:
                if len(season_row) != SEASON_TOTALS_CELLS:
                    continue  # only accept a row that has complete data

                # Build every record on its own, so that a skipped row cannot
                # leave stray id/name values behind or shift other records.
                record = [player_id, player_name]
                record.extend(cell.get_text() for cell in season_row)

                try:
                    c.execute('INSERT INTO regular_season_totals VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', record)
                except sqlite3.IntegrityError:
                    # Constraint violation, e.g. this (id, season, team) is
                    # already stored.  Skipped so the script can be re-run.
                    pass
                except sqlite3.Error as err:
                    # Any other database problem is reported instead of hidden.
                    print('Could not store season row for player', player_id, '-', err)

            conn.commit()
finally:
    conn.close()

print(datetime.now() - startTime)

URLError: <urlopen error [Errno -3] Temporary failure in name resolution>

<a id="season_misc"></a>

## Populating regular season misc totals table

[[back to top]](#top)

In [None]:
import requests # pip install requests
from bs4 import BeautifulSoup
import sqlite3
import re
from datetime import datetime

# Record the start time; the elapsed run time is printed when the script ends.
startTime = datetime.now()

# http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    ``l`` must support ``len()`` and slicing.  The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)

# Prefix that is put in front of every team link scraped below.
base_url = 'http://espn.go.com'

teams_url = 'http://espn.go.com/nba/teams'
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops the run with a clear HTTP error instead of
# quietly parsing an error page that contains no team links.
html_teams = requests.get(teams_url, timeout=30)
html_teams.raise_for_status()

soup_teams = BeautifulSoup(html_teams.text,'lxml')
urls = soup_teams.find_all(href=re.compile('/nba/teams/stats'))

team_urls = [base_url+url['href'] for url in urls]

# A season misc totals row is only accepted when it has this many cells; the
# player id and the player name are put in front of them (15 values in total).
SEASON_MISC_TOTALS_CELLS = 13

# Open the database once for the whole run (not once per player); the finally
# clause closes it even when a request or a parsing step raises.
conn = sqlite3.connect('/home/pybokeh/databases/nba.db')
try:
    c = conn.cursor()

    for team in team_urls:
        html_team = requests.get(team, timeout=30)
        soup_team = BeautifulSoup(html_team.text,'lxml')
        html_rows = soup_team.find_all('tr', class_=re.compile('player'))

        # Turn each player link into the link of the player's stats page by
        # putting 'stats/' in front of the first '_' only, so that any later
        # underscore in the link is left alone.  The 'player' rows cover both
        # the game stats and the shooting stats table (the scripts for those
        # tables each use one half of them), so dict.fromkeys() removes the
        # repeated links and every player page is fetched just once.
        player_urls = list(dict.fromkeys(
            row.a['href'].replace('_', 'stats/_', 1) for row in html_rows))

        for player in player_urls:
            player_id   = player.split('/')[8]
            html_player = requests.get(player, timeout=30)
            soup_player = BeautifulSoup(html_player.text,'lxml')

            soup_name = soup_player.find('meta', property='og:title')
            player_name = soup_name['content']

            # The rows whose class contains 'row' are split into three equal
            # parts; the third part is used as the season misc totals.
            regular_season_stats = soup_player.find_all('tr', class_=re.compile('row'))
            size = len(regular_season_stats) // 3
            regular_season_misc_totals = regular_season_stats[size * 2:size * 3]

            for season_row in regular_season_misc_totals:
                if len(season_row) != SEASON_MISC_TOTALS_CELLS:
                    continue  # only accept a row that has complete data

                # Build every record on its own, so that a skipped row cannot
                # leave stray id/name values behind or shift other records.
                record = [player_id, player_name]
                record.extend(cell.get_text() for cell in season_row)

                try:
                    c.execute('INSERT INTO regular_season_misc_totals VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', record)
                except sqlite3.IntegrityError:
                    # Constraint violation, e.g. this (id, season, team) is
                    # already stored.  Skipped so the script can be re-run.
                    pass
                except sqlite3.Error as err:
                    # Any other database problem is reported instead of hidden.
                    print('Could not store season row for player', player_id, '-', err)

            conn.commit()
finally:
    conn.close()

print(datetime.now() - startTime)

[[back to top]](#top)