In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request

# This notebook adds the important variables of home-team points scored, visiting-team points scored, starting players, referees, and expected points per team based on the starting players' average points per game

In [2]:
# Download and parse the NBA_2020 season-totals page.
# The response is opened in a `with` block so the connection is closed
# as soon as BeautifulSoup has read the markup.
html = 'https://www.basketball-reference.com/leagues/NBA_2020_totals.html#totals_stats::fg'
with urlopen(html) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [3]:
# Preview only the beginning of the prettified markup; printing the whole
# page floods the notebook output and bloats the saved file.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202005051" rel="dns-prefetch"/>
   <!-- no:cookie fast load the css.           -->
   <script>
    function gup(n) {n = n.replace(/[\[]/, '\\[').replace(/[\]]/, '\\]'); var r = new RegExp('[\\?&]'+n+'=([^&#]*)'); var re = r.exec(location.search);   return re === null?'':decodeURIComponent(re[1].replace(/\+/g,' '));}; document.srdev = gup('srdev')
   </script>
   <link crossorigin="" href="https://d2p3bygnnzw9w3.cloudfront.net" rel="preconnect"/>
   <link crossorigin="" href="https://d2cwpp38twqe55.cloudfront.net" rel="preconnect"/>
   <style>
   </style>
   <link as="style" crossorigin="" href="https://d

In [4]:
# Display the first <td> element of the page to see which attributes
# (e.g. `data-stat`) can be used to select the columns scraped below.
soup.td

<td class="left" csk="Adams,Steven" data-append-csv="adamsst01" data-stat="player"><a href="/players/a/adamsst01.html">Steven Adams</a></td>

In [5]:
# Text of every <td> whose data-stat attribute is "player".
players = [cell.text for cell in soup.find_all('td', attrs={'data-stat': 'player'})]

In [6]:
# Text of every <td> whose data-stat attribute is "g".
# NOTE: the name `games` is rebound to a DataFrame loaded from CSV in the
# "Integrate to table" section of this notebook.
games = [cell.text for cell in soup.find_all('td', attrs={'data-stat': 'g'})]

In [7]:
# Text of every <td> whose data-stat attribute is "gs".
games_started = [cell.text for cell in soup.find_all('td', attrs={'data-stat': 'gs'})]

In [8]:
# Text of every <td> whose data-stat attribute is "mp".
minutes_played = [cell.text for cell in soup.find_all('td', attrs={'data-stat': 'mp'})]

In [9]:
# Text of every <td> whose data-stat attribute is "pts".
points = [cell.text for cell in soup.find_all('td', attrs={'data-stat': 'pts'})]

# This creates a dataframe of each individual player's season totals, from which the per-game averages are computed below

In [10]:
# Assemble the scraped lists into one frame, one row per scraped table row.
df = pd.DataFrame({
    'players': players,
    'games': games,
    'starters': games_started,
    'minutes': minutes_played,
    'points': points,
})

# The scraped values are text: cast the names to pandas' string dtype and
# the four count columns to int32 in a single pass.
df = df.astype({
    'players': 'string',
    'games': 'int32',
    'starters': 'int32',
    'minutes': 'int32',
    'points': 'int32',
})

In [11]:
# Confirm the casts above took effect (string names, int32 counts).
df.dtypes

players     string
games        int32
starters     int32
minutes      int32
points       int32
dtype: object

In [12]:
# Derive scoring rates from the totals.  A zero denominator (e.g. a player
# with no starts) yields +/-inf, which is turned into NaN at the end.
df = df.assign(
    points_per_game=df['points'] / df['games'],
    points_per_start=df['points'] / df['starters'],
    points_per_minutes=df['points'] / df['minutes'],
).replace([np.inf, -np.inf], np.nan)

In [13]:
# Peek at the first rows, including the derived rate columns.
df.head()

Unnamed: 0,players,games,starters,minutes,points,points_per_game,points_per_start,points_per_minutes
0,Steven Adams,58,58,1564,633,10.913793,10.913793,0.404731
1,Bam Adebayo,65,65,2235,1053,16.2,16.2,0.471141
2,LaMarcus Aldridge,53,53,1754,1001,18.886792,18.886792,0.570696
3,Nickeil Alexander-Walker,41,0,501,211,5.146341,,0.421158
4,Grayson Allen,30,0,498,221,7.366667,,0.443775


In [16]:
# Rank players by points per game, highest first (display only; `df` itself is not reordered).
df.sort_values(by='points_per_game', ascending=False)

Unnamed: 0,players,games,starters,minutes,points,points_per_game,points_per_start,points_per_minutes
245,James Harden,61,61,2241,2096,34.360656,34.360656,0.935297
39,Bradley Beal,57,57,2053,1741,30.543860,30.543860,0.848027
11,Giannis Antetokounmpo,57,57,1763,1690,29.649123,29.649123,0.958593
620,Trae Young,60,60,2120,1778,29.633333,29.633333,0.838679
362,Damian Lillard,58,58,2140,1677,28.913793,28.913793,0.783645
...,...,...,...,...,...,...,...,...
596,Quinndary Weatherspoon,3,0,15,0,0.000000,,0.000000
594,Paul Watson,2,0,17,0,0.000000,,0.000000
443,Zach Norvell,2,0,5,0,0.000000,,0.000000
372,J.P. Macura,1,0,1,0,0.000000,,0.000000


# Getting starting players and referee (example)

In [184]:
# Download and parse one example box-score page (2020-03-11, home team ATL).
# The response is opened in a `with` block so the connection is closed
# as soon as BeautifulSoup has read the markup.
html = 'https://www.basketball-reference.com/boxscores/202003110ATL.html'
with urlopen(html) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [185]:
# All <table> elements whose id is "box-ATL-game-basic".
tables = soup.find_all('table', attrs={'id': 'box-ATL-game-basic'})

In [186]:
# Comma-joined text of the first five player header cells of the table,
# which this section treats as the starting five.
','.join(cell.text for cell in tables[0].find_all('th', attrs={'class': 'left', 'data-stat': 'player'})[:5])

"John Collins,Trae Young,De'Andre Hunter,Kevin Huerter,Dewayne Dedmon"

In [187]:
# All <table> elements whose id is "box-NYK-game-basic".
tables = soup.find_all('table', attrs={'id': 'box-NYK-game-basic'})

In [188]:
# Comma-joined text of the first five player header cells of the table,
# which this section treats as the starting five.
','.join(cell.text for cell in tables[0].find_all('th', attrs={'class': 'left', 'data-stat': 'player'})[:5])

'Maurice Harkless,RJ Barrett,Julius Randle,Elfrid Payton,Taj Gibson'

In [110]:
# Collect every <div> on the page; the referee block is picked out of this list by position below.
div = soup.find_all('div')

In [189]:
# Comma-joined link texts of the 59th <div> counted from the end of the page.
# NOTE(review): -59 is a hard-coded position; presumably it breaks if the
# page layout changes -- confirm before reusing on other pages.
','.join(link.text for link in div[-59].find_all('a'))

'Derrick Collins,Ken Mauer,Leon Wood'

# Team initials

In [117]:
# Download and parse the Wikipedia page listing NBA team abbreviations.
# The response is opened in a `with` block so the connection is closed
# as soon as BeautifulSoup has read the markup.
html = 'https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations'
with urlopen(html) as response:
    soup = BeautifulSoup(response, 'html.parser')

In [119]:
# Collect every <tbody>; only the first one (tab[0]) is used below.
tab = soup.find_all('tbody')

In [131]:
# Cell texts of the first table body, skipping its first two <td> elements
# and dropping the last character of each text.
# NOTE(review): the dropped character is presumably a trailing newline -- confirm.
test = [cell.text[:-1] for cell in tab[0].find_all('td')[2:]]

In [142]:
# Fold the flat cell list into 30 (abbreviation, name) rows, then split the
# two columns into parallel arrays.
initials, team_names = np.array(test).reshape(30, 2).T

In [148]:
# Sanity check: boolean-mask lookup of a single abbreviation.
initials[initials == 'BOS'][0]

'BOS'

In [211]:
# Correction: overwrite three scraped abbreviations with the codes that the
# box-score URLs and table ids used later in this notebook are built from.
for scraped_code, corrected_code in (('CHA', 'CHO'), ('BKN', 'BRK'), ('PHX', 'PHO')):
    initials[initials == scraped_code] = corrected_code

In [212]:
# Display the abbreviation array after the corrections.
initials

array(['ATL', 'BRK', 'BOS', 'CHO', 'CHI', 'CLE', 'DAL', 'DEN', 'DET',
       'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
       'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS',
       'TOR', 'UTA', 'WAS'], dtype='<U22')

# Integrate to table

In [213]:
# Load the per-game table from a CSV in the user's Downloads folder.
# NOTE: this rebinds `games`, which earlier in the notebook held the scraped
# list of games-played strings used to build `df`.
# NOTE(review): the path is specific to one machine -- consider a configurable data directory.
games = pd.read_csv('~/Downloads/nba_games_2020.csv')

In [206]:
# Peek at the loaded game table.
games.head()

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1,20787
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0,19068
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0,15424
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0,17923
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0,18846


In [214]:
def pick_initial(team, codes=None, names=None):
    """Return the abbreviation paired with a full team name.

    Parameters
    ----------
    team : str
        Full team name to look up.
    codes, names : array-like, optional
        Parallel sequences of abbreviations and full team names.  When
        omitted, the notebook-level ``initials`` and ``team_names`` arrays
        are used, which is how ``Series.apply(pick_initial)`` calls it.

    Returns
    -------
    The first element of ``codes`` whose corresponding name equals ``team``.

    Raises
    ------
    IndexError
        If ``team`` is not present in ``names``.  The message names the team,
        instead of the bare "index 0 is out of bounds" that an empty mask
        lookup would produce.
    """
    codes = np.asarray(initials if codes is None else codes)
    names = np.asarray(team_names if names is None else names)
    matches = codes[names == team]
    if len(matches) == 0:
        raise IndexError('no abbreviation found for team %r' % (team,))
    return matches[0]

# Translate each home team's full name into its abbreviation.
games['home_initial'] = games['home_team'].map(pick_initial)

In [215]:
# Translate each visiting team's full name into its abbreviation.
games['vis_initial'] = games['vis_team'].map(pick_initial)

games.head()

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance,home_initial,vis_initial
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1,20787,TOR,NOP
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0,19068,LAC,LAL
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0,15424,CHO,CHI
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0,17923,IND,DET
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0,18846,ORL,CLE


In [216]:
# Pieces used to build each box-score URL and table id: both teams'
# abbreviations and the game date with its dashes removed.
home = games['home_initial']
vis = games['vis_initial']
strdate = games['date'].str.replace('-', '', regex=False)

In [219]:
def _starting_five(page, team_code):
    """Return the first five player names of a team's basic box-score table, comma-joined."""
    box = page.find_all('table', {'id': 'box-%s-game-basic' % team_code})[0]
    name_cells = box.find_all('th', {'class': 'left', 'data-stat': 'player'})[:5]
    return ','.join(cell.text for cell in name_cells)


home_players = []
vis_players  = []
referees     = []

# Walk the three Series in lockstep by value, so the loop does not depend on
# `games` carrying a default 0..n-1 index.
# NOTE(review): requests are issued back to back with no delay or retry;
# consider throttling if the site rate-limits.
for game_date, home_code, vis_code in zip(strdate, home, vis):
    html = 'https://www.basketball-reference.com/boxscores/%s0%s.html' % (game_date, home_code)
    print(html)
    # Close each HTTP response as soon as the page has been parsed instead of
    # leaving one open connection per game.
    with urlopen(html) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # Home and visitor starters come from the same page, one table per team.
    home_players.append(_starting_five(soup, home_code))
    vis_players.append(_starting_five(soup, vis_code))

    # Referees: link texts of the 59th <div> counted from the end of the page.
    # NOTE(review): -59 is a hard-coded position; presumably it breaks if the
    # page layout differs between games -- confirm.
    div = soup.find_all('div')
    referees.append(','.join(link.text for link in div[-59].find_all('a')))

# Adding columns
games['home_players'] = home_players
games['vis_players']  = vis_players
games['referees']     = referees

https://www.basketball-reference.com/boxscores/201910220TOR.html
https://www.basketball-reference.com/boxscores/201910220LAC.html
https://www.basketball-reference.com/boxscores/201910230CHO.html
https://www.basketball-reference.com/boxscores/201910230IND.html
https://www.basketball-reference.com/boxscores/201910230ORL.html
https://www.basketball-reference.com/boxscores/201910230BRK.html
https://www.basketball-reference.com/boxscores/201910230MIA.html
https://www.basketball-reference.com/boxscores/201910230PHI.html
https://www.basketball-reference.com/boxscores/201910230DAL.html
https://www.basketball-reference.com/boxscores/201910230SAS.html
https://www.basketball-reference.com/boxscores/201910230UTA.html
https://www.basketball-reference.com/boxscores/201910230PHO.html
https://www.basketball-reference.com/boxscores/201910230POR.html
https://www.basketball-reference.com/boxscores/201910240DET.html
https://www.basketball-reference.com/boxscores/201910240HOU.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/201911090CHO.html
https://www.basketball-reference.com/boxscores/201911090CHI.html
https://www.basketball-reference.com/boxscores/201911090MEM.html
https://www.basketball-reference.com/boxscores/201911090OKC.html
https://www.basketball-reference.com/boxscores/201911100MIN.html
https://www.basketball-reference.com/boxscores/201911100ORL.html
https://www.basketball-reference.com/boxscores/201911100PHI.html
https://www.basketball-reference.com/boxscores/201911100OKC.html
https://www.basketball-reference.com/boxscores/201911100NYK.html
https://www.basketball-reference.com/boxscores/201911100PHO.html
https://www.basketball-reference.com/boxscores/201911100POR.html
https://www.basketball-reference.com/boxscores/201911100LAL.html
https://www.basketball-reference.com/boxscores/201911110DET.html
https://www.basketball-reference.com/boxscores/201911110BOS.html
https://www.basketball-reference.com/boxscores/201911110SAS.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/201911270CLE.html
https://www.basketball-reference.com/boxscores/201911270IND.html
https://www.basketball-reference.com/boxscores/201911270PHI.html
https://www.basketball-reference.com/boxscores/201911270TOR.html
https://www.basketball-reference.com/boxscores/201911270HOU.html
https://www.basketball-reference.com/boxscores/201911270MEM.html
https://www.basketball-reference.com/boxscores/201911270MIL.html
https://www.basketball-reference.com/boxscores/201911270SAS.html
https://www.basketball-reference.com/boxscores/201911270PHO.html
https://www.basketball-reference.com/boxscores/201911270NOP.html
https://www.basketball-reference.com/boxscores/201911270POR.html
https://www.basketball-reference.com/boxscores/201911270GSW.html
https://www.basketball-reference.com/boxscores/201911290BRK.html
https://www.basketball-reference.com/boxscores/201911290DET.html
https://www.basketball-reference.com/boxscores/201911290ORL.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/201912140CHI.html
https://www.basketball-reference.com/boxscores/201912140MEM.html
https://www.basketball-reference.com/boxscores/201912140DAL.html
https://www.basketball-reference.com/boxscores/201912140MIL.html
https://www.basketball-reference.com/boxscores/201912140DEN.html
https://www.basketball-reference.com/boxscores/201912140HOU.html
https://www.basketball-reference.com/boxscores/201912150NOP.html
https://www.basketball-reference.com/boxscores/201912150IND.html
https://www.basketball-reference.com/boxscores/201912150ATL.html
https://www.basketball-reference.com/boxscores/201912150BRK.html
https://www.basketball-reference.com/boxscores/201912150DEN.html
https://www.basketball-reference.com/boxscores/201912150GSW.html
https://www.basketball-reference.com/boxscores/201912160DET.html
https://www.basketball-reference.com/boxscores/201912160TOR.html
https://www.basketball-reference.com/boxscores/201912160HOU.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/202001020IND.html
https://www.basketball-reference.com/boxscores/202001020MIA.html
https://www.basketball-reference.com/boxscores/202001020CHI.html
https://www.basketball-reference.com/boxscores/202001020MIN.html
https://www.basketball-reference.com/boxscores/202001020DAL.html
https://www.basketball-reference.com/boxscores/202001020SAS.html
https://www.basketball-reference.com/boxscores/202001020SAC.html
https://www.basketball-reference.com/boxscores/202001020LAC.html
https://www.basketball-reference.com/boxscores/202001030BOS.html
https://www.basketball-reference.com/boxscores/202001030ORL.html
https://www.basketball-reference.com/boxscores/202001030WAS.html
https://www.basketball-reference.com/boxscores/202001030HOU.html
https://www.basketball-reference.com/boxscores/202001030PHO.html
https://www.basketball-reference.com/boxscores/202001030LAL.html
https://www.basketball-reference.com/boxscores/202001040LAC.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/202001180UTA.html
https://www.basketball-reference.com/boxscores/202001190SAS.html
https://www.basketball-reference.com/boxscores/202001190DEN.html
https://www.basketball-reference.com/boxscores/202001200WAS.html
https://www.basketball-reference.com/boxscores/202001200ATL.html
https://www.basketball-reference.com/boxscores/202001200BRK.html
https://www.basketball-reference.com/boxscores/202001200CHO.html
https://www.basketball-reference.com/boxscores/202001200CLE.html
https://www.basketball-reference.com/boxscores/202001200HOU.html
https://www.basketball-reference.com/boxscores/202001200MEM.html
https://www.basketball-reference.com/boxscores/202001200MIA.html
https://www.basketball-reference.com/boxscores/202001200MIL.html
https://www.basketball-reference.com/boxscores/202001200BOS.html
https://www.basketball-reference.com/boxscores/202001200MIN.html
https://www.basketball-reference.com/boxscores/202001200PHO.html
https://www.basketball-re

https://www.basketball-reference.com/boxscores/202003060NYK.html
https://www.basketball-reference.com/boxscores/202003060CHI.html
https://www.basketball-reference.com/boxscores/202003060MIN.html
https://www.basketball-reference.com/boxscores/202003060NOP.html
https://www.basketball-reference.com/boxscores/202003060BOS.html
https://www.basketball-reference.com/boxscores/202003060DAL.html
https://www.basketball-reference.com/boxscores/202003060PHO.html
https://www.basketball-reference.com/boxscores/202003060LAL.html
https://www.basketball-reference.com/boxscores/202003070CHO.html
https://www.basketball-reference.com/boxscores/202003070DET.html
https://www.basketball-reference.com/boxscores/202003070CLE.html
https://www.basketball-reference.com/boxscores/202003070MEM.html
https://www.basketball-reference.com/boxscores/202003070GSW.html
https://www.basketball-reference.com/boxscores/202003070POR.html
https://www.basketball-reference.com/boxscores/202003080BRK.html
https://www.basketball-re

In [222]:
# Peek at the table with the scraped starters and referees attached.
games.head()

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance,home_initial,vis_initial,home_players,vis_players,referees
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1,20787,TOR,NOP,"Kyle Lowry,Fred VanVleet,Pascal Siakam,OG Anun...","Jrue Holiday,Brandon Ingram,J.J. Redick,Lonzo ...","Tony Brown,Eric Dalen,Josh Tiven"
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0,19068,LAC,LAL,"Kawhi Leonard,Patrick Beverley,Landry Shamet,P...","Anthony Davis,LeBron James,Danny Green,Avery B...","Kane Fitzgerald,Mark Lindsay,Dedric Taylor"
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0,15424,CHO,CHI,"P.J. Washington,Cody Zeller,Dwayne Bacon,Miles...","Lauri Markkanen,Zach LaVine,Wendell Carter,Ott...","Rodney Mott,Gediminas Petraitis,Leroy Richardson"
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0,17923,IND,DET,"Myles Turner,Malcolm Brogdon,Domantas Sabonis,...","Andre Drummond,Tony Snell,Bruce Brown,Markieff...","Marc Davis,Brett Nansel,Ben Taylor"
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0,18846,ORL,CLE,"Nikola Vučević,Aaron Gordon,Evan Fournier,D.J....","Kevin Love,Tristan Thompson,Darius Garland,Col...","Brent Barnaky,Eric Lewis,Haywoode Workman"


# The function below computes the expected points for a starting lineup as the sum of the players' individual average points per game

In [298]:
def get_score(players, stats=None):
    """Sum the points-per-game averages of a comma-separated list of players.

    Parameters
    ----------
    players : str
        Player names separated by commas (no surrounding spaces), as stored
        in the `home_players` / `vis_players` columns.
    stats : pandas.DataFrame, optional
        Table with `players` and `points_per_game` columns.  When omitted,
        the notebook-level `df` is used, which is how
        ``Series.apply(get_score)`` calls it.

    Returns
    -------
    The sum of `points_per_game` over the names found in the table; 0 if
    none is found.  A name with no exact match contributes nothing, and when
    a name matches several rows only the first row is counted.
    """
    table = df if stats is None else stats
    total = 0
    for name in players.split(','):
        ppg = table.loc[table['players'] == name, 'points_per_game'].values
        if len(ppg) > 0:
            total += ppg[0]
    return total

In [299]:
# Expected score of each home starting lineup.
games['exp_score_home'] = games['home_players'].map(get_score)

In [301]:
# Expected score of each visiting starting lineup.
games['exp_score_vis'] = games['vis_players'].map(get_score)

# Now we have an expected score for each team 

In [302]:
# Peek at the final table, now including both expected-score columns.
games.head()

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance,home_initial,vis_initial,home_players,vis_players,referees,exp_score_home,exp_score_vis
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1,20787,TOR,NOP,"Kyle Lowry,Fred VanVleet,Pascal Siakam,OG Anun...","Jrue Holiday,Brandon Ingram,J.J. Redick,Lonzo ...","Tony Brown,Eric Dalen,Josh Tiven",79.32962,80.35398
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0,19068,LAC,LAL,"Kawhi Leonard,Patrick Beverley,Landry Shamet,P...","Anthony Davis,LeBron James,Danny Green,Avery B...","Kane Fitzgerald,Mark Lindsay,Dedric Taylor",57.011597,76.038714
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0,15424,CHO,CHI,"P.J. Washington,Cody Zeller,Dwayne Bacon,Miles...","Lauri Markkanen,Zach LaVine,Wendell Carter,Ott...","Rodney Mott,Gediminas Petraitis,Leroy Richardson",60.023165,62.076264
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0,17923,IND,DET,"Myles Turner,Malcolm Brogdon,Domantas Sabonis,...","Andre Drummond,Tony Snell,Bruce Brown,Markieff...","Marc Davis,Brett Nansel,Ben Taylor",77.9017,57.526521
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0,18846,ORL,CLE,"Nikola Vučević,Aaron Gordon,Evan Fournier,D.J....","Kevin Love,Tristan Thompson,Darius Garland,Col...","Brent Barnaky,Eric Lewis,Haywoode Workman",75.108187,73.763983


In [303]:
# Write the enriched game table to nba.csv in the current working directory.
# NOTE(review): the row index is written as an unnamed leading column
# (no index=False) -- confirm downstream readers expect that.
games.to_csv('nba.csv')