# NBA DPOY Shares

### Anish Shourie

This notebook calculates the cumulative share of the Defensive Player of the Year (DPOY) award vote that a player has earned over his career. *dpoy_shares* is defined as the sum, over every season x in which the player received votes, of:

$$\sum_{x}\dfrac{\text{number of points received in DPOY voting in year x}}{\text{maximum number of points in year x (all the first place votes)}}$$

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from datetime import datetime

# Getting the helper functions from the NBA_bball_ref_help.py file
import os, sys
# Directory containing 'dpoy_shares.ipynb'. The relative file name is resolved
# against the current working directory, so this presumably assumes the kernel
# was started in the notebook's own folder -- TODO confirm.
currentdir = os.path.dirname(os.path.realpath('dpoy_shares.ipynb'))
# One level above that directory
parentdir = os.path.dirname(currentdir)
# Make modules that live in the parent directory importable
sys.path.append(parentdir)

# see Python module in NBA directory
import NBA_bball_ref_help as bb

In [6]:
"""
Builds `dpoy_shares`, a dictionary of cumulative DPOY vote shares.
Each key is a (code, player name) tuple, where the code is the player's
basketball reference URL subdirectory, e.g. /players/b/bryanko01.html
for Kobe Bryant. Each value is that player's summed dpoy_share.
"""

current_year = datetime.now().year

# The first DPOY award was handed out for the 1982-1983 season
years = [season for season in range(1983, current_year + 1)]
dpoy_shares = dict()
    
# scraping the table (commented out on the basketball-reference.com)    
def get_table(comments: list, s: str):
    """Return the first element with id ``s`` found inside any of ``comments``.

    The wanted table is commented out in the page markup, so each comment is
    re-parsed as HTML and searched for the requested id.

    Parameters
    ----------
    comments : list
        Comment nodes (or strings); each is converted with ``str`` and parsed.
    s : str
        Value of the ``id`` attribute to look for.

    Returns
    -------
    The matching element from the first comment that contains one, or ``None``
    when no comment contains it (including when ``comments`` is empty).
    """
    for comment in comments:
        parsed = BeautifulSoup(str(comment), 'lxml')
        table = parsed.find(id = s)
        if table:
            # Stop at the first comment that holds the requested element
            return table

    # Explicit fall-through: nothing matched, or there was nothing to search
    return None


# Walk every season's awards page and add each player's DPOY vote share
# (points won / maximum points) to the running total in dpoy_shares.
for year in years:

    url = "https://www.basketball-reference.com/awards/awards_{}.html".format(year)
    # Close the HTTP response as soon as the page has been parsed
    with urlopen(url) as html:
        # Name the parser explicitly (the same one get_table uses) so the
        # result does not depend on which parsers happen to be installed
        soup = BeautifulSoup(html, 'lxml')
    comments = soup.find_all(string=lambda text:isinstance(text,Comment))

    table = get_table(comments, "dpoy")

    # get_table returns None when no comment holds the table -- presumably the
    # case for a season whose voting has not been published yet
    if table is None:
        print("No DPOY voting table found for {}; skipping".format(year))
        continue

    # The second <tr> holds the column names; the first header cell is dropped
    # because data rows keep that column in a <th>, not a <td>
    headers = [th.getText() for th in table.find_all('tr')[1].findAll('th')]
    headers = headers[1:]
    rows = table.find_all('tr')[1:]
    voting = [[td.getText() for td in tr.findAll('td')] for tr in rows]
    voting = pd.DataFrame(voting, columns = headers)
    # Rows without <td> cells (the header row) come out as all-None; drop them
    voting = voting.dropna()

    # Player URL codes in table order; anchors without an href are ignored
    players = [
        link['href']
        for cell in table.find_all('td')
        for link in cell.find_all('a', href=True)
        if 'players' in link['href']
    ]

    # Codes and voting rows are paired purely by position, so a length
    # mismatch would attribute shares to the wrong player
    if len(players) != len(voting):
        raise ValueError(
            "{}: found {} player links but {} voting rows".format(
                year, len(players), len(voting)
            )
        )

    for code, (index, row) in zip(players, voting.iterrows()):

        share = float(row['Pts Won']) / float(row['Pts Max'])
        tup = (code, row['Player'])

        # updating dictionary with dpoy_shares from the year in the loop
        dpoy_shares[tup] = dpoy_shares.get(tup, 0.0) + share

In [7]:
"""
Quick view: only the cumulative DPOY shares, largest first
(run this cell if that is all you need)
"""

dpoy_simple = (
    pd.DataFrame(list(dpoy_shares.items()), columns=['Code/Name', 'DPOY_shares'])
    .set_index('Code/Name')
    .sort_values(by='DPOY_shares', ascending=False)
)
dpoy_simple.head(5)

Unnamed: 0_level_0,DPOY_shares
Code/Name,Unnamed: 1_level_1
"(/players/w/wallabe01.html, Ben Wallace)",3.746404
"(/players/g/goberru01.html, Rudy Gobert)",3.655935
"(/players/h/howardw01.html, Dwight Howard)",3.242041
"(/players/g/greendr01.html, Draymond Green)",2.210769
"(/players/m/mutomdi01.html, Dikembe Mutombo)",2.146213


In [3]:
"""
Empty frame with the columns of the full output
"""

cols = [
    'Player', 'Seasons', 'Games', 'Minutes', 'DWS',
    'Seasons_>50', 'Seasons_>75', 'Seasons_DPOY', 'Seasons_>50_DPOY', 'Seasons_>75_DPOY',
    'DPOY_awards', 'DPOY_shares',
]
output = pd.DataFrame(columns=cols)
output

Unnamed: 0,Player,Seasons,Games,Minutes,DWS,Seasons_>50,Seasons_>75,Seasons_DPOY,Seasons_>50_DPOY,Seasons_>75_DPOY,DPOY_awards,DPOY_shares


In [4]:
"""
Load the spreadsheet of DPOY winners (basketball-reference data),
indexed by its 'code' column
"""

dpoy_winners = pd.read_excel(io='dpoy_winners.xlsx', index_col='code')
dpoy_winners.head(n=5)

Unnamed: 0_level_0,player,num
code,Unnamed: 1_level_1,Unnamed: 2_level_1
/players/m/mutomdi01.html,Dikembe Mutombo,4
/players/w/wallabe01.html,Ben Wallace,4
/players/g/goberru01.html,Rudy Gobert,3
/players/h/howardw01.html,Dwight Howard,3
/players/e/eatonma01.html,Mark Eaton,2


In [5]:
"""
For every player in dpoy_shares, scrape his advanced-stats table and write
one summary row into `output`, keyed by the player's URL code.
"""

base_url = 'https://www.basketball-reference.com'

# Seasons from this year on count as DPOY-eligible (same threshold the
# voting scrape starts from)
first_dpoy_year = 1983


for p in dpoy_shares.keys():

    # p is a (code, name) tuple; the code is the URL path of the player page
    url = base_url+p[0]

    # Close the HTTP response once parsed, and name the parser explicitly so
    # the result does not depend on which parsers happen to be installed
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'lxml')
    table = soup.find(id='div_advanced')
    if table is None:
        raise ValueError("No 'div_advanced' element found at {}".format(url))

    # First <tr> holds the column names; the rest are data rows whose first
    # cell is a <th>, hence both tag types are collected
    headers = [th.getText() for th in table.find_all('tr')[0].findAll('th')]
    rows = table.find_all('tr')[1:]
    stats = [[tc.getText() for tc in tr.findAll(['th','td'])] for tr in rows]
    stats = pd.DataFrame(stats, columns = headers)

    # Blank cells become NaN; rows without an Age and fully empty columns go
    stats = stats.replace(r'^\s*$', np.nan, regex=True)
    stats = stats.dropna(subset=['Age'])
    stats = stats.dropna(axis=1, how='all')

    stats['G'] = pd.to_numeric(stats['G'])
    stats['MP'] = pd.to_numeric(stats['MP'])
    stats['DWS'] = pd.to_numeric(stats['DWS'])

    # Keep only the first row listed for each season
    stats = stats.drop_duplicates(subset='Season', keep='first')

    seasons = len(stats)
    games = sum(stats['G'])
    minutes = sum(stats['MP'])
    dws = sum(stats['DWS'])

    pct_50 = 0
    pct_50_dpoy = 0
    pct_75 = 0
    pct_75_dpoy = 0
    num_dpoy = 0

    for index, row in stats.iterrows():

        # presumably the season label converted to a single integer year --
        # confirm against NBA_bball_ref_help
        year = bb.get_season_int(season = row['Season'])
        dpoy_era = year >= first_dpoy_year

        if dpoy_era:
            num_dpoy += 1

        # NOTE(review): called once per player-season; if this helper is
        # expensive, results could be cached per (year, league) -- verify
        teams = bb.team_totals(year, row['Lg'])

        if row['Tm'] == 'TOT':
            # No single team to look up: use the largest game count among
            # the teams returned for that season
            tm_games = int(max(teams['G']))
        else:
            team = bb.team_codes(row['Tm'])
            tm_games = int(teams.loc[team,'G'])

        played_75 = row['G'] >= 0.75 * tm_games
        played_50 = row['G'] >= 0.5 * tm_games

        if played_75:
            pct_75 += 1

        if played_50:
            pct_50 += 1

        if played_75 and dpoy_era:
            pct_75_dpoy += 1

        if played_50 and dpoy_era:
            pct_50_dpoy += 1

    output.loc[p[0], 'Player'] = p[1]
    output.loc[p[0], 'Seasons'] = seasons
    output.loc[p[0], 'Games'] = games
    output.loc[p[0], 'Minutes'] = minutes
    output.loc[p[0], 'DWS'] = dws
    output.loc[p[0], 'Seasons_DPOY'] = num_dpoy
    output.loc[p[0], 'Seasons_>50'] = pct_50
    output.loc[p[0], 'Seasons_>75'] = pct_75
    output.loc[p[0], 'Seasons_>50_DPOY'] = pct_50_dpoy
    output.loc[p[0], 'Seasons_>75_DPOY'] = pct_75_dpoy
    try:
        output.loc[p[0], 'DPOY_awards'] = dpoy_winners.loc[p[0], 'num']
    except KeyError:
        # Player is not listed among the DPOY winners
        output.loc[p[0], 'DPOY_awards'] = 0
    output.loc[p[0], 'DPOY_shares'] = dpoy_shares[p]


output.head(5)

Unnamed: 0,Player,Seasons,Games,Minutes,DWS,Seasons_>50,Seasons_>75,Seasons_DPOY,Seasons_>50_DPOY,Seasons_>75_DPOY,DPOY_awards,DPOY_shares
/players/m/moncrsi01.html,Sidney Moncrief,11,767,23150,28.9,10,9,8,7,6,2,0.705897
/players/r/rollitr01.html,Tree Rollins,18,1156,24028,42.1,16,10,13,12,6,0,0.146491
/players/b/birdla01.html,Larry Bird,13,897,34443,59.0,12,10,10,9,7,0,0.093158
/players/c/cheekma01.html,Maurice Cheeks,15,1101,34845,42.6,14,13,11,10,9,0,0.273674
/players/c/coopemi01.html,Michael Cooper,12,873,23635,24.7,11,10,8,8,7,1,0.631687


In [6]:
# Write the full per-player table to disk
output.to_csv(path_or_buf="dpoy_shares.csv")