# Mining NBA Awards
Source: [Basketball-Reference](https://www.basketball-reference.com/leagues/NBA_2021.html)

In [311]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pprint
import re 
import pandas as pd
import numpy as np
import time
from bs4 import Comment

# fix ssl certificate (needed for MacOS sometimes)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Scrape page containing awards

In [407]:
def get_league_champ(soup):
    for p in soup.find_all('p'):
        if 'League Champion' in p.text:
            return p.find('a').get('href')
        
# scoring champ is the player with highest ppg
def get_scoring_champ(soup):
    for p in soup.find_all('p'):
        if 'PPG Leader' in p.text:
            return p.find('a').text
        
def get_rebound_champ(soup):
    for p in soup.find_all('p'):
        if 'RPG Leader' in p.text:
            return p.find('a').text

def get_assist_champ(soup):
    for p in soup.find_all('p'):
        if 'APG Leader' in p.text:
            return p.find('a').text
        
def get_all_nba(soup, season):
    # the data in the table is stored as a js comment
    comment = soup.find('div', {'id':'all_all-nba'}).find(string=lambda text: isinstance(text, Comment))
    soup_c = BeautifulSoup(comment, 'html.parser')
    
    first_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-nba_1'}).find_all('p')]
    second_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-nba_2'}).find_all('p')]
    third_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-nba_3'}).find_all('p')]
    
    history = []
    history.extend([[season, player, 'allNBA1stTeam'] for player in first_team])
    history.extend([[season, player, 'allNBA2ndTeam'] for player in second_team])
    history.extend([[season, player, 'allNBA3rdTeam'] for player in third_team])
    
    return history

def get_all_defensive(soup, season):
    # the data in the table is stored as a js comment
    comment = soup.find('div', {'id':'all_all-defensive'}).find(string=lambda text: isinstance(text, Comment))
    soup_c = BeautifulSoup(comment, 'html.parser')
    
    first_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-defensive_1'}).find_all('p')]
    second_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-defensive_2'}).find_all('p')]
    
    history = []
    history.extend([[season, player, 'allDEF1stTeam'] for player in first_team])
    history.extend([[season, player, 'allDEF2ndTeam'] for player in second_team])
    
    return history
    
def get_all_rookie(soup, season):
    # the data in the table is stored as a js comment
    comment = soup.find('div', {'id':'all_all-rookie'}).find(string=lambda text: isinstance(text, Comment))
    soup_c = BeautifulSoup(comment, 'html.parser')
    
    first_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-rookie_1'}).find_all('p')]
    second_team = [p.text.strip() for p in soup_c.find('div', {'id':'all-rookie_2'}).find_all('p')]
    
    history = []
    history.extend([[season, player, 'allROOKIE1stTeam'] for player in first_team])
    history.extend([[season, player, 'allROOKIE2ndTeam'] for player in second_team])
    
    return history

def get_all_star(soup, season):
    # the data in the table is stored as a js comment
    comment = soup.find('div', {'id':'all_all_star_game_rosters'}).find(string=lambda text: isinstance(text, Comment))
    soup_c = BeautifulSoup(comment, 'html.parser')
    
    first_team = [p.text.strip() for p in soup_c.find('div', {'id':'all_star_game_rosters_1'}).find_all('p')]
    second_team = [p.text.strip() for p in soup_c.find('div', {'id':'all_star_game_rosters_2'}).find_all('p')]
    
    history = []
    history.extend([[season, player, 'allStar'] for player in first_team])
    history.extend([[season, player, 'allStar'] for player in second_team])
    
    return history

def get_all_all_teams(soup, season):
    history = []
    try: 
        history.extend(get_all_nba(soup, season)) 
    except: 
        pass
    try: 
        history.extend(get_all_defensive(soup, season)) 
    except: 
        pass
    try: 
        history.extend(get_all_rookie(soup, season)) 
    except: 
        pass
    try: 
        history.extend(get_all_star(soup, season)) 
    except: 
        pass
    return history

In [422]:
# download html awards this season
years = [i for i in range(1960,2022)]
league_champs = {}
individual_champs = []
all_awd_teams = []


for year in years:
    season = str(year-1) + '-' + str(year)[2:]
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}.html'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    
    # extract data from page
    league_champs[season] = get_league_champ(soup)
    individual_champs.append([season, get_scoring_champ(soup), 'scoreChamp'])
    individual_champs.append([season, get_rebound_champ(soup), 'reboundChamp'])
    individual_champs.append([season, get_assist_champ(soup), 'assistChamp'])
    all_awd_teams.extend(get_all_all_teams(soup, season))
    
    if year % 10 == 0:
        print('scraping season: ' + season)
    

scraping season: 1959-60
scraping season: 1969-70
scraping season: 1979-80
scraping season: 1989-90
scraping season: 1999-00
scraping season: 2009-10
scraping season: 2019-20


In [423]:
# league_champs

In [424]:
# individual_champs

In [425]:
# all_awd_teams

#### extract player names of nba champ

In [426]:
def get_champ_roster(season, link):
    url  = f'https://www.basketball-reference.com{link}'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    
    players = [td.text for td in soup.find('table', {'id':'roster'}).find_all('td', {'data-stat':'player'})]
    
    return [[season, player, 'nbaChamp'] for player in players]

In [427]:
nba_champs = []
for season, link in league_champs.items():
    nba_champs.extend(get_champ_roster(season, link))

In [428]:
# nba_champs

#### Get Individual Awards for all Seasons

* MVP (mvp)
* Rookie of the Year (roy)
* Defensive Player of the Year (dpoy)
* Most Improved Player (mip)
* Sixth Man of the Year (smoy)

In [429]:
def get_award_data(award):
    url = f'https://www.basketball-reference.com/awards/{award}.html'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')
    
    history = []
    for row in soup.find('table', {'id':f'{award}_NBA'}).find('tbody').find_all('tr'):
        season = row.find('th', {'data-stat': 'season'}).text
        player = row.find('td', {'data-stat': 'player'}).text
        history.append([season, player, award])
        
    return history

In [430]:
awards = ['mvp', 'roy', 'dpoy', 'mip', 'smoy']
all_awards = []
for award in awards:
    all_awards.extend(get_award_data(award))

##### merge individual and league awards

In [431]:
df_indiv = pd.DataFrame(individual_champs, columns=['season', 'name', 'award'])
df_awd = pd.DataFrame(all_awards, columns=['season', 'name', 'award'])
df_team = pd.DataFrame(all_awd_teams, columns=['season', 'name', 'award'])
df_champs = pd.DataFrame(nba_champs, columns=['season', 'name', 'award'])

In [468]:
df_team[(df_team['award']=='allStar') & (df_team['season']=='1985-86')]

Unnamed: 0,season,name,award
813,1985-86,Larry Bird*,allStar
814,1985-86,Maurice Cheeks*,allStar
815,1985-86,Julius Erving*,allStar
816,1985-86,Patrick Ewing* (1),allStar
817,1985-86,Michael Jordan* (2),allStar
818,1985-86,Jeff Malone,allStar
819,1985-86,Moses Malone*,allStar
820,1985-86,Kevin McHale*,allStar
821,1985-86,Sidney Moncrief*,allStar
822,1985-86,Robert Parish*,allStar


In [531]:
df = pd.concat([df_indiv, df_awd, df_team, df_champs])

# clean names - remove diactrics, remove anything that is '(Tie)' or '*' or '(1)'
df['name'] = df['name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
df['name'] = df['name'].str.replace('(Tie)', '', regex=False).str.replace('*', '', regex=False).str.strip()
df['name'] = df['name'].str.replace('(1)', '', regex=False).str.replace('*', '', regex=False).str.strip()
df['name'] = df['name'].str.replace('(2)', '', regex=False).str.replace('*', '', regex=False).str.strip()
df['name'] = df['name'].str.replace('(3)', '', regex=False).str.replace('*', '', regex=False).str.strip()

# dummy var to count total awards when calculating cumulative sum later
df['count'] = 1

df

Unnamed: 0,season,name,award,count
0,1959-60,Wilt Chamberlain,scoreChamp,1
1,1959-60,Wilt Chamberlain,reboundChamp,1
2,1959-60,Bob Cousy,assistChamp,1
3,1960-61,Wilt Chamberlain,scoreChamp,1
4,1960-61,Wilt Chamberlain,reboundChamp,1
...,...,...,...,...
911,2020-21,Bobby Portis,nbaChamp,1
912,2020-21,Jeff Teague,nbaChamp,1
913,2020-21,Axel Toupane,nbaChamp,1
914,2020-21,P.J. Tucker,nbaChamp,1


In [532]:
# we need a row for each player and each season
# use a dummy helper to do this
dummy_name_helper = []
for name in df['name'].unique():
    for year in range(2000,2022):
        season = str(year-1) + '-' + str(year)[2:]
        dummy_name_helper.append([season, name, 'dummyaward', 1])

In [533]:
df = pd.concat([df, pd.DataFrame(dummy_name_helper, columns=['season', 'name', 'award', 'count'])])


In [534]:
df = df.pivot(index=['name', 'season'], columns='award', values='count').fillna(0).reset_index()
df.drop('dummyaward', axis=1, inplace=True)
df = df.sort_values(['season', 'name'])
df

award,name,season,allDEF1stTeam,allDEF2ndTeam,allNBA1stTeam,allNBA2ndTeam,allNBA3rdTeam,allROOKIE1stTeam,allROOKIE2ndTeam,allStar,assistChamp,dpoy,mip,mvp,nbaChamp,reboundChamp,roy,scoreChamp,smoy
19651,Paul Hoffman,1947-48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10152,Howie Shannon,1948-49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
355,Alex Groza,1949-50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
19603,Paul Arizin,1950-51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1748,Bill Tosheff,1951-52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25962,Zelmo Beaty,2020-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25984,Zion Williamson,2020-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26007,Zydrunas Ilgauskas,2020-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16730,Marcus Smart,2021-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [535]:
df = df.groupby(['name', 'season']).sum() \
      .groupby(level=0).cumsum().reset_index()
df

award,name,season,allDEF1stTeam,allDEF2ndTeam,allNBA1stTeam,allNBA2ndTeam,allNBA3rdTeam,allROOKIE1stTeam,allROOKIE2ndTeam,allStar,assistChamp,dpoy,mip,mvp,nbaChamp,reboundChamp,roy,scoreChamp,smoy
0,A.C. Green,1986-87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,A.C. Green,1987-88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2,A.C. Green,1988-89,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,A.C. Green,1989-90,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
4,A.C. Green,1999-00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26003,Zydrunas Ilgauskas,2016-17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26004,Zydrunas Ilgauskas,2017-18,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26005,Zydrunas Ilgauskas,2018-19,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26006,Zydrunas Ilgauskas,2019-20,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [536]:
cols = ['name', 'season', 'nbaChamp', 'mvp']
df[df['name']=='Stephen Curry'][cols]
# df[df['name']=='Michael Jordan'][cols]
# df[df['name']=='Bill Russell'][cols].head(40)
# df[df['name']=='LeBron James'][cols]
# df[df['name']=='Allen Iverson'][cols]


award,name,season,nbaChamp,mvp
22934,Stephen Curry,1999-00,0.0,0.0
22935,Stephen Curry,2000-01,0.0,0.0
22936,Stephen Curry,2001-02,0.0,0.0
22937,Stephen Curry,2002-03,0.0,0.0
22938,Stephen Curry,2003-04,0.0,0.0
22939,Stephen Curry,2004-05,0.0,0.0
22940,Stephen Curry,2005-06,0.0,0.0
22941,Stephen Curry,2006-07,0.0,0.0
22942,Stephen Curry,2007-08,0.0,0.0
22943,Stephen Curry,2008-09,0.0,0.0


In [537]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26008 entries, 0 to 26007
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              26008 non-null  object 
 1   season            26008 non-null  object 
 2   allDEF1stTeam     26008 non-null  float64
 3   allDEF2ndTeam     26008 non-null  float64
 4   allNBA1stTeam     26008 non-null  float64
 5   allNBA2ndTeam     26008 non-null  float64
 6   allNBA3rdTeam     26008 non-null  float64
 7   allROOKIE1stTeam  26008 non-null  float64
 8   allROOKIE2ndTeam  26008 non-null  float64
 9   allStar           26008 non-null  float64
 10  assistChamp       26008 non-null  float64
 11  dpoy              26008 non-null  float64
 12  mip               26008 non-null  float64
 13  mvp               26008 non-null  float64
 14  nbaChamp          26008 non-null  float64
 15  reboundChamp      26008 non-null  float64
 16  roy               26008 non-null  float6

In [538]:
df.to_csv('../data/playerCumulativeAwards.csv', index=False)