# Mining NBA Contracts
Source: [Basketball-Reference](https://www.basketball-reference.com/players/j/jamesle01.html)

In [1]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pprint
import re 
import pandas as pd
import numpy as np
import time

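# WARNING: the override below turns off HTTPS certificate verification for default contexts (workaround for certificate errors sometimes seen on macOS)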
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Get links to the pages of all players who appeared in the 2012–2021 seasons

In [2]:
# Season identifiers used in the totals-page URL: 2012 through 2021 inclusive.
years = list(range(2012, 2022))

# Maps player display name -> site-relative link to that player's page.
# NOTE(review): the key is the display name, so two different players who share
# a name collapse into a single entry (the first link seen is kept).
players_links = {}

for year in years:
    # download html for all players that played this season
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    # The context manager closes the HTTP response as soon as the body is read.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')

    # extract all players/links
    for row in soup.find('table').find_all('tr', {'class': 'full_table'}):
        anchor = row.find('a')
        if anchor is None:
            # A row without any link cannot be mapped to a player page:
            # report it and move on instead of hiding it behind a bare except.
            print('failed on')
            print(row)
            continue
        # setdefault keeps the first link recorded for a given name.
        players_links.setdefault(anchor.text, anchor.get('href'))

    print('year: ' + str(year))
    print('players: ' + str(len(players_links)))
    print()

year: 2012
players: 478

year: 2013
players: 568

year: 2014
players: 654

year: 2015
players: 737

year: 2016
players: 811

year: 2017
players: 901

year: 2018
players: 1020

year: 2019
players: 1125

year: 2020
players: 1243

year: 2021
players: 1337



In [3]:
# players_links

### Get stats for each player

In [4]:
def download_html(link):
    """Download a basketball-reference page and parse it.

    Args:
        link: Site-relative path; it is appended to the site root unchanged.

    Returns:
        The page parsed by BeautifulSoup with the 'html.parser' backend.
    """
    url = f'https://www.basketball-reference.com{link}'
    # Use the response as a context manager so the connection is released
    # even if reading or decoding the body raises.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, 'html.parser')

def get_bio(soup, name):
    """Collect biographical and draft details from a parsed player page.

    Args:
        soup: Parsed player page; it is searched for a <div id="info"> element.
        name: Player name stored under the 'name' key.

    Returns:
        dict with keys 'name', 'height', 'weight', 'birthDate', 'draftRound',
        'draftOverall', 'draftYear' and 'undrafted' (in that order).
        'height', 'weight' and 'birthDate' are None when the page has no
        matching element. The draft fields keep their -1 defaults when no
        usable draft line is found; 'undrafted' is True only when no paragraph
        mentions 'NBA Draft'.

    Raises:
        ValueError: if the page has no <div id="info"> element.
    """
    info = soup.find('div', {"id": "info"})
    if info is None:
        raise ValueError('no div#info section found for ' + str(name))

    height = info.find('span', {"itemprop": "height"})
    weight = info.find('span', {"itemprop": "weight"})
    birth = info.find('span', {"itemprop": "birthDate"})

    # A missing span yields None for that field rather than failing the
    # whole player.
    bio = {
        'name': name,
        'height': height.text if height is not None else None,
        'weight': weight.text if weight is not None else None,
        'birthDate': birth.get('data-birth') if birth is not None else None,
        # get draft info
        'draftRound': -1,
        'draftOverall': -1,
        'draftYear': -1,
        'undrafted': False,
    }

    # Keep the text of the last paragraph that mentions the draft.
    draft_text = ''
    for p in info.find_all('p'):
        if 'NBA Draft' in p.text:
            draft_text = p.text

    if draft_text == '':
        bio['undrafted'] = True
        return bio

    # Everything after the first comma is treated as "<round> ..., <overall> ...,
    # <year> ..."; the first whitespace-separated token of each piece is kept.
    # NOTE(review): the exact wording of the draft line is not visible in this
    # file - confirm against a live page.
    fields = [piece.split() for piece in draft_text.split(',')[1:]]
    if len(fields) >= 3 and all(fields[:3]):
        bio['draftRound'] = fields[0][0]
        bio['draftOverall'] = fields[1][0]
        bio['draftYear'] = fields[2][0]
    # else: unexpected layout - leave the -1 defaults instead of raising.

    return bio

# def get_awards(soup):
#     awards_list = soup.find('div', {"id": "info"}).find('ul', {'id': 'bling'}).find_all('li')
#     return [award.text for award in awards_list]

def get_stats_headers(soup):
    """Return ['Name'] followed by the text of every <th> in the thead of the 'per_game' table."""
    per_game = soup.find('table', {"id": "per_game"})
    labels = ['Name']
    for th in per_game.find('thead').find_all('th'):
        labels.append(th.text)
    return labels

def get_stats(soup, name):
    """Return one list per <tr> in the tbody of the 'per_game' table.

    Each list starts with `name`, followed by the text of every <th>/<td>
    cell of that row.
    """
    body = soup.find('table', {"id": "per_game"}).find('tbody')
    return [
        [name] + [cell.text for cell in tr.find_all(['th', 'td'])]
        for tr in body.find_all('tr')
    ]

def get_adv_stats_headers(soup):
    """Return ['Name'] plus the header texts of the 'advanced' table.

    Header cells whose text is exactly a non-breaking space ('\\xa0') are left out.
    """
    advanced = soup.find('table', {"id": "advanced"})
    labels = ['Name']
    for th in advanced.find('thead').find_all('th'):
        if th.text != '\xa0':
            labels.append(th.text)
    return labels

def get_adv_stats(soup, name):
    """Return one list per <tr> in the tbody of the 'advanced' table.

    Each list is `name` followed by the text of the row's <th>/<td> cells,
    with every empty-string entry removed.

    NOTE(review): the filter removes *every* empty cell, so a row with a
    blank stat comes out shorter and its later values shift left relative
    to the headers - confirm this is intended.
    """
    rows = []
    for tr in soup.find('table', {"id": "advanced"}).find('tbody').find_all('tr'):
        cells = [name] + [cell.text for cell in tr.find_all(['th', 'td'])]
        rows.append([value for value in cells if value != ''])
    return rows

In [19]:
# pl = {'John Wall': '/players/w/walljo01.html'}

In [20]:
# Accumulators filled by the loop below.
players_bios = []               # one bio dict per successfully scraped player
players_stats_headers = []      # column names of the per-game table
players_adv_stats_headers = []  # column names of the advanced table
players_stats = []              # per-game rows for all players
players_adv_stats = []          # advanced rows for all players

for player, link in players_links.items():
    try:
        soup = download_html(link)
        # Parse everything first and store nothing until all of it succeeded,
        # so a failure part-way cannot leave a bio without stats (or basic
        # rows without the matching advanced rows).
        bio = get_bio(soup, player)
        stats = get_stats(soup, player)
        adv_stats = get_adv_stats(soup, player)

        # get headers for tables if we don't have them yet
        headers = None
        if len(players_stats_headers) == 0:
            headers = (get_stats_headers(soup), get_adv_stats_headers(soup))
    except Exception as err:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working during the long scrape; the error is reported so failures
        # can be diagnosed.
        print('failed at ' + player + ' (' + repr(err) + ')')
        continue

    players_bios.append(bio)
    players_stats.extend(stats)
    players_adv_stats.extend(adv_stats)
    if headers is not None:
        # Both header lists are set together so they cannot get out of step.
        players_stats_headers, players_adv_stats_headers = headers

    if len(players_bios) % 50 == 0:
        print(str(len(players_bios)) + 'th player paged scraped')
        # NOTE(review): this pause runs only after every 50th stored page,
        # not between individual requests.
        time.sleep(1) # do not want to get blocked by basketball-ref

50th player paged scraped
100th player paged scraped
150th player paged scraped
200th player paged scraped
250th player paged scraped
300th player paged scraped
350th player paged scraped
400th player paged scraped
450th player paged scraped
500th player paged scraped
550th player paged scraped
600th player paged scraped
650th player paged scraped
700th player paged scraped
750th player paged scraped
800th player paged scraped
850th player paged scraped
900th player paged scraped
950th player paged scraped
1000th player paged scraped
1050th player paged scraped
1100th player paged scraped
1150th player paged scraped
1200th player paged scraped
1250th player paged scraped
1300th player paged scraped


In [21]:
len(players_bios)

1337

### Create df and calculate lags

In [22]:
# Per-game and advanced tables as frames; the advanced frame loses its 'MP'
# column before the join.
df_basic = pd.DataFrame(players_stats, columns=players_stats_headers)
df_adv = pd.DataFrame(players_adv_stats, columns=players_adv_stats_headers).drop(columns=['MP'])

# Join the two tables on their shared descriptive columns, discard rows with
# any missing value, and keep the first row per (Name, Season, Lg).
merge_keys = ['Name', 'Season', 'Age', 'Tm', 'Pos', 'G', 'Lg']
df_stats = (
    df_basic
    .merge(df_adv, on=merge_keys)
    .dropna()
    .drop_duplicates(subset=['Name', 'Season', 'Lg'], keep='first')
    .copy()  # standalone frame, safe for the column assignment that follows
)

In [23]:
# Coerce every column except the descriptive text ones to numbers;
# values that cannot be parsed become NaN.
text_cols = ['Name', 'Season', 'Tm', 'Pos', 'Lg']
cols = df_stats.columns.drop(text_cols)
df_stats[cols] = df_stats[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [24]:
# Write the merged per-season stats to disk, without the index column.
stats_csv_path = '../data/playerStats.csv'
df_stats.to_csv(stats_csv_path, index=False)

In [25]:
# Lagged features: for each row, attach the same player's values from one and
# two rows earlier (in df_stats row order) with '_prev1' / '_prev2' suffixes.
by_player = df_stats.groupby('Name')
lagged_frames = [
    df_stats,
    by_player.shift().add_suffix('_prev1'),
    by_player.shift(2).add_suffix('_prev2'),
]

# Lagged copies of the descriptive columns are removed from the result.
lagged_descriptive_cols = [
    'Season_prev1', 'Age_prev1', 'Tm_prev1', 'Lg_prev1', 'Pos_prev1',
    'Season_prev2', 'Age_prev2', 'Tm_prev2', 'Lg_prev2', 'Pos_prev2',
]

# Every missing value in the combined frame is replaced by 0.
df_complete = (
    pd.concat(lagged_frames, axis=1)
    .fillna(0)
    .drop(columns=lagged_descriptive_cols)
)

df_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7877 entries, 0 to 9817
Columns: 141 entries, Name to VORP_prev2
dtypes: float64(133), int64(3), object(5)
memory usage: 8.5+ MB


In [26]:
# df_complete.info(verbose=True)

In [28]:
# Preview the first five rows of the lagged frame.
df_complete.head(5)

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,TOV%_prev2,USG%_prev2,OWS_prev2,DWS_prev2,WS_prev2,WS/48_prev2,OBPM_prev2,DBPM_prev2,BPM_prev2,VORP_prev2
0,Jeff Adrien,2010-11,24,GSW,NBA,PF,23,0,8.5,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Jeff Adrien,2011-12,25,HOU,NBA,PF,8,0,7.9,0.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jeff Adrien,2012-13,26,CHA,NBA,PF,52,5,13.7,1.4,...,12.6,16.1,0.1,0.1,0.2,0.051,-3.3,-2.0,-5.3,-0.2
3,Jeff Adrien,2013-14,27,TOT,NBA,PF,53,12,18.1,2.7,...,8.6,16.8,0.1,0.1,0.1,0.106,-4.0,-0.2,-4.3,0.0
6,Jeff Adrien,2014-15,28,MIN,NBA,PF,17,0,12.6,1.1,...,13.1,15.6,0.5,0.4,1.0,0.064,-2.1,-0.9,-3.0,-0.2


In [29]:
# Write the stats-with-lags frame to disk, without the index column.
lagged_csv_path = '../data/playerStatsWithLags.csv'
df_complete.to_csv(lagged_csv_path, index=False)

In [30]:
# One row per scraped player bio, written to disk without the index column.
df_bios = pd.DataFrame(players_bios)
df_bios.to_csv('../data/playerBio.csv', index=False)