# Mining NBA Contracts
Source: [Spotrac](https://www.spotrac.com/nba/contracts/)

In [147]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pprint
import re 
import pandas as pd
import numpy as np
import time

# fix ssl certificate (needed for MacOS sometimes)
# WARNING: this replaces the factory used to build the default HTTPS context
# with the private, unverified variant, so HTTPS requests made via urlopen
# after this cell runs are NOT certificate-verified. Both names are private
# (underscore-prefixed) members of the ssl module.
# NOTE(review): acceptable for a throwaway scrape of public data; prefer
# installing the platform's root certificates over keeping this patch.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Sample of one page

In [51]:
# Download one contracts page (a single category) and parse it into a bs4 object.
category = 'entry-level'
url = f'https://www.spotrac.com/nba/contracts/sort-value/type-{category}/limit-500/'
# The with-block closes the HTTP response once the body has been read, even if
# reading or decoding raises; previously the response was left open.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

In [94]:
# Build one record per <tr>: the category label followed by the text of each <td>.
players = []
for table_row in soup.find_all('tr'):
    fields = [category]
    for td in table_row.find_all('td'):
        text = td.text.strip()
        if '\n' not in text:
            # ordinary single-line cell
            fields.append(text)
            continue
        # multi-line cell (the 'name' cell): one field per line, and stop
        # reading this cell once the record holds more than 14 fields
        for piece in text.split('\n'):
            fields.append(piece)
            if len(fields) > 14:
                break

    # discard empty fragments before storing the record
    players.append([field for field in fields if len(field) > 0])

# drop first row
players = players[1:]

In [95]:
# Label the scraped fields, then discard the placeholder 'empty' column.
df = (
    pd.DataFrame(players)
    .set_axis(
        ['category', 'categoryRank', 'lastName', 'fullName', 'position', 'empty',
         'term', 'signAge', 'contractLength', 'totalValue', 'aav', 'signBonus'],
        axis=1,
    )
    .drop(columns='empty')
)

In [96]:
# Display the single-category sample frame as the cell's output.
df

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
0,entry-level,1,Cunningham,Cade Cunningham,Point Guard,2021-2024 (FA: 2025),19,4,"$45,599,089","$11,399,772",0-
1,entry-level,2,Edwards,Anthony Edwards,Shooting Guard,2020-2023 (FA: 2024),19,4,"$44,271,137","$11,067,784",0-
2,entry-level,2,Williamson,Zion Williamson,Power Forward,2019-2022 (FA: 2023),18,4,"$44,271,137","$11,067,784",0-
3,entry-level,4,Green,Jalen Green,Shooting Guard,2021-2024 (FA: 2025),19,4,"$40,808,448","$10,202,112",0-
4,entry-level,5,Ayton,Deandre Ayton,Center,2018-2021 (FA: 2022),19,4,"$40,379,230","$10,094,808",0-
...,...,...,...,...,...,...,...,...,...,...,...
139,entry-level,140,Butler,Jared Butler,Point Guard,2021-2022 (FA: 2023),20,2,"$2,488,776","$1,244,388",0-
140,entry-level,140,Dosunmu,Ayo Dosunmu,Shooting Guard,2021-2022 (FA: 2023),21,2,"$2,488,776","$1,244,388",0-
141,entry-level,142,Banton,Dalano Banton,Point Guard,2021-2022 (FA: 2023),21,2,"$2,488,775","$1,244,388",0-
142,entry-level,143,Nwora,Jordan Nwora,Power Forward,2020-2021 (FA: 2022),22,2,"$2,416,291","$1,208,146",0-


### Automate for all pages

In [99]:
def download_html(category):
    """Fetch the Spotrac contracts listing for one contract type and parse it.

    Args:
        category: Contract-type slug interpolated into the listing URL
            (for example 'entry-level').

    Returns:
        The page parsed into a BeautifulSoup object with the 'html.parser'
        backend.

    Raises:
        urllib.error.URLError: If the request fails (HTTPError is a subclass).
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = f'https://www.spotrac.com/nba/contracts/sort-value/type-{category}/limit-500/'
    # The with-block closes the HTTP response as soon as the body is read,
    # even when reading or decoding raises; it was previously left open.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    return BeautifulSoup(html, 'html.parser')

def extract_players(soup, category):
    """Convert the table rows of a parsed contracts page into lists of strings.

    Each <tr> becomes one record that starts with ``category`` and continues
    with the stripped text of every <td>. A cell whose stripped text spans
    several lines (the 'name' cell) contributes one field per line. Empty
    strings are removed from each record.

    Args:
        soup: Parsed page; only ``find_all('tr')``, ``find_all('td')`` on each
            row and ``.text`` on each cell are used.
        category: Label placed at the front of every record.

    Returns:
        A list of records (lists of str) for every row except the first one,
        which is always discarded.
    """
    records = []
    for table_row in soup.find_all('tr'):
        fields = [category]
        for td in table_row.find_all('td'):
            text = td.text.strip()
            if '\n' in text:
                # Multi-line cell: take lines until the record holds 15 fields;
                # at least one line is always taken, even past that size.
                room = max(1, 15 - len(fields))
                fields.extend(text.split('\n')[:room])
            else:
                fields.append(text)

        records.append([field for field in fields if len(field) > 0])

    # drop first row
    return records[1:]

In [112]:
# Contract-type slugs; each one is interpolated into the listing URL by download_html.
categories = [
    'entry-level',
    'rookie-extension',
    'rookie-maximum-extension',
    'designated-rookie-extension',
    'veteran-extension',
    'designated-player-veteran-extension',
    'free-agent',
    'maximum',
    'rest-of-season'
]
players = []

for category in categories:
    try:
        soup = download_html(category)
        players.extend(extract_players(soup, category))
        time.sleep(0.5) # don't get caught scraping (we don't want to get banned)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so interrupting the
        # kernel (KeyboardInterrupt) still stops the loop, and report why the
        # category failed instead of silently discarding the error.
        print(f'failed on {category}: {exc!r}')
    

In [150]:
# Assemble the combined frame: label the scraped fields, discard the placeholder
# 'empty' column, trim stray whitespace, and rewrite the '0-' bonus marker as '$0'.
df = (
    pd.DataFrame(players)
    .set_axis(
        ['category', 'categoryRank', 'lastName', 'fullName', 'position', 'empty',
         'term', 'signAge', 'contractLength', 'totalValue', 'aav', 'signBonus'],
        axis=1,
    )
    .drop(columns='empty')
    .assign(
        position=lambda frame: frame['position'].str.strip(),
        term=lambda frame: frame['term'].str.strip(),
        signBonus=lambda frame: np.where(
            frame['signBonus'] == '0-', '$0', frame['signBonus']
        ),
    )
)

In [151]:
# Display the combined, cleaned frame as the cell's output.
df

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
0,entry-level,1,Cunningham,Cade Cunningham,Point Guard,2021-2024 (FA: 2025),19,4,"$45,599,089","$11,399,772",$0
1,entry-level,2,Edwards,Anthony Edwards,Shooting Guard,2020-2023 (FA: 2024),19,4,"$44,271,137","$11,067,784",$0
2,entry-level,2,Williamson,Zion Williamson,Power Forward,2019-2022 (FA: 2023),18,4,"$44,271,137","$11,067,784",$0
3,entry-level,4,Green,Jalen Green,Shooting Guard,2021-2024 (FA: 2025),19,4,"$40,808,448","$10,202,112",$0
4,entry-level,5,Ayton,Deandre Ayton,Center,2018-2021 (FA: 2022),19,4,"$40,379,230","$10,094,808",$0
...,...,...,...,...,...,...,...,...,...,...,...
384,maximum,4,Porzingis,Kristaps Porzingis,Power Forward,2019-2023 (FA: 2024),23,5,"$158,253,000","$31,650,600",$0
385,maximum,4,Ingram,Brandon Ingram,Small Forward,2020-2024 (FA: 2025),23,5,"$158,253,000","$31,650,600",$0
386,maximum,6,Jokic,Nikola Jokic,Center,2018-2022 (FA: 2023),23,5,"$147,710,050","$29,542,010",$0
387,maximum,7,Butler,Jimmy Butler,Small Forward,2019-2022 (FA: 2023),29,4,"$140,790,600","$35,197,650",$0


In [152]:
# Inspect the position column after the whitespace strip applied above.
df['position']

0         Point Guard
1      Shooting Guard
2       Power Forward
3      Shooting Guard
4              Center
            ...      
384     Power Forward
385     Small Forward
386            Center
387     Small Forward
388       Point Guard
Name: position, Length: 389, dtype: object

In [153]:
# List the distinct signing-bonus values to confirm no '0-' markers remain.
df['signBonus'].unique()

array(['$0', '$300,000'], dtype=object)

In [154]:
# Show the rows whose signing bonus equals '$300,000'.
df[df['signBonus']=='$300,000']

Unnamed: 0,category,categoryRank,lastName,fullName,position,term,signAge,contractLength,totalValue,aav,signBonus
242,free-agent,24,Beasley,Malik Beasley,Shooting Guard,2020-2023 (FA: 2024),24,4,"$60,000,000","$15,000,000","$300,000"


In [155]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# The path is relative to the notebook's working directory.
df.to_csv('../data/contracts.csv', index=False)