# NBA Data Collection

In [1]:
import re
import os
import json
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup
from nba_api.stats.endpoints import leaguedashteamstats, leaguedashplayerstats, leaguestandingsv3, shotchartdetail

### Retrieving NBA.com Data

Import player stats, team stats, and standings for this season from NBA.com.

In [2]:
# Query each NBA.com endpoint (through nba_api) and keep the first DataFrame
# that the endpoint returns.
player_endpoint = leaguedashplayerstats.LeagueDashPlayerStats()
player_stats = player_endpoint.get_data_frames()[0]

team_endpoint = leaguedashteamstats.LeagueDashTeamStats()
team_stats = team_endpoint.get_data_frames()[0]

standings_endpoint = leaguestandingsv3.LeagueStandingsV3(season_type='Regular Season')
standings = standings_endpoint.get_data_frames()[0]

In [3]:
# Derive per-game points, assists and rebounds for teams and players by
# dividing each total column by games played (GP).
per_game_columns = {'PPG': 'PTS', 'APG': 'AST', 'RPG': 'REB'}

for frame in (team_stats, player_stats):
    for per_game, total in per_game_columns.items():
        frame[per_game] = frame[total] / frame['GP']

In [4]:
# Add TEAM_NAME in standings as "<TeamCity> <TeamName>".
# Vectorized string concatenation replaces the row-wise apply/lambda: it yields
# the same strings without calling a Python function once per row.
standings['TEAM_NAME'] = standings['TeamCity'] + ' ' + standings['TeamName']

### Retrieving NBA Advanced stats from Basketball Reference

In [5]:
# Download the NBA_2023 league page from Basketball Reference.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an immediate, readable failure instead of
# letting an error page be parsed as if it were the real document.
response = requests.get("https://www.basketball-reference.com/leagues/NBA_2023.html", timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# The advanced team stats are read from the table whose id is "advanced-team".
table = soup.find('table', {'id':'advanced-team'})
if table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # `table.thead` in a later cell.
    raise ValueError("Table 'advanced-team' was not found in the downloaded page")

In [6]:
# The column labels are read from the second <tr> of the table head.
label_row = table.thead.find_all('tr')[1]
all_labels = [cell.get_text() for cell in label_row.find_all('th')]

# Keep positions 1 through 13 (the very first label is skipped).
headers = all_labels[1:14]
headers

['Team',
 'Age',
 'W',
 'L',
 'PW',
 'PL',
 'MOV',
 'SOS',
 'SRS',
 'ORtg',
 'DRtg',
 'NRtg',
 'Pace']

In [7]:
# Collect the stripped text of every data cell (<td>) for each body row.
# Blank cells are kept so each value stays at its own column position;
# dropping them would shift the remaining values left, under the wrong header.
# Rows that contain no <td> cells at all are skipped instead of being stored
# as empty lists.
dat = []
for row in table.tbody.find_all('tr'):
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:
        dat.append(cells)

In [8]:
# Trim every scraped row to its first 13 values and assemble the
# advanced-stats DataFrame, labelled with the scraped `headers`.
data = [row[0:13] for row in dat]

advanced_stats = pd.DataFrame(data, columns=headers)

In [9]:
# The scraped values are strings, so convert the rated columns to numbers first.
ratings = {col: pd.to_numeric(advanced_stats[col]) for col in ('ORtg', 'DRtg', 'Pace')}

# Ties share the lowest rank (method='min').
# ORtg and Pace: the highest value gets rank 1. DRtg: the lowest value gets rank 1.
advanced_stats['Orank'] = ratings['ORtg'].rank(method='min', ascending=False).astype(int)
advanced_stats['Drank'] = ratings['DRtg'].rank(method='min').astype(int)

# Nrank is simply the 1-based row position.
# NOTE(review): this assumes the scraped rows are already ordered by NRtg — confirm.
advanced_stats['Nrank'] = advanced_stats.index.astype(int) + 1

advanced_stats['PaceRank'] = ratings['Pace'].rank(method='min', ascending=False).astype(int)
advanced_stats

Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,Pace,Orank,Drank,Nrank,PaceRank
0,Boston Celtics*,27.4,57,25,57,25,6.52,-0.15,6.38,118.0,111.5,6.5,98.5,2,3,1,19
1,Cleveland Cavaliers*,25.4,51,31,55,27,5.38,-0.15,5.23,116.1,110.6,5.5,95.7,8,1,2,30
2,Philadelphia 76ers*,28.2,54,28,52,30,4.32,0.06,4.37,117.7,113.3,4.4,96.9,4,8,3,27
3,Memphis Grizzlies*,24.4,51,31,51,31,3.94,-0.34,3.6,115.1,111.2,3.9,101.1,14,2,4,4
4,Milwaukee Bucks*,29.8,58,24,50,32,3.63,-0.02,3.61,115.4,111.9,3.5,100.5,12,4,5,10
5,Denver Nuggets*,26.6,53,29,49,33,3.33,-0.29,3.04,117.6,114.2,3.4,98.1,5,14,6,23
6,New York Knicks*,24.5,47,35,48,34,2.93,0.06,2.99,117.8,114.8,3.0,97.1,3,19,7,25
7,Sacramento Kings*,25.4,48,34,47,35,2.65,-0.35,2.3,119.4,116.8,2.6,100.3,1,25,8,12
8,Phoenix Suns*,28.1,45,37,46,36,2.07,0.01,2.08,115.1,113.0,2.1,98.2,14,7,9,22
9,New Orleans Pelicans*,25.9,42,40,46,36,1.89,-0.26,1.63,114.4,112.5,1.9,99.1,21,6,10,14


In [10]:
# Fix team names.
# 1. Strip the "*" marker from every name.
# 2. Rename the Clippers to "LA Clippers" (presumably to match the TEAM_NAME
#    spelling used by the NBA.com tables — confirm).
# Stripping first means the rename no longer depends on the Clippers row
# carrying a trailing "*", and only the Team column is touched instead of
# running an in-place replace over the whole frame.
team_names = (
    advanced_stats['Team']
    .str.replace("*", "", regex=False)
    .replace("Los Angeles Clippers", "LA Clippers")
)
advanced_stats['Team'] = team_names

In [11]:
# Rank teams by field-goal and three-point percentage: the highest percentage
# gets rank 1 and ties share the lowest rank.
rank_targets = {'FG_PCT_RANK': 'FG_PCT', 'FG3_PCT_RANK': 'FG3_PCT'}

for rank_col, pct_col in rank_targets.items():
    pct_rank = team_stats[pct_col].rank(method='min', ascending=False)
    team_stats[rank_col] = pct_rank.astype(int)

### Retrieving NBA Team Salaries from Basketball Reference

In [12]:
# Download the contracts page from Basketball Reference.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an immediate, readable failure instead of
# letting an error page be parsed as if it were the real document.
response = requests.get("https://www.basketball-reference.com/contracts/", timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# Team payroll totals are read from the table whose id is "team_summary".
table = soup.find('table', {'id':'team_summary'})
if table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # `table.thead` in a later cell.
    raise ValueError("Table 'team_summary' was not found in the downloaded page")

In [13]:
# Column labels come from the second <tr> of the table head; keep positions 1-7.
label_row = table.thead.find_all('tr')[1]
headers = [cell.get_text() for cell in label_row.find_all('th')][1:8]

# Build one record per body row.
# Blank cells become None *at their own position* rather than being dropped:
# dropping them would shift any later value left, under the wrong column.
# Each record is capped at the number of headers, and rows with no <td> cells
# are skipped so they do not turn into all-empty records.
data = []
for row in table.tbody.find_all('tr'):
    cells = [cell.text.strip() or None for cell in row.find_all('td')][:len(headers)]
    if cells:
        data.append(cells)

team_salaries = pd.DataFrame(data, columns=headers)
team_salaries

Unnamed: 0,Team,2022-23,2023-24,2024-25,2025-26,2026-27,2027-28
0,Los Angeles Clippers,"$199,618,760","$201,872,990","$148,079,728","$20,482,758",,
1,Golden State Warriors,"$196,914,190","$211,685,897","$148,977,903","$125,348,403","$65,464,286",
2,Milwaukee Bucks,"$189,554,017","$156,679,413","$112,176,005","$79,586,167",,
3,Boston Celtics,"$188,309,002","$163,100,916","$120,909,897","$71,995,974",,
4,Dallas Mavericks,"$178,751,606","$108,671,031","$94,252,163","$56,999,660","$48,967,380",
5,Phoenix Suns,"$177,017,682","$165,602,967","$175,429,147","$155,194,023","$57,124,200","$61,063,800"
6,Los Angeles Lakers,"$176,771,244","$127,212,036","$100,859,025",,,
7,Denver Nuggets,"$166,664,217","$168,687,600","$168,659,596","$124,856,778","$98,962,150","$61,908,000"
8,Brooklyn Nets,"$165,325,654","$145,554,223","$86,592,682","$40,278,480",,
9,Oklahoma City Thunder,"$161,215,815","$99,032,051","$101,135,018","$92,388,970","$65,691,372",


Write each DataFrame to a .csv file

In [14]:
# Write every collected table to its own CSV file in the working directory.
csv_targets = [
    ("Team-stats.csv", team_stats),
    ("Player_stats.csv", player_stats),
    ('Standings.csv', standings),
    ('advanced_stats.csv', advanced_stats),
    ('team_salaries.csv', team_salaries),
]

for csv_name, frame in csv_targets:
    frame.to_csv(csv_name)

In [15]:
# Make sure the folder that will hold the team logos exists.
# exist_ok=True makes the call a no-op when the folder is already there, so no
# separate existence check (which could race with another process) is needed.
newdir = "logos/"
os.makedirs(newdir, exist_ok=True)

In [16]:
# Build one slug per team name: spaces become hyphens, then the whole name is
# lower-cased.
good_names = []
for team in team_stats['TEAM_NAME']:
    hyphenated = "-".join(team.split(" "))
    good_names.append(hyphenated.lower())

In [17]:
# Download each team's logo from loodibee.com into logos/<team name>.png.
# The loop stays best-effort: a team whose page or image cannot be fetched is
# skipped. The reason is now printed instead of being silently swallowed, and
# catching Exception (unlike a bare `except:`) still lets an interrupt stop
# the loop.
for team in good_names:
    url = f"https://loodibee.com/nba/nba-{team}-logo/"
    try:
        # The timeout prevents a stalled connection from hanging the cell;
        # raise_for_status() stops an error page from being parsed.
        logo = requests.get(url, timeout=30)
        logo.raise_for_status()
        soup = BeautifulSoup(logo.text, "html.parser")

        # The logo is taken from the first <img> inside the element with id="main".
        main = soup.find(id = "main")
        img = main.find('img')['src']

        # Check the image response too, so an error body is never saved as a .png.
        image_response = requests.get(img, timeout=30)
        image_response.raise_for_status()
        img_data = image_response.content

        # File names use spaces instead of hyphens (the name stays lower-case).
        team_name = team.replace("-", " ")
        with open(f'logos/{team_name}.png', "wb") as f:
            f.write(img_data)

    except Exception as exc:
        print(f"Skipping logo for {team!r}: {exc!r}")
        continue

Manually add the Denver Nuggets logo (its page URL ends in `-2018`, which differs from the URL pattern used in the loop above)

In [18]:
# Fetch the Denver Nuggets logo page, whose URL ends in "-2018".
# The timeout prevents a stalled connection from hanging the cell, and
# raise_for_status() stops an HTTP error page from being parsed.
logo = requests.get("https://loodibee.com/nba/nba-denver-nuggets-logo-2018/", timeout=30)
logo.raise_for_status()
soup = BeautifulSoup(logo.text, "html.parser")

# The logo is taken from the first <img> inside the element with id="main".
main = soup.find(id = "main")
img = main.find('img')['src']

# Check the image response too, so an error body is never saved as a .png.
image_response = requests.get(img, timeout=30)
image_response.raise_for_status()
img_data = image_response.content

with open('logos/Denver Nuggets.png', "wb") as f:
    f.write(img_data)

Rename `la clippers.png` to `LA Clippers.png`

In [19]:
# Give the Clippers logo the capitalised file name "LA Clippers.png".
lowercase_logo = "logos/la clippers.png"
proper_logo = "logos/LA Clippers.png"
os.rename(lowercase_logo, proper_logo)