# Deduplicating Player IDs

In [7]:
import json
import pandas as pd
import urllib3
import bs4

In [9]:
def player_totals_page(season):
    """Return the Basketball Reference URL of the NBA totals page for ``season``.

    ``season`` is substituted into the ``NBA_<season>_totals.html`` part of
    the address with ``str.format``, so any value with a string form works.
    """
    url_template = "https://www.basketball-reference.com/leagues/NBA_{0}_totals.html"
    return url_template.format(season)

In [10]:
def extract_column_names(table):
    """Return the column labels of ``table`` followed by a trailing ``"id"``.

    The labels are the ``aria-label`` attributes of the ``<th>`` cells in the
    table's first ``<thead>``. The label of the first header cell is read but
    left out of the result (presumably the rank column, which ``parse_row``
    does not emit either -- confirm against the page markup).

    Raises:
        IndexError: if the table has no ``<thead>``.
        KeyError: if any header cell lacks an ``aria-label`` attribute.
    """
    header_cells = table.find_all("thead")[0].find_all("th")

    labels = []
    for cell in header_cells:
        labels.append(cell["aria-label"])

    # Drop the leading header and add the name of the extra id column that
    # is appended to every parsed row.
    return labels[1:] + ["id"]


In [11]:
def extract_rows(table):
    """Parse every ``<tr>`` in the table's first ``<tbody>`` into a list of values.

    Each row is handed to ``parse_row``; rows for which it returns an empty
    list are left out of the result.

    Returns:
        A list of the non-empty row lists, in document order.
    """
    body = table.find_all("tbody")[0]
    candidates = (parse_row(tr) for tr in body.find_all("tr"))
    return [cells for cells in candidates if len(cells) > 0]

In [18]:
def parse_row(row):
    """Convert one table row into a list of cell values plus a player id.

    Args:
        row: A ``<tr>`` element (any object whose ``find_all("td")`` returns
            the row's data cells).

    Returns:
        ``[]`` when the row has no ``<td>`` cells. Otherwise the ``.string``
        of every ``<td>`` in order, followed by the player id taken from the
        first link in the first cell, or ``None`` when that cell contains no
        link.
    """
    cells = row.find_all("td")
    if not cells:
        return []

    # The id is the last path segment of the first link's href with the
    # ".html" extension removed (hrefs are expected to look like
    # "/players/a/achiupr01.html"). A first cell without a link keeps its
    # row, with no id, instead of raising IndexError.
    links = cells[0].find_all("a")
    player_id = None
    if links:
        player_id = links[0]["href"].rsplit("/", 1)[-1].replace(".html", "")

    # NOTE(review): with BeautifulSoup, ``.string`` is None for a cell that
    # has more than one child node -- confirm no cell on the page does.
    row_data = [td.string for td in cells]
    row_data.append(player_id)
    return row_data

In [13]:
# Connection pool used for the HTTP request to the totals page.
http = urllib3.PoolManager()

# Season identifier; it is substituted into both the page URL and the
# name of the CSV file written at the end.
season = '2023'

# Filled in by the scraping step: header labels and parsed table rows.
columns = []
rows = []


In [14]:
# Download the season's totals page and pull the column names and data rows
# out of the first <table> found on it.
# NOTE(review): the response status is never checked; when no table is found
# `columns` and `rows` simply keep their previous values.
# NOTE: `rows` is extended rather than replaced, so running this cell again
# without resetting `rows` appends the same rows a second time.
r = http.request('GET', player_totals_page(season))         # Request the page
soup = bs4.BeautifulSoup(r.data, 'html')                    # Parse page with BeautifulSoup
f = soup.find_all("table")                                  # Find the table(s)
if len(f) > 0:                                              # Check to ensure the table is there
    columns = extract_column_names(f[0])                    # Extract column names from the table header
    rows = rows + extract_rows(f[0])                        # Extract data from table rows

# One DataFrame row per parsed table row; column labels are assigned later.
frame = pd.DataFrame(rows)

In [21]:
# Label the columns with the scraped header names (plus the trailing "id"),
# then rename the "Tm" column to "Team".
frame.columns = columns
frame = frame.rename(columns = {"Tm":"Team"})

               Player Pos Age Team Games Games Started Minutes Played  \
0    Precious Achiuwa   C  23  TOR    55            12           1140   
1        Steven Adams   C  29  MEM    42            42           1133   
2         Bam Adebayo   C  25  MIA    75            75           2598   
3        Ochai Agbaji  SG  22  UTA    59            22           1209   
4        Santi Aldama  PF  22  MEM    77            20           1682   
..                ...  ..  ..  ...   ...           ...            ...   
674    Thaddeus Young  PF  34  TOR    54             9            795   
675        Trae Young  PG  24  ATL    73            73           2541   
676    Omer Yurtseven   C  24  MIA     9             0             83   
677       Cody Zeller   C  30  MIA    15             2            217   
678       Ivica Zubac   C  25  LAC    76            76           2170   

    Field Goals   FGA Field Goal Percentage  ... Offensive Rebounds  \
0           196   404                  .485  ...    

In [22]:
# Write the frame to a per-season CSV file, leaving out the DataFrame index.
frame.to_csv("basketball_reference_totals_{0}.csv".format(season), index=False)