Importing necessary libraries        

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from io import StringIO       

Year to be analyzed (the year in which the season ends and the playoffs are played, e.g. 2022 = 2021-22 season)


In [2]:
year = 2000  # Season-ending year (2000 -> 1999-00 season); interpolated into the scraped URLs and the output file name.

Creating a list containing the names of all players who played in the selected season

In [None]:
# Download the season totals page and collect one display name per player.
URL = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html#totals_stats::pts'

site = requests.get(URL)
# Stop right here on an HTTP error (e.g. the site throttling the client) instead of
# failing later with an opaque "'NoneType' object has no attribute ..." error.
site.raise_for_status()
soup = BeautifulSoup(site.text, 'html')
table = soup.find('table')
if table is None:
    raise RuntimeError(f'No stats table found at {URL}')
tbody = table.find('tbody')
playername = tbody.find_all('td', {'data-stat':'player'})

# The same player can appear in several rows, so keep only the first occurrence.
# De-duplicate per player rather than per display name: the key is the cell's
# `data-append-csv` attribute when present (presumably the site's unique player
# id -- confirm against the page markup), falling back to the name itself.
# Two different players who share a name are therefore both kept, which keeps this
# list index-aligned with the per-player id list built from the same table rows.
names_by_player = {}
for cell in playername:
    player_key = cell.get('data-append-csv', cell.text)
    names_by_player.setdefault(player_key, cell.text)
realplayernames = list(names_by_player.values())

Now we need to get the dynamic parts of the URLs, as we are going to access every player's page to get the data we need

In [4]:
# Every row of the totals table body; rows without any <td> cell are skipped below.
tr = tbody.find_all('tr')

# For each remaining row, take the first <td> and read the value of its SECOND
# attribute (by position). Presumably this is the player id used in the player
# page URLs -- TODO confirm against the page markup.
first_cells = (row.find('td') for row in tr)
urlname = [
    list(cell.attrs.values())[1]
    for cell in first_cells
    if cell is not None
]

# Drop repeated values while preserving first-seen order.
urlname = list(dict.fromkeys(urlname))

In [5]:
# Directory letter for each player's URL, index-aligned with `urlname`.
# It is taken from the player id rather than from the display name: the display
# name starts with the capitalised first-name initial, whereas the player page
# URL is expected to use the first character of the id (e.g.
# .../players/a/abdulta01/...) -- NOTE(review): confirm against the site.
first_letter = [player_id[:1] for player_id in urlname]

Now we loop over all the players' URLs and grab the points scored in each game of the season. Then we store them in a table whose columns are the player's name and the games played in the season (1-82)

In [6]:
from time import sleep

# Column layout of the result: the player's name followed by game numbers 1..82.
headers = ['Player'] + list(range(1, 83))

# Row positions inside the game-log <tbody> that are skipped when they carry no
# points cell (presumably the header rows the site repeats every 20 games --
# TODO confirm against the page markup).
ignore_rows = [20,41,62,83]


def _game_points(rows, skip_rows):
    """Return the points found in each game-log row, in row order.

    A row with a `pts` cell contributes its integer value. A row without one
    contributes 0 unless its position is listed in `skip_rows`, in which case
    it is left out entirely.
    """
    points = []
    for position, row in enumerate(rows):
        cell = row.find('td', {'data-stat': 'pts'})
        if cell is not None:
            points.append(int(cell.text))
        elif position not in skip_rows:
            points.append(0)
    return points


def _fit_to_length(values, length, fill=0):
    """Return `values` padded with `fill` or truncated to exactly `length` items."""
    return (values + [fill] * (length - len(values)))[:length]


# Collect one row per player and build the DataFrame once at the end, instead of
# growing it with pd.concat on every iteration (which re-copies the whole frame).
season_rows = []
for name, letter, player_id in zip(realplayernames, first_letter, urlname):
    URL = f'https://www.basketball-reference.com/players/{letter}/{player_id}/gamelog/{year}'
    sleep(5)   # We must wait 5 seconds before another request, so we won't overwhelm the web server
    site = requests.get(URL)
    # Fail with the HTTP status and URL rather than with an AttributeError further down.
    site.raise_for_status()
    soup = BeautifulSoup(site.text,'html')
    table = soup.find('table',{'id':'pgl_basic'})
    if table is None:
        raise RuntimeError(f'No game log table (id="pgl_basic") found at {URL}')
    tbody = table.find('tbody')
    tr = tbody.find_all('tr')

    points = _game_points(tr, ignore_rows)
    # Some players have more than 82 rows (mid-season transfers) and others fewer
    # (dropped from the NBA), so every row is padded/cut to exactly 82 games.
    season_rows.append([name] + _fit_to_length(points, 82))

dff = pd.DataFrame(season_rows, columns=headers)


The following code section adds each column to the previous one, because we are interested in the cumulative points as the season progresses. This is well suited to analyzing which player was the top scorer at a given point in the season, along with many other insights, such as how a given player was performing before getting injured or suspended.

In [7]:
# Turn per-game points into running totals, left to right: each game column
# becomes the row-wise sum of itself and the (already accumulated) column before it.
# NOTE: this overwrites `dff` in place, so running the cell twice accumulates twice.
for previous_game, current_game in zip(range(1, 82), range(2, 83)):
    running_total = dff[[previous_game, current_game]].sum(axis=1)
    dff[current_game] = running_total

Now we sort our table by the top scorers at the end of the regular season

In [8]:
# Order the rows by the value in column 82 (the last game column), largest first.
dff = dff.sort_values(by=[82], ascending=False)

And finally, we save our data into a CSV file.

In [11]:
# Write the table to disk without the index column; the file name spells the
# season as "<start year>-<last two digits of `year`>".
dff.to_csv(
    path_or_buf=f'NBA {year - 1}-{str(year)[2:]} Regular Season Scoring Leaders.csv',
    index=False,
)