# NBA Yearly Player Stats Web Scraping 

In [None]:
from IPython.display import display, Math, Latex
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date
import random
import os
import requests
from bs4 import BeautifulSoup

#### Web Scrape NBA players data

Using Basketball Reference, we will scrape NBA player stats for each season from 1991 to 2023.

In [None]:
# Directory that holds the downloaded per-season player pages
folder_name = "players"

# Report whether the directory is already there; create it only when missing
if os.path.exists(folder_name):
    print(f"Folder '{folder_name}' already exists.")
else:
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created successfully.")

In [None]:
# Seasons 1991 through 2020 inclusive (the range end is exclusive, hence the + 1)
years = range(1991, 2020 + 1)
# URL template for the per-game player stats page; "{}" is filled with the season year
base_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [None]:
import time

#Download the per-game stats page for every season in `years` into the players folder
for year in years:
    #gets the url for the current year
    url = base_url.format(year)
    #Send an HTTP GET request; the timeout keeps a stalled connection from hanging the cell
    data = requests.get(url, timeout=30)
    #Fail loudly on a 4xx/5xx response instead of saving an error page as if it were stats
    data.raise_for_status()
    #Write as UTF-8 explicitly: the platform default encoding may not be able to
    #encode every character in the page
    with open("players/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)
    #NOTE(review): pause between pages because the site throttles rapid automated
    #requests (see its bot-traffic policy); tune the delay if that policy changes
    time.sleep(3)

In [None]:
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd

# One DataFrame per season; concatenated into old_players_df after the loop
dfs = []
# Loop through each season in `years`
for year in years:
    # Read the saved page, decoding as UTF-8 explicitly instead of relying on
    # the platform default encoding
    with open("players/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    #Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page, 'html.parser')

    # Locate both elements up front so a page that lacks either one is skipped
    # with a message instead of raising part-way through
    thead_element = soup.find('tr', class_="thead")
    player_table = soup.find(id="per_game_stats")
    if thead_element is None or player_table is None:
        print(f"Element not found for year {year}, skipping...")
        continue

    # NOTE(review): `find` matches only the first row with class "thead"; any
    # further such rows stay in the table and come through as data rows.
    # Confirm whether downstream cleaning relies on that before removing them all.
    thead_element.decompose()

    #gets a df of each year and appends it in the dfs list
    # read_html is given a file-like object: passing a literal HTML string is deprecated
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df["Year"] = year
    dfs.append(player_df)

# Concatenate the dataframes from different years
old_players_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Write the old-seasons player table to disk as CSV
old_players_df.to_csv(path_or_buf="old_players.csv")

In [None]:
# Show old_players_df (the table built from the pages in players/) as the cell output
old_players_df

In [None]:
# Directory that holds the downloaded player pages for the newer seasons
import os

folder_name = "players_new"

# Report whether the directory is already there; create it only when missing
if os.path.exists(folder_name):
    print(f"Folder '{folder_name}' already exists.")
else:
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created successfully.")

In [None]:
# Seasons 2021 through 2023 inclusive (the range end is exclusive, hence the + 1)
years = range(2021, 2023 + 1)
# URL template for the per-game player stats page; "{}" is filled with the season year
base_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [None]:
import time

#Download the per-game stats page for every season in `years` into the players_new folder
for year in years:
    #gets the url for the current year
    url = base_url.format(year)
    #Send an HTTP GET request; the timeout keeps a stalled connection from hanging the cell
    data = requests.get(url, timeout=30)
    #Fail loudly on a 4xx/5xx response instead of saving an error page as if it were stats
    data.raise_for_status()
    #Write as UTF-8 explicitly: the platform default encoding may not be able to
    #encode every character in the page
    with open("players_new/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)
    #NOTE(review): pause between pages because the site throttles rapid automated
    #requests (see its bot-traffic policy); tune the delay if that policy changes
    time.sleep(3)

In [None]:
from io import StringIO

from bs4 import BeautifulSoup
import pandas as pd

# One DataFrame per season; concatenated into new_players_df after the loop
dfs = []
# Loop through each season in `years`
for year in years:
    # Read the saved page, decoding as UTF-8 explicitly instead of relying on
    # the platform default encoding
    with open("players_new/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    #Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page, 'html.parser')

    # Locate both elements up front so a page that lacks either one is skipped
    # with a message instead of raising part-way through
    thead_element = soup.find('tr', class_="thead")
    player_table = soup.find(id="per_game_stats")
    if thead_element is None or player_table is None:
        print(f"Element not found for year {year}, skipping...")
        continue

    # NOTE(review): `find` matches only the first row with class "thead"; any
    # further such rows stay in the table and come through as data rows.
    # Confirm whether downstream cleaning relies on that before removing them all.
    thead_element.decompose()

    #gets a df of each year and appends it in the dfs list
    # read_html is given a file-like object: passing a literal HTML string is deprecated
    player_df = pd.read_html(StringIO(str(player_table)))[0]
    player_df["Year"] = year
    dfs.append(player_df)

# Concatenate the dataframes from different years
new_players_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show new_players_df (the table built from the pages in players_new/) as the cell output
new_players_df

In [None]:
# Write the newer-seasons player table to disk as CSV
new_players_df.to_csv(path_or_buf="new_players_yearly_stats.csv")

In [None]:
# Stack the old and new player tables (old first) under a fresh 0..n-1 index,
# then write the combined table to disk
full_players_df = pd.concat(
    objs=[old_players_df, new_players_df],
    ignore_index=True,
)
full_players_df.to_csv(path_or_buf="players.csv")

#### Web Scrape NBA Teams data

Using Basketball Reference, we will scrape NBA team standings for each season from 1991 to 2023.

In [None]:
import os

# Directory that holds the downloaded standings pages for the older seasons
folder_name = "teams_old"

# Report whether the directory is already there; create it only when missing
if os.path.exists(folder_name):
    print(f"Folder '{folder_name}' already exists.")
else:
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created successfully.")

In [None]:
# Seasons 1991 through 2020 inclusive (the range end is exclusive, hence the + 1)
years = range(1991, 2020 + 1)
# URL template for the season standings page; "{}" is filled with the season year
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [None]:
import time

#Download the standings page for every season in `years` into the teams_old folder
for year in years:
    #gets the url for the current year
    url = team_stats_url.format(year)
    #Send an HTTP GET request; the timeout keeps a stalled connection from hanging the cell
    data = requests.get(url, timeout=30)
    #Fail loudly on a 4xx/5xx response instead of saving an error page as if it were standings
    data.raise_for_status()
    #Write as UTF-8 explicitly: the platform default encoding may not be able to
    #encode every character in the page
    with open("teams_old/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)
    #NOTE(review): pause between pages because the site throttles rapid automated
    #requests (see its bot-traffic policy); tune the delay if that policy changes
    time.sleep(3)

In [None]:
from io import StringIO

# One DataFrame per conference per season; concatenated into teams_old after the loop
dfs = []
#loop through each year
for year in years:
    # Read the saved page, decoding as UTF-8 explicitly instead of relying on
    # the platform default encoding
    with open("teams_old/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page, 'html.parser')

    # Guard the lookup: calling .decompose() on a missing row would raise
    # AttributeError, so skip the season with a message instead
    thead_element = soup.find('tr', class_="thead")
    if thead_element is None:
        print(f"Element not found for year {year}, skipping...")
        continue
    # NOTE(review): `find` matches only the first row with class "thead"; any
    # further such rows stay in the tables and come through as data rows.
    # Confirm whether downstream cleaning relies on that before removing them all.
    thead_element.decompose()

    #append all the eastern conference data to dfs list
    e_table = soup.find_all(id="divs_standings_E")[0]
    # read_html is given a file-like object: passing a literal HTML string is deprecated
    e_df = pd.read_html(StringIO(str(e_table)))[0]
    e_df["Year"] = year
    # Copy the conference-named column to a common "Team" column, then drop the
    # original so both conferences share one schema
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)

    #append all the western conference data to dfs list
    w_table = soup.find_all(id="divs_standings_W")[0]
    w_df = pd.read_html(StringIO(str(w_table)))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

#concatenate all eastern and western conference dfs
teams_old = pd.concat(dfs)

In [None]:
# Write the old-seasons standings table to disk as CSV
teams_old.to_csv(path_or_buf="old_teams.csv")

In [None]:
# Seasons 2021 through 2023 inclusive (the range end is exclusive, hence the + 1)
years = range(2021, 2023 + 1)
# URL template for the season standings page; "{}" is filled with the season year
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

In [None]:
import os

# Directory that holds the downloaded standings pages for the newer seasons
folder_name = "teams_new"

# Report whether the directory is already there; create it only when missing
if os.path.exists(folder_name):
    print(f"Folder '{folder_name}' already exists.")
else:
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created successfully.")

In [None]:
import time

#Download the standings page for every season in `years` into the teams_new folder
for year in years:
    #gets the url for the current year
    url = team_stats_url.format(year)
    #Send an HTTP GET request; the timeout keeps a stalled connection from hanging the cell
    data = requests.get(url, timeout=30)
    #Fail loudly on a 4xx/5xx response instead of saving an error page as if it were standings
    data.raise_for_status()
    #Write as UTF-8 explicitly: the platform default encoding may not be able to
    #encode every character in the page
    with open("teams_new/{}.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)
    #NOTE(review): pause between pages because the site throttles rapid automated
    #requests (see its bot-traffic policy); tune the delay if that policy changes
    time.sleep(3)

In [None]:
from io import StringIO

# One DataFrame per conference per season; concatenated into teams_new after the loop
dfs = []
#loop through each year
for year in years:
    # Read from teams_new/, where the pages for these seasons were saved
    # (this previously pointed at teams_old/, which has no files for these years).
    # Decode as UTF-8 explicitly instead of relying on the platform default encoding.
    with open("teams_new/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page, 'html.parser')

    # Guard the lookup: calling .decompose() on a missing row would raise
    # AttributeError, so skip the season with a message instead
    thead_element = soup.find('tr', class_="thead")
    if thead_element is None:
        print(f"Element not found for year {year}, skipping...")
        continue
    # NOTE(review): `find` matches only the first row with class "thead"; any
    # further such rows stay in the tables and come through as data rows.
    # Confirm whether downstream cleaning relies on that before removing them all.
    thead_element.decompose()

    #append all the eastern conference data to dfs list
    e_table = soup.find_all(id="divs_standings_E")[0]
    # read_html is given a file-like object: passing a literal HTML string is deprecated
    e_df = pd.read_html(StringIO(str(e_table)))[0]
    e_df["Year"] = year
    # Copy the conference-named column to a common "Team" column, then drop the
    # original so both conferences share one schema
    e_df["Team"] = e_df["Eastern Conference"]
    del e_df["Eastern Conference"]
    dfs.append(e_df)

    #append all the western conference data to dfs list
    w_table = soup.find_all(id="divs_standings_W")[0]
    w_df = pd.read_html(StringIO(str(w_table)))[0]
    w_df["Year"] = year
    w_df["Team"] = w_df["Western Conference"]
    del w_df["Western Conference"]
    dfs.append(w_df)

#concatenate all eastern and western conference dfs
teams_new = pd.concat(dfs)

In [None]:
# Write the newer-seasons standings table to disk as CSV
teams_new.to_csv(path_or_buf="new_teams.csv")

In [None]:
# Stack the old and new standings tables (old first) under a fresh 0..n-1 index,
# then write the combined table to disk
full_teams_df = pd.concat(
    objs=[teams_old, teams_new],
    ignore_index=True,
)
full_teams_df.to_csv(path_or_buf="teams_record.csv")

In [None]:
# Show full_teams_df (teams_old followed by teams_new) as the cell output
full_teams_df