# Notebook content

1. Import Necessary Libraries
2. Webscraping
3. Scraping with Selenium


### 1. Imports

In [40]:
import os
import time
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

### 2. Webscraping

### 2.1 Download HTML files 

In [9]:
#Create a list of years we want to analyze
years = list(range(1980,2025))

#Base url to website
url_start = 'https://www.basketball-reference.com/leagues/NBA_{}.html'

#Make sure the output folder exists; open() does not create missing directories
os.makedirs('stats', exist_ok=True)

#Download each page for each year from website
for year in years:
    url = url_start.format(year) # Replace placeholder in url_start with year
    #A timeout keeps one stalled connection from hanging the whole loop
    data = requests.get(url, timeout=30)
    #Stop on HTTP errors (e.g. rate limiting) instead of writing the error page
    #to disk, where it would only fail later during table extraction
    data.raise_for_status()

    #Save HTML to a file
    with open('stats/{}.html'.format(year), 'w', encoding='utf-8') as f:
        f.write(data.text)

    #Pause in between requests to avoid overwhelming the server
    time.sleep(15)

### 2.2  Extracting tables from Downloaded HTML Files

In [3]:
#Try the workflow on a single season before looping over all of them
with open('stats/1980.html', mode='r', encoding='utf-8') as html_file:
    page = html_file.read()
    

In [4]:
#Build a BeautifulSoup tree from the saved 1980 page using Python's built-in parser
soup = BeautifulSoup(page, features='html.parser')

In [6]:
#Locate the element whose id is 'div_per_game-team'; it wraps the per-game team table
team_stats_per_game = soup.find(attrs={'id': 'div_per_game-team'})

In [8]:
#Parse the first table inside the wrapper into a DataFrame.
#The markup is wrapped in StringIO because passing a literal HTML string to
#pd.read_html is deprecated (it triggers a FutureWarning).
per_game_1980 = pd.read_html(StringIO(str(team_stats_per_game)))[0]

  per_game_1980 = pd.read_html(str(team_stats_per_game))[0]


In [9]:
#Inspect the parsed 1980 per-game table to confirm the extraction worked
per_game_1980

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1.0,San Antonio Spurs*,82,240.9,47.0,94.4,0.498,0.6,2.5,0.252,...,0.801,14.1,30.7,44.7,28.4,9.4,4.1,19.4,25.6,119.4
1,2.0,Los Angeles Lakers*,82,242.4,47.5,89.9,0.529,0.2,1.2,0.2,...,0.775,13.2,32.4,45.6,29.4,9.4,6.7,20.0,21.8,115.1
2,3.0,Cleveland Cavaliers,82,243.0,46.5,98.1,0.474,0.4,2.3,0.193,...,0.772,15.9,29.0,45.0,25.7,9.3,4.2,16.7,23.6,114.1
3,4.0,New York Knicks,82,241.2,46.4,93.6,0.496,0.5,2.3,0.22,...,0.747,15.1,28.1,43.2,27.6,10.7,5.6,19.7,26.4,114.0
4,5.0,Boston Celtics*,82,242.4,44.1,90.1,0.49,2.0,5.1,0.384,...,0.779,15.0,30.0,44.9,26.8,9.9,3.8,18.8,24.1,113.5
5,6.0,Indiana Pacers,82,242.1,44.4,93.8,0.473,1.1,3.8,0.28,...,0.751,17.0,28.4,45.4,26.2,11.0,6.5,18.5,24.1,111.2
6,7.0,Phoenix Suns*,82,240.9,43.5,88.2,0.493,0.8,3.4,0.243,...,0.773,13.1,30.0,43.0,27.8,11.1,4.2,19.9,22.6,111.1
7,8.0,Houston Rockets*,82,243.0,43.9,91.4,0.48,1.3,4.6,0.274,...,0.766,17.0,27.0,44.0,26.2,9.5,4.5,19.1,23.5,110.8
8,9.0,Milwaukee Bucks*,82,241.5,44.9,92.1,0.488,0.6,1.9,0.323,...,0.764,15.2,29.2,44.4,27.8,9.5,6.2,18.2,23.6,110.1
9,10.0,Philadelphia 76ers*,82,242.1,43.0,87.3,0.492,0.3,1.5,0.216,...,0.772,14.5,32.1,46.6,27.1,9.7,8.0,20.8,22.7,109.1


### 2.3 Stats Per Game Table

In [41]:
#Lets extract team stats table for every year
#Placeholder for stats tables (a list of per-season DataFrames, concatenated later)
df = []

for year in years:
    with open('stats/{}.html'.format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    team_stats_per_game = soup.find(id='div_per_game-team')
    #find() returns None when the id is absent; str(None) would then reach
    #read_html and fail with an unhelpful "No tables found", so name the file here
    if team_stats_per_game is None:
        raise ValueError("No 'div_per_game-team' element found in stats/{}.html".format(year))

    #StringIO wrapper: passing a literal HTML string to read_html is deprecated
    per_game = pd.read_html(StringIO(str(team_stats_per_game)))[0]
    #Tag every row with its season so the seasons can be told apart after concat
    per_game['Year'] = year

    df.append(per_game)

In [26]:
#Stack the per-season tables into one DataFrame.
#Row labels are kept as-is, so they restart at 0 for every season.
team_stats = pd.concat(df, ignore_index=False)

In [27]:
#Explore team_stats: preview the first rows of the combined per-game table
team_stats.head()

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1.0,San Antonio Spurs*,82,240.9,47.0,94.4,0.498,0.6,2.5,0.252,...,14.1,30.7,44.7,28.4,9.4,4.1,19.4,25.6,119.4,1980
1,2.0,Los Angeles Lakers*,82,242.4,47.5,89.9,0.529,0.2,1.2,0.2,...,13.2,32.4,45.6,29.4,9.4,6.7,20.0,21.8,115.1,1980
2,3.0,Cleveland Cavaliers,82,243.0,46.5,98.1,0.474,0.4,2.3,0.193,...,15.9,29.0,45.0,25.7,9.3,4.2,16.7,23.6,114.1,1980
3,4.0,New York Knicks,82,241.2,46.4,93.6,0.496,0.5,2.3,0.22,...,15.1,28.1,43.2,27.6,10.7,5.6,19.7,26.4,114.0,1980
4,5.0,Boston Celtics*,82,242.4,44.1,90.1,0.49,2.0,5.1,0.384,...,15.0,30.0,44.9,26.8,9.9,3.8,18.8,24.1,113.5,1980


In [28]:
#Some missing values in Rk: 45 of the 1299 rows have no rank (1254 non-null).
#NOTE(review): 45 equals the number of seasons loaded, so these are presumably
#the per-season 'League Average' summary rows - confirm before modelling
team_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1299 entries, 0 to 30
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      1254 non-null   float64
 1   Team    1299 non-null   object 
 2   G       1299 non-null   int64  
 3   MP      1299 non-null   float64
 4   FG      1299 non-null   float64
 5   FGA     1299 non-null   float64
 6   FG%     1299 non-null   float64
 7   3P      1299 non-null   float64
 8   3PA     1299 non-null   float64
 9   3P%     1299 non-null   float64
 10  2P      1299 non-null   float64
 11  2PA     1299 non-null   float64
 12  2P%     1299 non-null   float64
 13  FT      1299 non-null   float64
 14  FTA     1299 non-null   float64
 15  FT%     1299 non-null   float64
 16  ORB     1299 non-null   float64
 17  DRB     1299 non-null   float64
 18  TRB     1299 non-null   float64
 19  AST     1299 non-null   float64
 20  STL     1299 non-null   float64
 21  BLK     1299 non-null   float64
 22  TOV    

In [30]:
#Persist the combined per-game table as CSV.
#The row labels are written too (pandas default), as a leading unnamed column.
team_stats.to_csv('team_stats.csv', index=True)

### 2.4 Total Stats Table

In [42]:
#Lets extract total stats table for every year
#Placeholder for the per-season totals tables (concatenated later)
df2 = []

for year in years:
    with open('stats/{}.html'.format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    team_stats_total = soup.find(id='all_totals_team-opponent')
    #find() returns None when the id is absent; str(None) would then reach
    #read_html and fail with an unhelpful "No tables found", so name the file here
    if team_stats_total is None:
        raise ValueError("No 'all_totals_team-opponent' element found in stats/{}.html".format(year))

    #Only the first table inside the wrapper is kept.
    #StringIO wrapper: passing a literal HTML string to read_html is deprecated
    stats_total = pd.read_html(StringIO(str(team_stats_total)))[0]
    #Tag every row with its season so the seasons can be told apart after concat
    stats_total['Year'] = year

    df2.append(stats_total)

In [38]:
#Stack the per-season totals tables (df2) into one DataFrame.
#Row labels are kept as-is, so they restart at 0 for every season.
total_stats = pd.concat(df2, ignore_index=False)

In [40]:
#Preview the last rows of the combined totals table (the most recent season)
total_stats.tail()

Unnamed: 0,Rk,Team,G,MP,FG,FGA,FG%,3P,3PA,3P%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
26,27.0,Detroit Pistons,82,19755,3353,7236,0.463,906,2602,0.348,...,861,2692,3553,2088,531,384,1248,1688,9010,2024
27,28.0,Charlotte Hornets,82,19730,3281,7133,0.46,989,2788,0.355,...,765,2538,3303,2033,562,371,1129,1472,8740,2024
28,29.0,Portland Trail Blazers,82,19880,3227,7356,0.439,939,2723,0.345,...,1036,2469,3505,1894,627,354,1249,1654,8722,2024
29,30.0,Memphis Grizzlies,82,19780,3145,7230,0.435,1071,3097,0.346,...,896,2598,3494,2025,673,501,1236,1563,8677,2024
30,,League Average,82,19792,3458,7290,0.474,1053,2879,0.366,...,865,2705,3570,2187,613,422,1116,1536,9365,2024


In [42]:
#Some missing values in Rk (once again, no rank for league avg):
#45 of the 1299 rows have no rank (1254 non-null). These 'League Average'
#rows are still part of total_stats and will be written to the CSV as well.
total_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1299 entries, 0 to 30
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      1254 non-null   float64
 1   Team    1299 non-null   object 
 2   G       1299 non-null   int64  
 3   MP      1299 non-null   int64  
 4   FG      1299 non-null   int64  
 5   FGA     1299 non-null   int64  
 6   FG%     1299 non-null   float64
 7   3P      1299 non-null   int64  
 8   3PA     1299 non-null   int64  
 9   3P%     1299 non-null   float64
 10  2P      1299 non-null   int64  
 11  2PA     1299 non-null   int64  
 12  2P%     1299 non-null   float64
 13  FT      1299 non-null   int64  
 14  FTA     1299 non-null   int64  
 15  FT%     1299 non-null   float64
 16  ORB     1299 non-null   int64  
 17  DRB     1299 non-null   int64  
 18  TRB     1299 non-null   int64  
 19  AST     1299 non-null   int64  
 20  STL     1299 non-null   int64  
 21  BLK     1299 non-null   int64  
 22  TOV    

In [43]:
#Persist the combined totals table as CSV.
#The row labels are written too (pandas default), as a leading unnamed column.
total_stats.to_csv('total_stats.csv', index=True)

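### 2.5 Champions

**Note:** this section reads the `rank/{year}_standing.html` files that are downloaded in Section 3 (Scraping with Selenium), so that download cell must be run first.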

In [26]:
#Extracting Champions of 1980 (test)
with open('rank/1980_standing.html', encoding='utf-8') as f:
    page = f.read()
soup = BeautifulSoup(page, 'html.parser')

#Scan the paragraphs for the one whose bold label mentions 'League Champion'
champion = None
for p in soup.find_all('p'):
    label = p.find('strong')
    if not label:
        continue
    if 'League Champion' in label.text:
        #The champion's name is the text of the first link in that paragraph
        champion = p.find('a').text.strip()
        break

In [27]:
#Champion of 1980 (stays None if no 'League Champion' paragraph was found)
champion

'Los Angeles Lakers'

In [30]:
#Let's extract champions for every year

def _find_champion(soup):
    """Return the league champion named on a parsed standings page.

    Looks for the first <p> whose <strong> label contains 'League Champion'
    and returns the stripped text of the first <a> inside that paragraph.
    Returns None when there is no such paragraph or it contains no link.
    """
    for p in soup.find_all('p'):
        label = p.find('strong')
        if label and 'League Champion' in label.text:
            link = p.find('a')
            #Guard the link lookup: dereferencing .text on a missing <a>
            #would raise AttributeError and abort the whole loop
            return link.text.strip() if link else None
    return None


#Initialize an empty list to store the results
champion_data = []
#Seasons where no champion could be read, reported below instead of vanishing
missing_years = []

#Loop through each year
for year in years:
    with open('rank/{}_standing.html'.format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    champion = _find_champion(soup)

    # Append the result to the list
    if champion:
        champion_data.append({'Year': year, 'Champion': champion})
    else:
        missing_years.append(year)

if missing_years:
    print('No champion found for: {}'.format(missing_years))

# Convert the list to a DataFrame (columns given explicitly so an empty
# result still has the expected schema)
Champs = pd.DataFrame(champion_data, columns=['Year', 'Champion'])

Champs

Unnamed: 0,Year,Champion
0,1980,Los Angeles Lakers
1,1981,Boston Celtics
2,1982,Los Angeles Lakers
3,1983,Philadelphia 76ers
4,1984,Boston Celtics
5,1985,Los Angeles Lakers
6,1986,Boston Celtics
7,1987,Los Angeles Lakers
8,1988,Los Angeles Lakers
9,1989,Detroit Pistons


In [39]:
#Persist the champions table as CSV.
#The row labels are written too (pandas default), as a leading unnamed column.
Champs.to_csv('champions.csv', index=True)

### 3. Scraping with Selenium
- The goal is to extract the team standings for each year
- Use Selenium to scrape the dynamically rendered tables that contain this information

In [14]:
!pip install selenium

Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Dow



In [37]:
#Imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

url_rank = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'

#Make sure the output folder exists; open() does not create missing directories
os.makedirs('rank', exist_ok=True)

# Set up Chrome options
chrome_options = Options()

# Start ONE browser session and reuse it for every season instead of
# launching and closing Chrome once per year
driver = webdriver.Chrome(options=chrome_options)

try:
    for year in years:
        url = url_rank.format(year)

        # Open the standings page for this season
        driver.get(url)

        # Scroll down the page before capturing it
        driver.execute_script('window.scrollTo(1,10000)')

        # Wait before reading the page source (also spaces out the requests)
        time.sleep(10)

        html = driver.page_source

        with open('rank/{}_standing.html'.format(year),'w',encoding='utf-8') as f:
            f.write(html)
finally:
    # Close the browser even if a page load or file write fails part-way
    # through, so no Chrome/chromedriver process is left running
    driver.quit()
    

In [43]:
#Lets extract the expanded standings table for every year
df_rank = []

for year in years:
    with open('rank/{}_standing.html'.format(year), encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    rank_table = soup.find(id='all_expanded_standings')
    #find() returns None when the id is absent; str(None) would then reach
    #read_html and fail with an unhelpful "No tables found", so name the file here
    if rank_table is None:
        raise ValueError("No 'all_expanded_standings' element found in rank/{}_standing.html".format(year))

    #StringIO wrapper: passing a literal HTML string to read_html is deprecated
    rank = pd.read_html(StringIO(str(rank_table)))[0]
    #Tag every row with its season so the seasons can be told apart after concat
    rank['Year'] = year

    df_rank.append(rank)

In [33]:
#Stack the per-season standings tables into one DataFrame.
#Row labels are kept as-is, so they restart at 0 for every season.
team_rank = pd.concat(df_rank, ignore_index=False)

In [34]:
#Preview the combined standings; the columns still carry a two-level header here
team_rank.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Place,Place,Conference,Conference,Division,Division,Division,...,Month,Month,Year,Month,Month,Division,Division,Division,Month,Month
Unnamed: 0_level_1,Rk,Team,Overall,Home,Road,E,W,A,C,M,...,Feb,Mar,Unnamed: 14_level_1,Apr,May,SE,NW,SW,Jul,Aug
0,1,Boston Celtics,61-21,35-6,26-15,45-15,16-6,17-7,28-8,9-1,...,9-2,12-6,1980,,,,,,,
1,2,Los Angeles Lakers,60-22,37-4,23-18,18-4,42-18,8-2,10-2,23-7,...,9-2,13-3,1980,,,,,,,
2,3,Philadelphia 76ers,59-23,36-5,23-18,44-16,15-7,19-5,25-11,7-3,...,10-3,11-6,1980,,,,,,,
3,4,Seattle SuperSonics,56-26,33-8,23-18,17-5,39-21,7-3,10-2,21-9,...,9-4,9-6,1980,,,,,,,
4,5,Phoenix Suns,55-27,37-5,18-22,13-9,42-18,5-5,8-4,23-7,...,8-5,12-3,1980,,,,,,,


In [36]:
#Dropping the top-level header, keeping the lower level (Rk, Team, Overall, ...).
#'Year' was added after parsing, so its lower level is the empty string and a
#plain droplevel(0) would leave that column without a name; fall back to the
#top-level label in that case.
#The isinstance guard makes the cell safe to re-run: once the header is flat
#there is no level left to drop and droplevel would raise.
if isinstance(team_rank.columns, pd.MultiIndex):
    team_rank.columns = [lower or upper for upper, lower in team_rank.columns]
team_rank.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,M,...,Feb,Mar,Unnamed: 14,Apr,May,SE,NW,SW,Jul,Aug
0,1,Boston Celtics,61-21,35-6,26-15,45-15,16-6,17-7,28-8,9-1,...,9-2,12-6,1980,,,,,,,
1,2,Los Angeles Lakers,60-22,37-4,23-18,18-4,42-18,8-2,10-2,23-7,...,9-2,13-3,1980,,,,,,,
2,3,Philadelphia 76ers,59-23,36-5,23-18,44-16,15-7,19-5,25-11,7-3,...,10-3,11-6,1980,,,,,,,
3,4,Seattle SuperSonics,56-26,33-8,23-18,17-5,39-21,7-3,10-2,21-9,...,9-4,9-6,1980,,,,,,,
4,5,Phoenix Suns,55-27,37-5,18-22,13-9,42-18,5-5,8-4,23-7,...,8-5,12-3,1980,,,,,,,


In [38]:
#Persist the combined standings table as CSV.
#The row labels are written too (pandas default), as a leading unnamed column.
team_rank.to_csv('team_rank.csv', index=True)