## Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import re 
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from time import sleep

## Main link and displaying elements to scrape

In [2]:
main_link = 'https://www.basketball-reference.com/leagues/NBA_2021_per_game.html'
user_agent = {'User-agent': 'Chrome'}
response = requests.get(main_link, timeout=15, headers= user_agent)
soup = BeautifulSoup(response.content, 'html.parser')
elements = soup.find_all('div', {'class':"table_container"})

In [7]:
elements[0]

<div class="table_container" id="div_per_game_stats">
<table class="sortable stats_table" data-cols-to-freeze=",2" data-non-qual="1" data-qual-label=" When table is sorted, hide non-qualifiers for rate stats" data-qual-text="" id="per_game_stats">
<caption>Player Per Game Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label="Rank" class="ranker poptip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>
<th aria-label="Player" class="poptip sort_default_asc center" data-stat="player" scope="col">Player</th>
<th aria-label="Position" class="poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>
<th aria-label="Player's age on February 1 of the season" class="poptip sort_default_asc center" data-stat="age" data-t

## Get data for 1960-2021

In [7]:
dates = [year for year in range(2021, 1959, -1)]
url ='https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
lst_pages=[]
for date in dates:
    lst_pages.append(url.format(date))
lst_pages

['https://www.basketball-reference.com/leagues/NBA_2021_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2019_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2017_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2016_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2015_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2014_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2013_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2012_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2011_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2010_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2009_per_game.html',
 'https://www.basketball-reference.com/leagues/NBA_2008_per_game

In [8]:
dates = [year for year in range(2021, 1959, -1)]
url ='https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'
lst_pages=[]
for date in dates:
    lst_pages.append(url.format(date))
    
cols = ['rank', 'player', 'pos', 'age', 'team', 'games', 'gs', 'mp', 'fg', 'fga', 'fg_perc', 'p3', 'p3a', 'p3_perc', 'p2', 'p2a',
      'p2_perc', 'efg_perc', 'ft', 'fta', 'ft_perc', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'date']
data = pd.DataFrame(columns=cols)


for i in tqdm(range(len(lst_pages))):
    sleep=1
    user_agent={'User-agent': 'Chrome'}
    response = requests.get(lst_pages[i], timeout=15, headers= user_agent)
    soup = BeautifulSoup(response.content, 'html.parser')
    elements = soup.find_all('div', {'class':"table_container"})[0].find_all('tr', class_='full_table')
    date = dates[i]
    for x in range(len(elements)):
        try:
            rank = elements[x].find('th', class_='right').get_text()
        except:
            rank = np.nan
            
        try:
            player = elements[x].find('td', class_='left').get_text()
        except:
            player = np.nan
            
        try:
            pos = elements[x].find('td', class_='center').get_text()
        except:
            pos = np.nan
        
        try:
            age = elements[x].find('td', class_='right').get_text()
        except:
            age = np.nan
            
        try:
            team = elements[x].find_all('td', class_='left')[-1].get_text()
        except:
            team = np.nan
        
        try:
            games = elements[x].find_all('td', class_='right')[1].get_text()
        except:
            games = np.nan
            
        try:
            gs = elements[x].find_all('td', class_='right')[2].get_text()
        except:
            gs = np.nan
            
        try:
            mp = elements[x].find_all('td', class_='right')[3].get_text()
        except:
            mp = np.nan
            
        try:
            fg = elements[x].find_all('td', class_='right')[4].get_text()
        except:
            fg = np.nan
            
        try:
            fga = elements[x].find_all('td', class_='right')[5].get_text()
        except:
            fga = np.nan
            
        try:
            fg_perc = elements[x].find_all('td', class_='right')[6].get_text()
        except:
            fg_perc = np.nan
            
        try:
            p3 = elements[x].find_all('td', class_='right')[7].get_text()
        except:
            p3 = np.nan
            
        try:
            p3a = elements[x].find_all('td', class_='right')[8].get_text()
        except:
            p3a =np.nan
            
        try:
            p3_perc = elements[x].find_all('td', class_='right')[9].get_text()
        except:
            p3_perc = np.nan
            
        try:
            p2 = elements[x].find_all('td', class_='right')[10].get_text()
        except:
            p2 = np.nan
        
        try:
            p2a = elements[x].find_all('td', class_='right')[11].get_text()
        except:
            p2a = np.nan
            
        try:
            p2_perc = elements[x].find_all('td', class_='right')[12].get_text()
        except:
            p2_perc = np.nan
            
        try:
            efg_perc = elements[x].find_all('td', class_='right')[13].get_text()
        except:
            efg_perc = np.nan
        
        try:
            ft = elements[x].find_all('td', class_='right')[14].get_text() 
        except:
            ft = np.nan
            
        try:
            fta = elements[x].find_all('td', class_='right')[15].get_text() 
        except:
            fta = np.nan
            
        try:
            ft_perc = elements[x].find_all('td', class_='right')[16].get_text() 
        except:
            ft_perc = np.nan
            
        try:
            orb = elements[x].find_all('td', class_='right')[17].get_text() 
        except:
            orb = np.nan
        
        try:
            drb = elements[x].find_all('td', class_='right')[18].get_text() 
        except:
            drb = np.nan
            
        try:
            trb = elements[x].find_all('td', class_='right')[19].get_text()
        except:
            trb = np.nan
            
        try:
            ast = elements[x].find_all('td', class_='right')[20].get_text()
        except:
            ast = np.nan
            
        try:
            stl = elements[x].find_all('td', class_='right')[21].get_text()
        except:
            stl = np.nan
            
        try:
            blk = elements[x].find_all('td', class_='right')[22].get_text()
        except:
            blk = np.nan
            
        try:
            tov = elements[x].find_all('td', class_='right')[23].get_text()
        except:
            tov = np.nan
            
        try:
            pf = elements[x].find_all('td', class_='right')[24].get_text()
        except:
            pf = np.nan
            
        try:
            pts = elements[x].find_all('td', class_='right')[25].get_text()
        except:
            pts = np.nan
             
        temp = {
            'rank': rank,
            'player': player,
            'pos': pos,
            'age': age,
            'team': team,
            'games': games,
            'gs': gs,
            'mp': mp,
            'fg': fg,
            'fga': fga,
            'fg_perc': fg_perc,
            'p3': p3,
            'p3a': p3a,
            'p3_perc': p3_perc,
            'p2': p2,
            'p2_perc': p2_perc,
            'efg_perc': efg_perc,
            'ft': ft,
            'fta': fta,
            'ft_perc': ft_perc,
            'orb': orb,
            'drb': drb,
            'trb': trb,
            'ast': ast,
            'stl': stl,
            'blk': blk,
            'tov': tov,
            'pf': pf,
            'pts': pts,
            'date': date
        }
        data = data.append(temp, ignore_index=True)

data

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(lst_pages))):


  0%|          | 0/62 [00:00<?, ?it/s]

Unnamed: 0,rank,player,pos,age,team,games,gs,mp,fg,fga,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,date
0,1,Precious Achiuwa,PF,21,MIA,61,4,12.1,2.0,3.7,...,1.2,2.2,3.4,0.5,0.3,0.5,0.7,1.5,5.0,2021
1,2,Jaylen Adams,PG,24,MIL,7,0,2.6,0.1,1.1,...,0.0,0.4,0.4,0.3,0.0,0.0,0.0,0.1,0.3,2021
2,3,Steven Adams,C,27,NOP,58,58,27.7,3.3,5.3,...,3.7,5.2,8.9,1.9,0.9,0.7,1.3,1.9,7.6,2021
3,4,Bam Adebayo,C,23,MIA,64,64,33.5,7.1,12.5,...,2.2,6.7,9.0,5.4,1.2,1.0,2.6,2.3,18.7,2021
4,5,LaMarcus Aldridge,C,35,TOT,26,23,25.9,5.4,11.4,...,0.7,3.8,4.5,1.9,0.4,1.1,1.0,1.8,13.5,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21252,95,Jack Twyman*,SF,25,CIN,75,,40.3,11.6,27.5,...,,,8.9,3.5,,,,3.7,31.2,1960
21253,96,Charlie Tyra,C,24,NYK,74,,27.5,5.5,12.9,...,,,8.1,1.1,,,,3.5,12.8,1960
21254,97,Win Wilfong,PG,26,CIN,72,,27.7,3.9,10.6,...,,,4.9,3.7,,,,3.2,10.1,1960
21255,98,Tony Windis,PG,27,DET,9,,21.4,1.8,6.7,...,,,5.2,3.6,,,,2.2,4.0,1960


### check for nan values(values that were not scraped)

In [15]:
for key in data.keys():
    print(100*data[key].isna().value_counts()/len(data[key]))

False    100.0
Name: rank, dtype: float64
False    100.0
Name: player, dtype: float64
False    100.0
Name: pos, dtype: float64
False    100.0
Name: age, dtype: float64
False    100.0
Name: team, dtype: float64
False    100.0
Name: games, dtype: float64
False    100.0
Name: gs, dtype: float64
False    100.0
Name: mp, dtype: float64
False    100.0
Name: fg, dtype: float64
False    100.0
Name: fga, dtype: float64
False    100.0
Name: fg_perc, dtype: float64
False    100.0
Name: p3, dtype: float64
False    100.0
Name: p3a, dtype: float64
False    100.0
Name: p3_perc, dtype: float64
False    100.0
Name: p2, dtype: float64
True    100.0
Name: p2a, dtype: float64
False    100.0
Name: p2_perc, dtype: float64
False    100.0
Name: efg_perc, dtype: float64
False    100.0
Name: ft, dtype: float64
False    100.0
Name: fta, dtype: float64
False    100.0
Name: ft_perc, dtype: float64
False    100.0
Name: orb, dtype: float64
False    100.0
Name: drb, dtype: float64
False    100.0
Name: trb, dtype: flo

### Export data to csv file

In [16]:
data.to_csv('NBA_PLAYERS.csv', encoding='utf-8', index=False)