In [1]:
import requests
import bs4
import pandas as pd
import numpy as np
from difflib import get_close_matches

### Get traditional data
### source [ESPN](https://www.espn.com/nba/)

In [2]:
def get_ESPN(season):
    teams = ['atl','bkn','bos','cha','chi','cle','dal','den','det','gs','hou','ind',
            'lac','lal','mem','mia','mil','min','no','ny','okc','orl','phi','phx','por',
            'sac','sa','tor','utah','wsh']
    url = "https://www.espn.com/nba/team/stats/_/name/"
    season_type = f"/season/{season}/seasontype/2" # regular season
    class_id = "Table Table--align-right"
    data = []
    for team in teams:
        # web scrapping
        team_info = requests.get(url+team+season_type)
        soup = bs4.BeautifulSoup(team_info.text,"html.parser")
        # get stats tables
        tables = soup.select('table', attrs={'class':class_id})
        tables = pd.read_html(str(tables))
        # process player info
        names = []
        positions = []
        for player in tables[0].values[:-1]:
            info = player[0].replace('*','').split()
            names.append(' '.join(info[:-1]))
            positions.append(info[-1])
        names.append('Total')
        positions.append(team)
        player_info = {'Name': names, 'Pos': positions}
        player_info = pd.DataFrame(player_info)
        # integrate data
        data.append(pd.concat([player_info,tables[1],tables[3]],axis=1))
    Data = pd.concat([d for d in data])
    return Data

### Get Double-Doubles

In [3]:
def get_dd(season, Names_espn):
    url = "https://www.landofbasketball.com/year_by_year_stats/"
    stats_type = "_double_doubles_rs.htm"
    page = requests.get(url+f'{season-1}_{season}'+stats_type)
    soup = bs4.BeautifulSoup(page.text,"html.parser")
    # get stats tables
    class_id = "color-alt sobre a-center" 
    tables = soup.select('table', attrs={'class':class_id})
    tables = pd.read_html(str(tables))
    data = tables[0]
    data = data.iloc[2:-1].drop(0,axis=1)[[1,2]]
    data.columns = ['Name', 'DD']
    data = data[data['DD'] != "Double-Doubles"]
    # process names to align with ESPN
    names = []
    for player in data['Name']:
        info = player.split(' (')
        name = info[0]
        # name resolution: e.g. C.J. McCollum -> CJ McCollum
        if name not in Names_espn:
            candidates = get_close_matches(name, Names_espn)
            if candidates:
                names.append(candidates[0])
                print(f'change {name} -> {candidates[0]}')
            else:
                print(f'Did not find a match, who is {name} ?!')
        else:
            names.append(name)
    data['Name'] = names
    return data

### Get Hollinger statistics

In [4]:
def get_Hollinger(season, Names_espn):
    url = "http://insider.espn.com/nba/hollinger/statistics/_/sort/usageRate/page/"
    # get number of pages
    page = requests.get(url+f'{1}/year/{season}')
    soup = bs4.BeautifulSoup(page.text,"html.parser")
    class_id = "page-numbers"
    number_of_pages = soup.findAll("div",{'class':class_id})
    number_of_pages = int(str(number_of_pages).split("</")[0][~1:])
    # get stats
    data = []
    class_id = "tablehead"
    for p in range(1,number_of_pages+1):
        page = requests.get(url+f'{p}/year/{season}')
        soup = bs4.BeautifulSoup(page.text,"html.parser")
        # get stats table
        table = soup.select('table', attrs={'class':class_id})
        table = pd.read_html(str(table))
        table = table[0]
        table.drop(0, axis=1, inplace=True)
        # get columns
        if p == 1:
            columns = table.iloc[1].values
        # filter
        table = table[[b.isnumeric() for b in table[2]]]
        table.reset_index(drop=True,inplace=True)
        # process player info
        names = []
        for player in table[1].values:
            name = player.split(',')[0]
            # name resolution: e.g. C.J. McCollum -> CJ McCollum
            if name not in Names_espn:
                candidates = get_close_matches(name, Names_espn)
                if candidates:
                    names.append(candidates[0])
                    print(f'change {name} -> {candidates[0]}')
                else:
                    print(f'Did not find a match, who is {name} ?!')
            else:
                names.append(name)
        data.append(pd.concat([pd.Series(names),table.drop(1,axis=1)],axis=1))
    Data = pd.concat([d for d in data])
    Data.columns = columns
    return Data

### Get data starting from 2012-2013 season

In [12]:
for season in range(2013,2021):
    print("="*40)
    print(f"Getting statistics from {season-1}-{season} season\n")
    # ESPN
    Data = get_ESPN(season)
    Data.to_csv(f'data/{season}_season_stats_traditional.csv', encoding='utf-8', index=False)
    # will align all names to ESPN
    Names_espn = Data['Name'].values
    # double-double
    data = get_dd(season, Names_espn)
    data.to_csv(f'data/{season}_season_stats_dd.csv', encoding='utf-8', index=False)
    # Hollinger
    Data = get_Hollinger(season, Names_espn)
    Data.to_csv(f'data/{season}_season_stats_Hollinger.csv', encoding='utf-8', index=False)

Getting statistics from 2012-2013 season

change J.J. Hickson -> JJ Hickson
change J.R. Smith -> JR Smith
change Gustavo Ayon -> Gustavo Alfonso Ayon
change Gerald Henderson Jr. -> Gerald Henderson
change Mike Dunleavy Jr. -> Mike Dunleavy
change Viacheslav Kravtsov -> Slava Kravtsov
change Nene  -> Nene
Getting statistics from 2013-2014 season

change J.J. Hickson -> JJ Hickson
change Gustavo Ayon -> Gustavo Alfonso Ayon
change J.R. Smith -> JR Smith
change Nene  -> Nene
Getting statistics from 2014-2015 season

change J.J. Hickson -> JJ Hickson
change Marcus Morris -> Marcus Morris Sr.
change Gerald Henderson Jr. -> Gerald Henderson
change Robbie Hummel -> Robbie John Hummel
change Jerome Jordan -> Jerome Adolphus Jordan
change K.J. McDaniels -> KJ McDaniels
change Nene  -> Nene
Getting statistics from 2015-2016 season

change Otto Porter -> Otto Porter Jr.
change Marcus Morris -> Marcus Morris Sr.
change J.J. Hickson -> JJ Hickson
change C.J. McCollum -> CJ McCollum
change J.R. Smit