In [13]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np

# `year` is the calendar year in which the season ends; `season` is the
# "<start>-<end>" label derived from it.
year = 2020
season = "{}-{}".format(year - 1, year)

# Abbreviation -> full team name for the teams active in the chosen season.
# Comment or uncomment entries to match the season being scraped.
tms = {
    "ATL": "Atlanta Hawks",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    # "CHA": "Charlotte Bobcats",
    # "CHH": "Charlotte Hornets",
    "CHO": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    # "KCK": "Kansas City Kings",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    # "NJN": "New Jersey Nets",
    # "NOH": "New Orleans Hornets",
    # "NOK": "New Orleans/Oklahoma City Hornets",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "MIN": "Minnesota Timberwolves",
    # "SEA": "Seattle SuperSonics",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    # "VAN": "Vancouver Grizzlies",
    "WAS": "Washington Wizards",
    # "WSB": "Washington Bullets",
}

In [14]:
# Get the season totals of every NBA player in the league for the given year

url = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html".format(year)
# Read the page inside a context manager so the HTTP response is closed.
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, "lxml")

table = soup.find(id = "totals_stats")
if table is None:
    raise ValueError("No 'totals_stats' table found at " + url)

table_rows = table.find_all('tr')
# The first header cell is skipped because the data rows below are built from
# <td> cells only (presumably the leading column is a <th> in every row -- confirm).
headers = [th.getText() for th in table_rows[0].find_all('th')][1:]
rows = table_rows[1:]
stats = [[td.getText() for td in tr.find_all('td')] for tr in rows]
totals = pd.DataFrame(stats, columns = headers)

# Rows that contained no <td> cells come out with a missing 'Tm'; discard them.
totals = totals[totals['Tm'].notna()]

# Convert the counting stats in one pass so they become real integer columns.
# astype(int) still raises on a malformed value instead of hiding it.
count_cols = ['G', 'MP', 'FG', 'FGA', 'FT', 'FTA', 'PTS']
totals = totals.astype({col: int for col in count_cols})

totals = totals[['Player','Pos','Tm','G','MP','FGA','FTA','PTS']]

# initial cleanup: players without a field-goal attempt are removed
totals = totals[totals['FGA'] != 0].copy()

totals['TS%'] = totals['PTS'] / (2 * (totals['FGA'] + 0.44*totals['FTA']))

# Keep only the first row listed for each player name.
totals = totals.drop_duplicates(subset = ['Player'], keep='first')

In [15]:
# Helper functions to get the average team stats for games that players have played

import unidecode
import time

# Helpers for turning a player's game log into the average points scored by
# the player's team in the games that count as played.

# Team point totals already scraped, keyed by (date code, team abbreviation),
# so a box score shared by several players is only downloaded once.
_TEAM_POINTS_CACHE = {}

# Values of the 'GS' column that mark a game the player did not appear in.
# NOTE(review): any other status text is treated as a played game -- confirm
# this list is exhaustive for the seasons being scraped.
_NOT_PLAYED = ('Inactive', 'Did Not Play', 'Did Not Dress', 'Player Suspended')


def _fetch_box_score(date_code, team, opp):
    """Return the raw HTML of the box-score page for one game.

    The page address is built from the date and a single team abbreviation.
    `team` is tried first; if that request fails for any reason the address
    built from `opp` is tried, and a failure of that second request propagates
    to the caller.
    """
    base = "https://www.basketball-reference.com/boxscores/{}0".format(date_code)
    try:
        with urlopen(base + team + ".html") as response:
            return response.read()
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel
        # stops the scrape instead of silently triggering the fallback.
        with urlopen(base + opp + ".html") as response:
            return response.read()


def _team_points(html, team):
    """Return the integer 'PTS' value from the last row of `team`'s basic box-score table."""
    soup = BeautifulSoup(html, "lxml")
    table = soup.find(id = "box-" + team + "-game-basic")
    table_rows = table.find_all('tr')
    # The column names are read from the second <tr>; the first header cell is skipped
    # because data rows are built from <td> cells only.
    headers = [th.getText() for th in table_rows[1].find_all('th')][1:]
    stats = [[td.getText() for td in tr.find_all('td')] for tr in table_rows[1:]]
    game = pd.DataFrame(stats, columns = headers)
    return int(game['PTS'].iloc[-1])


def get_stats(gm_log):
    """Average the team's points over the games in a player's game log.

    Parameters
    ----------
    gm_log : pandas.DataFrame
        Game log with at least the columns 'Date' (YYYY-MM-DD text), 'Tm',
        'Opp' and 'GS'.

    Returns
    -------
    (tm_avg_pts, teams)
        `tm_avg_pts` is the mean of the team's point totals over every row that
        has a team and whose 'GS' value is not one of the not-played statuses;
        `teams` lists the distinct 'Tm' values of those rows in order of first
        appearance. When no row qualifies, `tm_avg_pts` is None (instead of a
        ZeroDivisionError) and `teams` is empty.
    """
    teams = []
    tm_points = []

    for _, row in gm_log.iterrows():
        team = row['Tm']
        if pd.isna(team) or row['GS'] in _NOT_PLAYED:
            continue

        date_code = row['Date'].replace("-", "")
        key = (date_code, team)
        if key not in _TEAM_POINTS_CACHE:
            html = _fetch_box_score(date_code, team, row['Opp'])
            _TEAM_POINTS_CACHE[key] = _team_points(html, team)
        tm_points.append(_TEAM_POINTS_CACHE[key])

        if team not in teams:
            teams.append(team)

    if not tm_points:
        return None, teams

    tm_avg_pts = sum(tm_points) / len(tm_points)
    return tm_avg_pts, teams


def _name_from_title(title):
    """Return the part of a page title before its first numeric character, minus one trailing character.

    An empty or missing title, or one without any numeric character, yields "".
    """
    if not title:
        return ""
    for pos, ch in enumerate(title):
        if ch.isnumeric():
            return title[:pos][:-1]
    return ""


# getting the game_log for a player
def get_url(p_init, year):
    """Locate a player's game-log page for `year` and summarise it with get_stats.

    The page address is guessed from the name: up to five letters of the second
    word, up to two letters of the first word, and a two-digit counter starting
    at 01. The counter is increased until the name parsed from the page title
    equals `p_init`.

    Parameters
    ----------
    p_init : str
        Player name exactly as it should appear in the page title.
    year : int
        Season year placed at the end of the game-log address.

    Returns
    -------
    (tm_avg_pts, teams)
        The result of get_stats on the scraped game log, or (None, None) when
        the name has fewer than two words, a page request fails, or the page
        has no usable 'pgl_basic' table. Every failure path returns two values
        so callers can always unpack the result the same way.
    """
    p = unidecode.unidecode(p_init)
    for ch in ('-', '.', '\''):
        p = p.replace(ch, "")
    p = p.lower().split()
    if len(p) < 2:
        return None, None
    code = p[1][:5] + p[0][:2]

    n = 1
    while True:
        # Zero-pad the counter to two digits ("01" ... "09", "10", ...).
        url = ("https://www.basketball-reference.com/players/" + p[1][0] + "/" + code
               + "{:02d}".format(n) + "/gamelog/{}".format(year))
        try:
            with urlopen(url) as response:
                html = response.read()
        except Exception:
            # Best effort: a failed request means no page for this player.
            # `Exception` keeps a kernel interrupt from being swallowed.
            return None, None

        soup = BeautifulSoup(html, "lxml")
        title = soup.title.string if soup.title is not None else None
        if _name_from_title(title) == p_init:
            break
        n += 1

    table = soup.find(id = "pgl_basic")
    if table is None:
        return None, None
    table_rows = table.find_all('tr')
    if not table_rows:
        return None, None

    # The first header cell is skipped because data rows are built from <td> cells only.
    headers = [th.getText() for th in table_rows[0].find_all('th')][1:]
    stats = [[td.getText() for td in tr.find_all('td')] for tr in table_rows[1:]]
    gm_log = pd.DataFrame(stats, columns = headers)
    return get_stats(gm_log)

In [16]:
# Getting overall team stats for a given year

url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(year)
# Read the page inside a context manager so the HTTP response is closed.
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, "lxml")

# Collect every HTML comment node; get_table searches inside them for a table id.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

def get_table(comments,s):
    """Return the first element with id `s` found inside any of `comments`.

    Each comment is parsed as its own HTML fragment and searched for the id.
    Returns None when no comment contains it, including when `comments` is
    empty (which previously raised UnboundLocalError).
    """
    for comment in comments:
        table = BeautifulSoup(str(comment), 'lxml').find(id = s)
        if table:
            return table
    return None

def scrape(tbl):
    """Convert an HTML table element into a DataFrame of cell text.

    Column names are the <th> texts of the first <tr>, without the first one.
    Every following <tr> contributes one record made of its <td> texts.
    """
    all_rows = tbl.find_all('tr')
    header_row, body_rows = all_rows[0], all_rows[1:]

    column_names = [cell.getText() for cell in header_row.findAll('th')][1:]

    records = []
    for tr in body_rows:
        records.append([cell.getText() for cell in tr.findAll('td')])

    return pd.DataFrame(records, columns = column_names)

tm_totals = scrape(get_table(comments, 'team-stats-base'))

# Convert the counting columns in one pass so they become real integer columns;
# astype(int) raises on a malformed value rather than hiding it.
tm_totals = tm_totals.astype({'PTS': int, 'G': int})

# Strip any trailing '*' from team names so they match the full names stored in `tms`.
tm_totals['Team'] = tm_totals['Team'].str.rstrip('*')

tm_totals = tm_totals.set_index('Team')
# The 'League Average' row is not a team and must not enter the sums below.
tm_totals = tm_totals.drop(['League Average'], axis=0)

# League-wide points per team-game.
nba_avg = tm_totals['PTS'].sum() / tm_totals['G'].sum()

In [17]:
# Remove players who appeared in fewer than 60% of their team's games or who
# averaged fewer than 20 minutes per game.
# NOTE(review): this comment used to say 15 minutes while the code compared
# against 20; the code's value is kept -- confirm the intended cutoff.

MIN_GAMES_FRACTION = 0.6
MIN_MINUTES_PER_GAME = 20

min_games = tm_totals['G'].min()

totals['Player'] = totals['Player'].str.rstrip("*")

# Games played by each player's team. Rows labelled 'TOT' have no single team,
# so the smallest team game count in the league is used for them.
team_games = pd.Series(
    [min_games if tm == 'TOT' else tm_totals.loc[tms[tm], 'G'] for tm in totals['Tm']],
    index = totals.index,
    dtype = float,
)

# Cast explicitly so the comparisons work whether the columns hold Python ints or numpy ints.
games = totals['G'].astype(int)
minutes = totals['MP'].astype(int)

enough_games = games >= MIN_GAMES_FRACTION * team_games
enough_minutes = (minutes / games) >= MIN_MINUTES_PER_GAME

totals = totals[enough_games & enough_minutes]

totals = totals.set_index('Player')

In [None]:
start = time.time()

filenumber = 1

# One row per remaining player; rows of players that cannot be scraped stay empty.
output = pd.DataFrame(index = totals.index, columns = ['Season','MPG','PPG','TS%','Tm_PPG','Lg_Avg_PPG'])
num = 1
for index, row in totals.iterrows():
    g = row['G']
    mp = row['MP']
    pts = row['PTS']
    ts = row['TS%']

    # Slice to two values so a longer failure tuple from get_url cannot break the unpacking.
    tm_avg_pts, teams = get_url(index, year)[:2]

    if tm_avg_pts is None:
        # Record the player name so missing players can be reviewed later.
        with open("missing_players"+str(year)+".txt", "a", encoding = 'utf-8') as missing_file:
            missing_file.write(index+"\n")
        continue

    # League average recomputed without the team(s) the player appeared for.
    other_teams = tm_totals.drop([tms[t] for t in teams])
    nba_avg = other_teams['PTS'].sum() / other_teams['G'].sum()

    output.loc[index, 'Season'] = season
    output.loc[index, 'PPG'] = pts/g
    output.loc[index, 'MPG'] = mp/g
    output.loc[index, 'TS%'] = ts
    output.loc[index, 'Tm_PPG'] = tm_avg_pts
    output.loc[index, 'Lg_Avg_PPG'] = nba_avg

    print("player executed " + str(num))
    num += 1

In [None]:
# Write the results to an Excel workbook named "<season>_<filenumber>.xlsx"

excel_name = "{}_{}.xlsx".format(season, filenumber)
output.to_excel(excel_name)