In [93]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Season under analysis, identified by the calendar year in which it ends
# (2020 gives the label "2019-2020").
year = 2020
season = "{}-{}".format(year - 1, year)

# Lookup from the team abbreviation found in the player tables to the full
# team name used as the row label of the team tables.
tms = {
    "ATL": "Atlanta Hawks",
    "BAL": "Baltimore Bullets",
    "BOS": "Boston Celtics",
    "BRK": "Brooklyn Nets",
    "CHA": "Charlotte Bobcats",
    "CHH": "Charlotte Hornets",
    "CHO": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "CLE": "Cleveland Cavaliers",
    "DAL": "Dallas Mavericks",
    "DEN": "Denver Nuggets",
    "DET": "Detroit Pistons",
    "GSW": "Golden State Warriors",
    "HOU": "Houston Rockets",
    "IND": "Indiana Pacers",
    "KCK": "Kansas City Kings",
    "LAC": "Los Angeles Clippers",
    "LAL": "Los Angeles Lakers",
    "MEM": "Memphis Grizzlies",
    "MIA": "Miami Heat",
    "MIL": "Milwaukee Bucks",
    "NJN": "New Jersey Nets",
    "NOH": "New Orleans Hornets",
    "NOK": "New Orleans/Oklahoma City Hornets",
    "NOP": "New Orleans Pelicans",
    "NYK": "New York Knicks",
    "MIN": "Minnesota Timberwolves",
    "SEA": "Seattle SuperSonics",
    "OKC": "Oklahoma City Thunder",
    "ORL": "Orlando Magic",
    "PHI": "Philadelphia 76ers",
    "PHO": "Phoenix Suns",
    "POR": "Portland Trail Blazers",
    "SAC": "Sacramento Kings",
    "SAS": "San Antonio Spurs",
    "SFW": "San Francisco Warriors",
    "TOR": "Toronto Raptors",
    "UTA": "Utah Jazz",
    "VAN": "Vancouver Grizzlies",
    "WAS": "Washington Wizards",
    "WSB": "Washington Bullets",
}

In [94]:
# Scrape the season-total stats of every NBA player for the given year.

url = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html".format(year)
# Parse inside the `with` block so the HTTP response is closed once it has been read.
with urlopen(url) as html:
    soup = BeautifulSoup(html, "lxml")

table = soup.find(id="totals_stats")
table_rows = table.find_all('tr')
# Only <td> cells are collected from the body rows, so the first header label
# is skipped to keep names and values aligned (presumably the leading cell of
# each body row is a <th> -- verify against the page if the columns look shifted).
headers = [th.getText() for th in table_rows[0].find_all('th')][1:]
rows = table_rows[1:]
stats = [[td.getText() for td in tr.find_all('td')] for tr in rows]
totals = pd.DataFrame(stats, columns=headers)

# Rows that contain no <td> cells come through with every column set to None;
# they hold no player data, so drop them before converting anything.
totals = totals[totals['Tm'].notna()]

# Keep only the columns used downstream and turn the counting stats into real
# integers (the scraped cells are strings). A non-numeric cell raises ValueError.
totals = totals[['Player', 'Pos', 'Tm', 'G', 'MP', 'FGA', 'FTA', 'PTS']]
totals = totals.astype({col: int for col in ['G', 'MP', 'FGA', 'FTA', 'PTS']})

# Initial cleanup: players without a single field-goal attempt are discarded.
totals = totals[totals['FGA'] != 0].copy()

# Natural logs of minutes per game and points per game.
# A player with 0 points gets -inf here (numpy emits a RuntimeWarning).
totals['ln_MPG'] = np.log(totals['MP'] / totals['G'])
totals['ln_PPG'] = np.log(totals['PTS'] / totals['G'])

# True shooting percentage: PTS / (2 * (FGA + 0.44 * FTA)).
totals['TS%'] = totals['PTS'] / (2 * (totals['FGA'] + 0.44 * totals['FTA']))

# A player can appear on several rows; only the first row per name is kept.
totals = totals.drop_duplicates(subset=['Player'], keep='first')


# Fetch the league summary page that holds the overall team stats for the given year.

url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(year)
# Parse inside the `with` block so the HTTP response is closed once it has been read.
with urlopen(url) as html:
    soup = BeautifulSoup(html, "lxml")

# Collect every HTML comment node of the page; the team tables are looked up
# inside these comments further down.
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

def get_table(comments, s):
    """Return the first element with id ``s`` found inside the given HTML comments.

    Each comment is converted to a string and parsed as its own HTML document;
    the comments are searched in order and the first match wins.

    Parameters
    ----------
    comments : iterable
        Comment nodes (or anything whose ``str()`` is HTML markup) to search.
    s : str
        Value of the ``id`` attribute to look for.

    Returns
    -------
    The matching element, or ``None`` when no comment contains one. This
    includes the case where ``comments`` is empty, which previously raised
    ``UnboundLocalError``.
    """
    for comment in comments:
        parsed = BeautifulSoup(str(comment), 'lxml')
        table = parsed.find(id=s)
        if table is not None:
            return table
    return None

def scrape(tbl):
    """Convert a parsed HTML table into a DataFrame of cell strings.

    Column names are taken from the ``<th>`` cells of the first ``<tr>``, minus
    the first one, because only ``<td>`` cells are collected from the remaining
    rows. Rows that contain no ``<td>`` cell at all are skipped, so they no
    longer show up as all-None records.

    Parameters
    ----------
    tbl : parsed table element exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table; every value is the cell text (str).

    Raises
    ------
    ValueError
        If ``tbl`` is None, i.e. the table lookup found nothing.
    """
    if tbl is None:
        raise ValueError("scrape() received no table (lookup returned None)")

    table_rows = tbl.find_all('tr')  # parse the row list once
    headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]

    records = []
    for tr in table_rows[1:]:
        cells = [td.get_text() for td in tr.find_all('td')]
        if cells:  # rows without <td> cells carry no data
            records.append(cells)

    return pd.DataFrame(records, columns=headers)

# Team totals: parse the 'team-stats-base' table found inside the page comments.
tm_totals = scrape(get_table(comments, 'team-stats-base'))
# Some team names carry a trailing '*' marker; strip it so the names match `tms`.
# rstrip is a no-op for names without the marker (and safe on empty strings).
tm_totals['Team'] = tm_totals['Team'].str.rstrip('*')
# Scraped cells are strings; points and games are needed as integers.
tm_totals = tm_totals.astype({'PTS': int, 'G': int})

tm_totals = tm_totals.set_index('Team')
tm_totals = tm_totals.drop(['League Average'], axis=0)

# League-wide points per team game.
nba_avg = tm_totals['PTS'].sum() / tm_totals['G'].sum()

# Fewest games played by any team; used as the baseline for 'TOT' players.
min_games = tm_totals['G'].min()

# Eligibility thresholds for the player sample.
MIN_GAMES_SHARE = 0.6  # a player must appear in at least this share of the team's games
MIN_MPG = 20           # ... and average at least this many minutes per game

totals = totals.assign(Player=totals['Player'].str.rstrip('*'))

# Cast explicitly: the count columns may hold Python ints in object-dtype columns.
games = totals['G'].astype(float)
mpg = totals['MP'].astype(float) / games

# Games played by each player's team. Rows labelled 'TOT' are not tied to one
# team, so they are measured against the smallest team game count instead.
on_one_team = totals['Tm'] != 'TOT'
team_games = totals['Tm'].map(tms).map(tm_totals['G']).where(on_one_team, min_games)

# An abbreviation missing from `tms`, or a team missing from the team table,
# leaves a gap here; fail loudly instead of silently keeping the player.
unresolved = team_games.isna()
if unresolved.any():
    raise KeyError("no team game count found for: {}".format(
        sorted(set(totals.loc[unresolved, 'Tm']), key=str)))

too_few_games = games < MIN_GAMES_SHARE * team_games.astype(float)
too_few_minutes = mpg < MIN_MPG
totals = totals[~(too_few_games | too_few_minutes)]

totals = totals.set_index('Player')



In [96]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Per-game scoring and playing time. The count columns are cast explicitly
# because they may be stored as object dtype (Python ints) upstream.
totals['PPG'] = totals['PTS'].astype(float) / totals['G'].astype(float)
totals['MPG'] = totals['MP'].astype(float) / totals['G'].astype(float)

# Ordinary least squares: points per game as a linear function of minutes per game.
model = LinearRegression()

# scikit-learn expects 2-D arrays: one row per player, one column per variable.
x = totals['MPG'].to_numpy().reshape((-1, 1))
y = totals['PPG'].to_numpy().reshape((-1, 1))

model.fit(x, y)

y_pred = model.predict(x)

# Scatter of the players with the fitted line on top; the line is drawn in
# ascending MPG order so it is traced once from left to right.
order = np.argsort(x[:, 0])
fig, ax = plt.subplots()
ax.scatter(x[:, 0], y[:, 0], label='players')
ax.plot(x[order, 0], y_pred[order, 0], color='tab:red', label='linear fit')
ax.set_xlabel('Minutes per game (MPG)')
ax.set_ylabel('Points per game (PPG)')
ax.set_title('Points vs. minutes per game')
ax.legend()

print('coefficient of determination:', model.score(x, y))

KeyError: 'MPG'