In [None]:
import calendar
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from matplotlib import pyplot as plt

In [None]:
def scrape_wins(month, year, url):
  """Scrape one monthly schedule page into a DataFrame.

  @args [month] Month of NBA data to be looked at, plugged into the template url as {1}.
  @args [year] Year of NBA data to be looked at, plugged into the template url as {0}.
  @args [url] Template url with positional placeholders {0} (year) and {1} (month).
  @returns DataFrame with one row per table row that has data cells; the column
           names are the cells of the first header row minus its first entry.
  @raises IndexError if the page contains no <tr> element at all.
  """
  url = url.format(year, month)
  # The context manager closes the HTTP response once the page is parsed.
  with urlopen(url) as html:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed, and bs4 does not warn about guessing.
    soup = BeautifulSoup(html, "html.parser")

  header_row = soup.find_all('tr', limit=2)[0]
  # Drop the first header cell: the rows below collect only <td> cells, so the
  # header list has to line up with those (presumably the leading cell of each
  # data row is a <th> -- confirm against the page).
  headers = [th.get_text() for th in header_row.find_all('th')][1:]

  player_stats = []
  # avoid the first header row
  for row in soup.find_all('tr')[1:]:
    cells = [td.get_text() for td in row.find_all('td')]
    # Rows without any <td> (e.g. a repeated header or a separator row) would
    # otherwise become all-None records in the frame, so they are skipped.
    if cells:
      player_stats.append(cells)

  return pd.DataFrame(player_stats, columns=headers)

def scrape_players(abbr,year, url):
  """Scrape a team's season page and return the first three rows of one of its tables.

  The tables are read from the HTML comments of the page: every comment that
  contains the text 'table' is handed to pandas, and comments that cannot be
  parsed are skipped.

  @args [abbr] Abbreviation of NBA team to be looked at, plugged into the template url as {0}.
  @args [year] Year of NBA data to be looked at, plugged into the template url as {1}.
  @args [url] Template url with positional placeholders {0} (team) and {1} (year).
  @returns DataFrame holding the first 3 rows of the selected table.
  @raises IndexError if fewer tables than expected were found in the comments.
  """
  url = url.format(abbr, year)
  # The context manager closes the HTTP response once the page is parsed.
  with urlopen(url) as html:
    # Explicit parser: avoids bs4 guessing (and warning) based on what is installed.
    soup = BeautifulSoup(html, "html.parser")

  # finds per game player stats for the team and season passed in for the url
  comments = soup.find_all(string=lambda text: isinstance(text, Comment))
  tables = []
  for each in comments:
    if 'table' in each:
      try:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        tables.append(pd.read_html(StringIO(str(each)))[0])
      except Exception:
        # Best effort: a comment that merely mentions 'table' is not an error.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        continue

  # Position of the wanted table among the parsed comment tables. These teams
  # carry one extra table ahead of it (indices taken from the original code;
  # NOTE(review): tied to the page layout at scrape time -- confirm).
  table_index = 5 if abbr in ['BOS','CHI','DAL','ORL','POR'] else 4
  if table_index >= len(tables):
    raise IndexError(
        "expected at least {0} comment tables for {1}, found {2}".format(
            table_index + 1, abbr, len(tables)))
  # returns top 3 players from those tables
  return tables[table_index].head(3)


In [None]:
# Template of a monthly schedule page: {0} is the season year, {1} the
# lower-case month name.
SCHEDULE_URL = "https://www.basketball-reference.com/leagues/NBA_{0}_games-{1}.html"
# Months for which no schedule page is requested.
SKIPPED_MONTHS = ('april', 'may', 'june')

# Lower-case month names in calendar order, minus the skipped ones.
season = [name.lower() for name in list(calendar.month_name)[1:13]]
season = [name for name in season if name not in SKIPPED_MONTHS]

# Scrape every month first and concatenate once at the end; growing a frame
# with pd.concat inside the loop re-copies all earlier rows on each pass.
monthly_frames = []
for month in season:
  monthly_frames.append(scrape_wins(month, 2020, SCHEDULE_URL))
sched_2020 = pd.concat(monthly_frames)

sched_2020.to_csv("2020_season_sched.csv")


In [None]:
# Team abbreviations as used in the team-page urls. The order matters: the
# cleaning step labels the scraped rows by walking this list in order.
teams = [
    'ATL', 'BRK', 'BOS', 'CHO', 'CHI', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]

# Template of a team season page: {0} is the team abbreviation, {1} the year.
TEAM_URL = "https://www.basketball-reference.com/teams/{0}/{1}.html#per_game::none"

# Scrape every team first and concatenate once at the end; growing a frame
# with pd.concat inside the loop re-copies all earlier rows on each pass.
team_frames = []
for team in teams:
  team_frames.append(scrape_players(team, 2020, TEAM_URL))
players_2020 = pd.concat(team_frames)

players_2020.to_csv("top_3_each_team.csv")


In [None]:
def clean_data_players(filename, teamlist):
  """Load the scraped player csv, trim it to the kept columns and label rows by team.

  @args [filename] csv of raw dataframe of scraped data.
  @args [teamlist] Team abbreviations; each one is repeated three times, in
                   order, to build the values of the added "Team" column.
  @returns Clean dataframe with relevant features and a trailing "Team" column
  """
  # One label per row: three consecutive entries for every team.
  labels = pd.Series([team for team in teamlist for _ in range(3)])

  frame = pd.read_csv(filename).fillna(0)  # replace "empty" cells with 0

  # Column slices dropped one after another. Each slice is taken against the
  # columns still present at that step, so the order of the steps matters.
  # removes "Rk, Age, G, GS, MP, FG, FGA, 3P, 3PA, 2P, 2PA, FT, FTA"
  # (the first step also takes the leading column that sits in front of "Rk").
  drop_steps = ((0, 2), (1, 5), (1, 3), (2, 4), (3, 5), (5, 7))
  for start, stop in drop_steps:
    frame = frame.drop(columns=frame.columns[start:stop])

  # Append the team labels as the last column (aligned on the row index).
  frame.insert(frame.shape[1], "Team", labels)
  return frame

def clean_data_season(filename):
  """Load the scraped schedule csv and keep only four of its columns.

  @args [filename] csv of raw dataframe of scraped data.
  @returns Clean dataframe with relevant features: the columns found at
           positions 2 to 5 of the csv, with empty cells replaced by 0.
  """
  frame = pd.read_csv(filename).fillna(0)  # replace "empty" cells with 0

  # removes "Date, Start(ET), Box Score, OT, Attend., Notes"
  # First strip the two leading columns, one at a time ...
  for _ in range(2):
    frame = frame.drop(columns=frame.columns[0])
  # ... then everything after the four columns that remain in front.
  return frame.drop(columns=frame.columns[4:])
  

In [None]:
# Player table: clean the scraped csv (rows are labelled from `teams`) and
# write the result next to it.
top_3_each_team_clean = clean_data_players(
    filename="top_3_each_team.csv",
    teamlist=teams,
)
top_3_each_team_clean.to_csv("top_3_each_team_clean.csv")

# Season schedule: clean the scraped csv and write the result next to it.
season_2020_sched_clean = clean_data_season(filename="2020_season_sched.csv")
season_2020_sched_clean.to_csv("season_2020_sched_clean.csv")