## Code to pull game data
This script pulls NBA game data for each season by scraping basketball-reference. The page that lists all of a season's games is at https://www.basketball-reference.com/leagues/NBA_YYYY_games.html, where YYYY is the year of the season. A season is denoted by the year it ends in rather than the year it begins in.

In [39]:
!pip install --upgrade pip
!pip install lxml

Defaulting to user installation because normal site-packages is not writeable
Requirement already up-to-date: pip in /srv/DFSc/cs-teaching/home/u1/pmcwhann/.local/lib/python3.6/site-packages (20.2.4)
Defaulting to user installation because normal site-packages is not writeable


In [112]:
import requests
from bs4 import BeautifulSoup
from lxml import html

def create_list_boxscores(url, output):
    """
    Collect the boxscore urls listed on one monthly schedule page.

    @input url: url of a specific month of games.
    @input output: list that the absolute boxscore urls are appended to.

    @return void: nothing is returned; the list passed in as `output`
                  is extended in place.
    """
    # Prefix that is prepended to every href taken from the page.
    site_root = "https://www.basketball-reference.com"

    response = requests.get(url)
    document = html.fromstring(response.content)

    # Links inside the cells tagged data-stat="box_score_text" of the
    # table with id "schedule".
    relative_links = document.xpath(
        '//*[@id="schedule"]/tbody/tr/td[@data-stat="box_score_text"]/a/@href'
    )

    output.extend(site_root + href for href in relative_links)
        
def get_nba_year_urls(year_start=1956, year_end=2020):
    """
    Build the season schedule url for every season in a range.

    Each url has the form
    https://www.basketball-reference.com/leagues/NBA_YYYY_games.html
    (for example NBA_2018_games.html for the 2018 season).

    @input year_start: first season year to include.
    @input year_end: last season year to include (inclusive).

    @return list: one url per year, in ascending order; empty when
                  year_start is greater than year_end.
    """
    url_template = "https://www.basketball-reference.com/leagues/NBA_{}_games.html"
    return [url_template.format(year) for year in range(year_start, year_end + 1)]

def get_monthly_nba_from_year(nba_year_url):
    """
    Fetch a season schedule page and return the links in its filter section.

    @input nba_year_url: url of a season's schedule page.

    @return list: href values of the links found under elements with
                  class "filter", exactly as they appear in the page.
    """
    response = requests.get(nba_year_url)
    document = html.fromstring(response.content)
    return document.xpath('//*[@class="filter"]/div/a/@href')

def get_monthly_nba_links(year_start=1956, year_end=2020):
    """
    Gather the monthly schedule links for every season in a range.

    @input year_start: first season year to include.
    @input year_end: last season year to include (inclusive).

    @return list: the links returned by get_monthly_nba_from_year for each
                  season, concatenated in season order.
    """
    monthly_urls = []
    for season_url in get_nba_year_urls(year_start, year_end):
        # Extend in place; rebuilding the list with `+` would copy every
        # previously collected link again for each season.
        monthly_urls.extend(get_monthly_nba_from_year(season_url))
    return monthly_urls

def get_all_boxscores(output_path='box_score_links.txt'):
    '''
    Get all boxscore links and write them to a text file, one per line.

    The monthly schedule links for the default season range are collected
    with get_monthly_nba_links(); each one is prefixed with the site root
    and passed to create_list_boxscores, which appends the boxscore urls
    it finds.

    @input output_path: file the links are written to; it is opened in
                        'w' mode, so an existing file is overwritten.

    @return void: the result is the file written to output_path.
    '''
    # Root to box_scores
    base_path = "https://www.basketball-reference.com"

    # boxscore links, filled in by create_list_boxscores
    box_links = []
    for month_link in get_monthly_nba_links():
        create_list_boxscores(base_path + month_link, box_links)

    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(output_path, 'w') as f:
        f.writelines("%s\n" % item for item in box_links)
    

In [113]:
# Run the full scrape over the default season range. This issues network
# requests to basketball-reference and writes box_score_links.txt to the
# current working directory.
get_all_boxscores()

In [None]:
# GAMES
# STATS table per file
# metadata: playoff/reg?,teams names, date,
# shot-chart -> Whether shot will go in or not. (player meta, time of game, other team meta, position of shot, time period)
# play-by-play

In [None]:
# PLAYERS
# 