In [439]:
# import libraries
import numpy as np 
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd
import warnings
from datetime import date, timedelta 
from selenium import webdriver
import time 

# suprress warnings, I'm doing this so the notebook is more readable
warnings.filterwarnings("ignore")

## Part 1: Scraping the regular season scores from the NBA website for the 2018/19 season

The approach I took was to first google the start date and end date of the 2018/19 NBA season. The scores on the NBA website are organized by date which means that the web pages are indexed by date. For example, the results for the games played on each Oct 16th, 2018 (the first day of the season) are available on https://ca.global.nba.com/scores/#!/2018-10-16. Each game played that day has its a container and within that container are containers for the scores for each quarter and every overtime period. If there is no overtime after a quarter, the overtime container for that quarter is just empty. 

I stored these results into a dataframe. Each row in the dataframe corresponds to a single game. The columns of the dataframe are as follows: date, home team name, away team name, a column for points in each quarter for the home team, a column for points in each quarter for the away team,  a column for points in each overtime period for the home team,  a column for points in each overtime period for the away team, and a column each for the total scores of each team.

It might have bee less time consuming to use an API, but I wasn't aware of any that would let me make the requisite number of calls for free.



In [440]:
def format_game_data( date, game_data):
    """
    
    This function properly formats the inputted game data so it can be appended to a dataframe with columns = ["date","a_team", "h_team",
    "a_q1", "h_q1", "a_q2", "h_q2","a_q3", "h_q3", "a_q4", "h_q4", "a_ot1", "h_ot1","a_ot2", "h_ot2","a_ot3", "h_ot3","a_ot4", "h_ot4",
    "a_final","h_final"]. Effectively, the columns are: date, team names, firt quarter scores, second quarter scores, ..., first 
    over time scores, second over time schores, ..., and the final scores with the away teams attributes always appearing first.
    
    :param date: The date the given game was played at. Type: datetime.date.
    :param game_data: The raw scraped data from the NBA website structured as a list. Type: List.
    :return : A properly formatted list ready to insert into the main dataframe. Type: List.
    
    """

    formatted_game_data = [date]
    
    # the exhibitino games help during the regular season, world vs usa and lebron vs curry
    if len(game_data) == 16:
        formatted_game_data.extend([game_data[4],game_data[10], float(game_data[5]), float(game_data[11]),\
                                    float(game_data[6]), float(game_data[12]), float(game_data[7]),\
                                    float(game_data[13]), float(game_data[8]), float(game_data[14]),\
                                    np.NaN, np.NaN, np.NaN,np.NaN, np.NaN, np.NaN,\
                                    np.NaN, np.NaN, float(game_data[9]), float(game_data[15])])
    
    # regular season games with no over time periods
    if len(game_data) == 18: 
        formatted_game_data.extend([game_data[4],game_data[11], float(game_data[6]), float(game_data[13]),\
                                    float(game_data[7]), float(game_data[14]), float(game_data[8]),\
                                    float(game_data[15]), float(game_data[9]), float(game_data[16]),\
                                    np.NaN, np.NaN, np.NaN,np.NaN, np.NaN, np.NaN,\
                                    np.NaN, np.NaN, float(game_data[10]), float(game_data[17])])
    # regular season games with one over time period
    if len(game_data) == 21:
        formatted_game_data.extend([game_data[5],game_data[13], float(game_data[7]), float(game_data[15]),\
                                    float(game_data[8]), float(game_data[16]), float(game_data[9]),\
                                    float(game_data[17]), float(game_data[10]), float(game_data[18]),\
                                    float(game_data[11]), float(game_data[19]),np.NaN, np.NaN, np.NaN, np.NaN,\
                                    np.NaN, np.NaN, float(game_data[12]), float(game_data[20])])
    
    # regular season games with two over time periods
    if len(game_data) == 24:
        formatted_game_data.extend([game_data[6],game_data[15], float(game_data[8]), float(game_data[17]),\
                                    float(game_data[9]), float(game_data[18]), float(game_data[10]),\
                                    float(game_data[19]), float(game_data[11]), float(game_data[20]),\
                                    float(game_data[12]), float(game_data[21]), float(game_data[13]), float(game_data[22]), np.NaN, np.NaN,\
                                    np.NaN, np.NaN, float(game_data[14]), float(game_data[23])])
        
    # regular season games with three over time periods
    if len(game_data) == 27:
        formatted_game_data.extend([game_data[7],game_data[17], float(game_data[9]), float(game_data[19]),\
                                    float(game_data[10]), float(game_data[20]), float(game_data[11]),\
                                    float(game_data[21]), float(game_data[12]), float(game_data[22]),\
                                    float(game_data[13]), float(game_data[23]), float(game_data[14]), float(game_data[24]), float(game_data[15]), float(game_data[25]),\
                                    np.NaN, np.NaN, float(game_data[16]), float(game_data[26])])
    
    # regular season games with four over time periods
    if len(game_data) == 30:
        formatted_game_data.extend([game_data[8],game_data[19], float(game_data[10]), float(game_data[21]),\
                                    float(game_data[11]), float(game_data[22]), float(game_data[12]),\
                                    float(game_data[23]), float(game_data[13]), float(game_data[24]),\
                                    float(game_data[14]), float(game_data[25]), float(game_data[15]), float(game_data[26]), float(game_data[16]), float(game_data[27]),\
                                    float(game_data[17]), float(game_data[28]), float(game_data[18]), float(game_data[29])])
    
    
    return formatted_game_data

The function above correctly formats the scraped web data. The script below does the scraping. It takes a while to run because it loads each game day page in turn. 

In [442]:
# The dates for the beginning and ending ot the 2018/19 season.
start_date = date(2018, 10, 16) # Oct 16th, 2018
end_date = date(2019, 4, 10) # April 10th, 2019
delta = timedelta(days=1) # this is set to one since the NBA scores pages are indexed by day


# Open firefox and navigate to web page
driver = webdriver.Firefox()
base_url = "https://ca.global.nba.com/scores/#!/"

# Extract data
columns = ["date","a_team", "h_team", "a_q1", "h_q1", "a_q2", "h_q2",\
           "a_q3", "h_q3", "a_q4", "h_q4", "a_ot1", "h_ot1",\
           "a_ot2", "h_ot2","a_ot3", "h_ot3","a_ot4", "h_ot4",\
           "a_final","h_final"] 
scores = pd.DataFrame(columns = columns)


# This loop iterates through each day in the season, and appends the games from that day to the master dataframe 
while start_date <= end_date:
    # updates the url
    driver.get(base_url+start_date.strftime("%Y-%m-%d")) 
    
    # includes a 10 second delay to allow the page to fully load
    time.sleep(10)
    
    # extracts the scores from the current page
    data = driver.find_elements_by_xpath('//table[@class="final-game-table"]') 
    
    # append the game data from current page to the master dataframe
    for i in range(len(data)):  
        scores = scores.append(dict(zip(columns, format_game_data(start_date,data[i].text.split()))),ignore_index=True)
    
    # increment the current date by one day
    start_date += delta
    
    
driver.close()

In [445]:
scores

Unnamed: 0,date,a_team,h_team,a_q1,h_q1,a_q2,h_q2,a_q3,h_q3,a_q4,...,a_ot1,h_ot1,a_ot2,h_ot2,a_ot3,h_ot3,a_ot4,h_ot4,a_final,h_final
0,2018-10-16,PHI,BOS,21.0,21.0,21.0,26.0,24.0,30.0,21.0,...,,,,,,,,,87.0,105.0
1,2018-10-16,OKC,GSW,23.0,31.0,24.0,26.0,32.0,26.0,21.0,...,,,,,,,,,100.0,108.0
2,2018-10-17,MIL,CHA,36.0,23.0,31.0,31.0,26.0,29.0,20.0,...,,,,,,,,,113.0,112.0
3,2018-10-17,BKN,DET,29.0,24.0,22.0,27.0,25.0,32.0,24.0,...,,,,,,,,,100.0,103.0
4,2018-10-17,MEM,IND,16.0,27.0,23.0,29.0,19.0,20.0,25.0,...,,,,,,,,,83.0,111.0
5,2018-10-17,MIA,ORL,31.0,25.0,20.0,29.0,27.0,25.0,23.0,...,,,,,,,,,101.0,104.0
6,2018-10-17,ATL,NYK,24.0,23.0,25.0,49.0,35.0,34.0,23.0,...,,,,,,,,,107.0,126.0
7,2018-10-17,CLE,TOR,25.0,28.0,22.0,32.0,28.0,30.0,29.0,...,,,,,,,,,104.0,116.0
8,2018-10-17,NOP,HOU,35.0,29.0,36.0,25.0,30.0,30.0,30.0,...,,,,,,,,,131.0,112.0
9,2018-10-17,MIN,SAS,23.0,31.0,29.0,25.0,31.0,25.0,25.0,...,,,,,,,,,108.0,112.0


As shown above, there are 1232 games played during the regular season including the Lebron vs Curry and USA vs World exhibition games. The statement in the following cell shows that there was data recorded for every quarter indicating that my webscraper did not miss anything. 

In [463]:
scores.iloc[:,3:11].isnull().any()

a_q1    False
h_q1    False
a_q2    False
h_q2    False
a_q3    False
h_q3    False
a_q4    False
h_q4    False
dtype: bool

In [503]:
scores.iloc[[866, 0, 45, 142, 479,929],1:21]

Unnamed: 0,a_team,h_team,a_q1,h_q1,a_q2,h_q2,a_q3,h_q3,a_q4,h_q4,a_ot1,h_ot1,a_ot2,h_ot2,a_ot3,h_ot3,a_ot4,h_ot4,a_final,h_final
866,WLD,USA,30.0,40.0,41.0,43.0,46.0,40.0,27.0,38.0,,,,,,,,,144.0,161.0
0,PHI,BOS,21.0,21.0,21.0,26.0,24.0,30.0,21.0,28.0,,,,,,,,,87.0,105.0
45,WAS,POR,28.0,32.0,34.0,30.0,16.0,15.0,33.0,34.0,14.0,13.0,,,,,,,125.0,124.0
142,CHI,NYK,24.0,21.0,23.0,26.0,31.0,25.0,24.0,30.0,6.0,6.0,8.0,7.0,,,,,116.0,115.0
479,PHX,WAS,29.0,33.0,26.0,22.0,24.0,28.0,30.0,26.0,13.0,13.0,10.0,10.0,14.0,17.0,,,146.0,149.0
929,CHI,ATL,26.0,33.0,31.0,27.0,38.0,24.0,29.0,40.0,16.0,16.0,7.0,7.0,8.0,8.0,13.0,6.0,168.0,161.0


These were five test cases I used to ensure that my data from the webscrape was accurate. I cross referenced these scraped results againt the NBA website.

## Part 2: Creating additional period data

In this section, I add columns for the following to the original scores dataframe: first half score, second half score including overtime, overtime scores, and regulation time excluding overtime. The full time scores including overtime were part of the data I scraped. I saved this dataframe in a separate variable.  

In [481]:
# creating a new scores dataframe
modified_scores = scores
modified_scores = scores.fillna(0)

# first half scores
modified_scores["a_half1"] = modified_scores["a_q1"]+ modified_scores["a_q2"] 
modified_scores["h_half1"] = modified_scores["h_q1"]+ modified_scores["h_q2"] 

# second half scores plus overtime
modified_scores["a_half2"] = modified_scores["a_q3"] + modified_scores["a_q4"]\
                            + modified_scores["a_ot1"] + modified_scores["a_ot2"]\
                            + modified_scores["a_ot3"] + modified_scores["a_ot4"]
modified_scores["h_half2"] = modified_scores["h_q3"] + modified_scores["h_q4"]\
                            + modified_scores["h_ot1"] + modified_scores["h_ot2"]\
                            + modified_scores["h_ot3"] + modified_scores["h_ot4"]

# total overtime scores
modified_scores["a_overtime"] =  modified_scores["a_ot1"] + modified_scores["a_ot2"] + modified_scores["a_ot3"] + modified_scores["a_ot4"]
modified_scores["h_overtime"] =  modified_scores["h_ot1"] + modified_scores["h_ot2"] + modified_scores["h_ot3"] + modified_scores["h_ot4"]

# regulation time scores
modified_scores["a_regulation"] = modified_scores["a_q1"] + modified_scores["a_q2"] + modified_scores["a_q3"]+ modified_scores["a_q4"] 
modified_scores["h_regulation"] = modified_scores["h_q1"] + modified_scores["h_q2"] + modified_scores["h_q3"]+ modified_scores["h_q4"] 



In [489]:
modified_scores

Unnamed: 0,date,a_team,h_team,a_q1,h_q1,a_q2,h_q2,a_q3,h_q3,a_q4,...,a_final,h_final,a_half1,h_half1,a_half2,h_half2,a_overtime,h_overtime,a_regulation,h_regulation
0,2018-10-16,PHI,BOS,21.0,21.0,21.0,26.0,24.0,30.0,21.0,...,87.0,105.0,42.0,47.0,45.0,58.0,0.0,0.0,87.0,105.0
1,2018-10-16,OKC,GSW,23.0,31.0,24.0,26.0,32.0,26.0,21.0,...,100.0,108.0,47.0,57.0,53.0,51.0,0.0,0.0,100.0,108.0
2,2018-10-17,MIL,CHA,36.0,23.0,31.0,31.0,26.0,29.0,20.0,...,113.0,112.0,67.0,54.0,46.0,58.0,0.0,0.0,113.0,112.0
3,2018-10-17,BKN,DET,29.0,24.0,22.0,27.0,25.0,32.0,24.0,...,100.0,103.0,51.0,51.0,49.0,52.0,0.0,0.0,100.0,103.0
4,2018-10-17,MEM,IND,16.0,27.0,23.0,29.0,19.0,20.0,25.0,...,83.0,111.0,39.0,56.0,44.0,55.0,0.0,0.0,83.0,111.0
5,2018-10-17,MIA,ORL,31.0,25.0,20.0,29.0,27.0,25.0,23.0,...,101.0,104.0,51.0,54.0,50.0,50.0,0.0,0.0,101.0,104.0
6,2018-10-17,ATL,NYK,24.0,23.0,25.0,49.0,35.0,34.0,23.0,...,107.0,126.0,49.0,72.0,58.0,54.0,0.0,0.0,107.0,126.0
7,2018-10-17,CLE,TOR,25.0,28.0,22.0,32.0,28.0,30.0,29.0,...,104.0,116.0,47.0,60.0,57.0,56.0,0.0,0.0,104.0,116.0
8,2018-10-17,NOP,HOU,35.0,29.0,36.0,25.0,30.0,30.0,30.0,...,131.0,112.0,71.0,54.0,60.0,58.0,0.0,0.0,131.0,112.0
9,2018-10-17,MIN,SAS,23.0,31.0,29.0,25.0,31.0,25.0,25.0,...,108.0,112.0,52.0,56.0,56.0,56.0,0.0,0.0,108.0,112.0


## Part 3: Creating factors to express how teams compare to the league mean per period

Let $\mu_{league,p}$ be the league average point tally for some period $p$ calculated over all 1230 games played that season. To calculate the average difference, $d_{p}$, between a given team and the league average over $n=82$ league games for that period, I do the following: $$d_{p} = \sum_{i=1}^{n} \frac{x_{i,p}-\mu_{league,p}}{n}   $$.

where $x_{i,p}$ is the team's point tally for game $i$ of period $p$. For example, $x_{2,Q1}-\mu_{league,Q1}$ is the difference between the team's point tally in the first quarter of the second game of the season and the league average point tally for the first quarter. I sum up the differences for all 82 games and then divide by the total number of games.

To allow for comparisons across categories and period, I normalize $d_{p}$. Let $f_{p}$ be the normalized $d_{p}$. Then $$ f_{p} = \frac{d_{p}}{\mu_{league,p}} $$.

The reason I chose this over a variant of the standard deviation is because the average difference has a sign that indicates whether the team does better or worse on average rather than just giving an absolute,signless measure of the spread.


In [484]:
# the periods of interest
mean_columns = ["q1", "q2", "q3", "q4", "first_half", "overtime", "second_half", "regulation", "full" ]

# the means data
league_means = pd.DataFrame(columns=mean_columns)

# compute each mean in turn
q1_mean = modified_scores["a_q1"].mean()*0.5 + modified_scores["a_q1"].mean()*0.5
q2_mean = modified_scores["a_q2"].mean()*0.5 + modified_scores["a_q2"].mean()*0.5
q3_mean = modified_scores["a_q3"].mean()*0.5 + modified_scores["a_q3"].mean()*0.5
q4_mean = modified_scores["a_q4"].mean()*0.5 + modified_scores["a_q4"].mean()*0.5
first_half_mean = q1_mean + q2_mean 
regulation_mean = q1_mean + q2_mean + q3_mean + q4_mean
full_mean = modified_scores["a_final"].mean()*0.5+modified_scores["h_final"].mean()*0.5
second_half_mean = full_mean- first_half_mean
overtime_mean = full_mean - regulation_mean

# present as a data frame
league_means = league_means.append(dict(zip(mean_columns,[q1_mean,q2_mean,q3_mean,q4_mean,first_half_mean,overtime_mean,second_half_mean,regulation_mean,full_mean])),ignore_index=True)

This next cell contains the means for the desired periods. Note that the second half mean is inclusive of overtime and that the overtime mean is calculated over all games- not just the games that actually had overtime periods. 

In [485]:
league_means

Unnamed: 0,q1,q2,q3,q4,first_half,overtime,second_half,regulation,full
0,27.592532,27.600649,27.453734,26.576299,55.193182,2.06737,56.097403,109.223214,111.290584


The next cell computes the factor I described earlier. Using the methods that are already part of the pandas library, the function is quite simple to write. The script that follows calls this function for every team and for every category. 

In [486]:
def diff_factor(series, mean):
    """
    This factor computes the average difference between the numbers in the 
    input series and the mean parameter. The average difference is normalized by 
    dividing by the mean parameter
    
    :param series: A series that a difference is required for. Type: Pandas.Series.
    :param mean: The mean to compare to. Type: float.
    :return difference factor. Type: float.
    
    """
    diff = series-mean
    
    return diff.mean()/mean

In [487]:
team_columns = ["team","q1", "q2", "q3", "q4", "first_half", "overtime", "second_half", "regulation", "full" ]
team_factors = pd.DataFrame(columns = team_columns)


for team in modified_scores.a_team.unique():
    # initialize list to append to dataframe
    team_data = [team]
    
    # create mean for each period and append them to the team_data list
    for period_a, period_h, mean in zip(["a_q1","a_q2","a_q3","a_q4","a_half1","a_overtime","a_half2","a_regulation","a_final"],\
                              ["h_q1","h_q2","h_q3","h_q4","h_half1","h_overtime","h_half2","h_regulation","h_final"], \
                              league_means.loc[0]):
        period_q = diff_factor(modified_scores.loc[modified_scores["a_team"]==team][period_a], mean)
        period_q = diff_factor(modified_scores.loc[modified_scores["h_team"]==team][period_h], mean)
        team_data.append(period_q*0.5+period_q*0.5)

    # append list to dataframe
    team_factors = team_factors.append(dict(zip(team_columns,team_data)), ignore_index=True)
    
    

In [494]:
team_factors.round(3)

Unnamed: 0,team,q1,q2,q3,q4,first_half,overtime,second_half,regulation,full
0,PHI,0.138,0.09,0.008,0.082,0.114,-0.835,0.012,0.079,0.062
1,OKC,0.032,0.02,0.053,0.063,0.026,-0.658,0.031,0.042,0.029
2,MIL,0.082,0.13,0.093,0.084,0.106,-1.0,0.048,0.097,0.077
3,BKN,-0.007,0.044,0.079,0.006,0.018,-0.387,0.027,0.031,0.023
4,MEM,-0.049,0.007,-0.096,-0.064,-0.021,-0.575,-0.098,-0.05,-0.06
5,MIA,-0.005,-0.027,-0.033,-0.054,-0.016,-1.0,-0.078,-0.029,-0.047
6,ATL,0.035,0.094,-0.019,0.074,0.065,-0.245,0.017,0.046,0.04
7,CLE,-0.011,-0.047,-0.075,-0.043,-0.029,-0.646,-0.081,-0.044,-0.055
8,NOP,-0.0,0.087,0.043,0.075,0.043,-0.835,0.026,0.051,0.034
9,MIN,0.006,0.066,0.079,0.002,0.036,-0.422,0.024,0.039,0.03


The 30 NBA teams are accounted for in the table above. 
