In [1]:
import numpy as np
import pandas as pd
from scipy.integrate import odeint
from scipy.special import expit

# Function declarations

In [2]:
def loadGameData(initialSeason,finalSeason):
    '''
    loadGameData reads the game table of every requested season from disk and stacks
    them into a single table with columns Season, Visitor/Neutral, Home/Neutral, VisitorWin.

    For each season the file 'pyData/games<season>.h5' (key 'table') is read, and its
    Season column is overwritten with a label of the form '2001-02'.

    Inputs:
    initialSeason    :    first season to include, named by the second calendar year of the season (int)
    finalSeason      :    last season to include (inclusive) (int)

    Outputs:
    dataset          :    pandas dataframe of format described above; the row index of each
                          season table is kept as read, so index labels may repeat across seasons
    '''
    keptColumns = ['Season','Visitor/Neutral','Home/Neutral','VisitorWin']
    perSeason = [] # one table per season, in chronological order

    for endYear in range(initialSeason,finalSeason+1):
        print('Dowloading',endYear,'season data...')
        games = pd.read_hdf('pyData/games'+str(endYear)+'.h5','table')
        # keep only the columns of interest (reindex creates any missing one, filled with NaN)
        games = games.reindex(columns=keptColumns)
        # label the season in the same way as the PCA data, e.g. 2002 -> '2001-02'
        games['Season'] = '-'.join([str(endYear-1),str(endYear)[2:4]])
        perSeason.append(games)
        print('##################################')

    return pd.concat(perSeason)

def generatePCAVectors(dataFile,seasonToExclude,nComponents=7):
    '''
    generatePCAVectors creates the PCA vectors for a subset of the team season-average stat data.

    Inputs:
    dataFile - name of the file containing every team's season-average stat data
    seasonToExclude - value of the 'Season' column whose rows are removed from the data set
    nComponents - number of leading PCA vectors to return (default 7); if the data has
                  fewer stat categories than this, all vectors are returned

    Outputs:
    statMean - mean of each statistical category (columns 'FG' through 'oppPTS') over the retained rows
    topVectors - top nComponents covariance eigenvectors
                 (rows - stat category; columns - index in decreasing eigenvalue order).
                 The overall sign of each column is arbitrary.

    Raises:
    ValueError - if no rows are left once seasonToExclude has been removed
    '''
    # load data
    dfTeamData = pd.read_hdf(dataFile)
    dfTeamData = dfTeamData[dfTeamData['Season'] != seasonToExclude]

    teamDataMat = (dfTeamData.loc[:,'FG':'oppPTS']).to_numpy()
    N = teamDataMat.shape[0]
    if N == 0:
        # without this guard the 1/N below divides by zero and the failure surfaces much later
        raise ValueError('no team data left after excluding season '+repr(seasonToExclude))

    # compute PCA vectors
    statMean = np.mean(teamDataMat,axis=0)
    teamDataZero = teamDataMat - statMean # mean subtraction
    covMat = 1/N*np.dot(teamDataZero.T,teamDataZero) # covariance matrix

    # The covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues
    # (returned in ascending order) and orthonormal eigenvectors, whereas the general
    # solver eig may return complex or non-orthogonal vectors for degenerate eigenvalues.
    _,covVec = np.linalg.eigh(covMat)
    topVectors = covVec[:,::-1][:,:nComponents] # reorder to decreasing eigenvalue, keep the leading ones

    # return means of each stat category and covariance eigenvectors
    return statMean,topVectors

def generateInputOutputData(statMean,PCABasis,dataset,statDataFile):
    '''
    generateInputOutputData converts tables of NBA game outcomes into a NumPy matrix giving the PCA components of each
    team and the outcome of the game as a 1 (visitor win) or 0.

    Inputs:
    statMean - mean of each team season average statistical category
    PCABasis - matrix whose columns are the PCA basis vectors
    dataset - table (pd.DataFrame) of matchups and game outcomes, with columns
              Season, Visitor/Neutral, Home/Neutral, VisitorWin
    statDataFile - name of file containing team season average stats

    Outputs:
    trainingData - matrix where each row is one game, and if n is number of PCA basis vectors, then
                    - first n columns are visiting team's PCA components,
                    - second n columns are home team's PCA components,
                    - final column is 1.0 if visiting team won; 0 otherwise.
                   An empty dataset gives a matrix with zero rows.

    Raises:
    KeyError - if a (Season, team) pair of dataset has no row in the team stat data
    '''
    # --- build the lookup (Season, Tm) -> PCA components ---
    dfTeamData = pd.read_hdf(statDataFile) # load team stat data
    # convert CHH to CHO for (first year of) season <= 2001
    # (presumably so the abbreviation matches the one used in the game tables - confirm)
    seasonInts = dfTeamData['Season'].map(lambda x : int(x[0:4]))
    isEarlyCHH = (seasonInts <= 2001) & (dfTeamData['Tm'] == 'CHH')
    dfTeamData.loc[isEarlyCHH,'Tm'] = 'CHO'

    teamDataMat = (dfTeamData.loc[:,'FG':'oppPTS']).to_numpy()
    teamPCA = np.dot(teamDataMat - statMean,PCABasis) # one row of PCA components per team-season
    teamDataDict = dict(zip(zip(dfTeamData['Season'],dfTeamData['Tm']),teamPCA))

    def _teamComponents(teamColumn):
        '''PCA components, one row per game, of the team named in column teamColumn of dataset.'''
        gameKeys = list(zip(dataset['Season'],dataset[teamColumn]))
        # fail loudly and name the offenders, rather than letting an unknown team surface
        # later as an unrelated shape error
        missing = [k for k in dict.fromkeys(gameKeys) if k not in teamDataDict]
        if missing:
            raise KeyError('no team stat data for (Season, team): '
                           +', '.join(map(str,missing[:5]))
                           +(' ...' if len(missing) > 5 else ''))
        rows = np.array([teamDataDict[k] for k in gameKeys])
        # the reshape only matters for an empty dataset, where it restores the component axis
        return rows.reshape((len(gameKeys),)+teamPCA.shape[1:])

    # --- assemble one row per game: visitor components, home components, outcome ---
    x_aPCA = _teamComponents('Visitor/Neutral')
    x_bPCA = _teamComponents('Home/Neutral')
    y = dataset['VisitorWin'].map(float).to_numpy(dtype=float) # 1.0 if visitor won, 0.0 otherwise
    trainingData = np.column_stack((x_aPCA,x_bPCA,y))

    return trainingData

def sigma(a):
    '''
    sigmoid function 1/(1+exp(-a)), applied elementwise.

    Evaluated with scipy.special.expit, which stays finite and warning-free for
    large |a|; the direct formula overflows in exp(-a) for strongly negative a.
    '''
    return expit(a)

def wVel(w,t,x,y):
    '''
    wVel is the right-hand side of the gradient-descent ODE dw/dt = -dE/dw of the
    logistic model (E is the error function), in the (state, time, extra args)
    form in which odeint calls it.

    Inputs:
    w - array of current values of logistic model parameters
    t - current integration time (not used: the velocity depends on w only)
    x - matrix of training data predictors. each row is a different data point;
        assume first column is all ones, remaining columns are values of predictor variables
    y - array of training data outcomes

    Outputs:
    -dE/dw - velocity of parameters, x^T (y - sigma(x w))
    '''
    predicted = sigma(np.dot(x,w)) # model output for each data point
    return np.dot(x.T,y-predicted)

def logisticInt(w0,T,x,y,nSteps=100):
    '''
    logisticInt performs gradient descent (dw/dt = -dE/dw) on the logistic regression model
    by integrating wVel with odeint.

    Inputs:
    w0 - initial set of parameters of the model
    T - total time to integrate for
    x - set of predictor data (each row is a different data point, first column is ones)
    y - set of outcome data
    nSteps - number of equal output intervals between 0 and T (default 100)

    Outputs:
    w - final parameters after integration
    dEdw - gradient of error function at the end of integration
    Et - value of error function at each of the times in tt
    tt - time steps (nSteps+1 equally spaced values from 0 to T)
    '''
    # perform gradient descent
    tt = np.linspace(0,T,nSteps+1)
    wt = odeint(wVel,w0,tt,(x,y)) # one row of parameters per time step

    # gather observables
    w = wt[-1]
    dEdw = -wVel(w,0,x,y)

    # calculation of error as function of time:
    #   E = -sum_i [ y_i log(sigma(a_i)) + (1-y_i) log(1-sigma(a_i)) ],  a = x w.
    # Written with log(sigma(a)) = -logaddexp(0,-a) and log(1-sigma(a)) = -logaddexp(0,a),
    # so a saturated sigmoid cannot produce log(0) = -inf and hence nan in Et.
    logits = wt @ x.T
    Et = np.dot(np.logaddexp(0.,-logits),y) + np.dot(np.logaddexp(0.,logits),1.-y)

    return w,dEdw,Et,tt


# Train Logistic regression model on 2001-02 to 2020-21 season data

Run <code>loadGameData</code> to extract a table with all the training data, 2001-02 to 2020-21 seasons.

In [3]:
# seasons are named by their second calendar year: 2002 -> '2001-02', ..., 2021 -> '2020-21'
dataset = loadGameData(2002,2021)

Dowloading 2002 season data...
##################################
Dowloading 2003 season data...
##################################
Dowloading 2004 season data...
##################################
Dowloading 2005 season data...
##################################
Dowloading 2006 season data...
##################################
Dowloading 2007 season data...
##################################
Dowloading 2008 season data...
##################################
Dowloading 2009 season data...
##################################
Dowloading 2010 season data...
##################################
Dowloading 2011 season data...
##################################
Dowloading 2012 season data...
##################################
Dowloading 2013 season data...
##################################
Dowloading 2014 season data...
##################################
Dowloading 2015 season data...
##################################
Dowloading 2016 season data...
##################################
Dowloading

Create PCA vectors for the teams in these seasons.

In [4]:
# rows whose Season is '2000-01' are dropped from the stat table before the PCA is computed
statMean,topV = generatePCAVectors('pyData/regSeasonData.h5','2000-01')

Generate training data.

In [5]:
# one row per game: visitor PCA components, home PCA components, then 1.0/0.0 for visitor win/loss
trainingData = generateInputOutputData(statMean,topV,dataset,'pyData/regSeasonData.h5')

Train the model.

In [6]:
n = len(trainingData) # # of data points (one row per game)
x = np.column_stack((np.ones(n),trainingData[:,:-1])) # predictors, preceded by a column of ones
y = trainingData[:,-1] # outcomes (last column)
nw = x.shape[1] # # of model parameters

w0 = np.zeros(nw) # initial condition: all parameters zero
T = 1 # integration time

w,dEdw,Et,tt = logisticInt(w0,T,x,y)

Print model coefficients.

In [11]:
# w[0] multiplies the column of ones; the remaining entries multiply the visitor
# PCA components followed by the home PCA components
w

array([-0.3840704 , -0.01434259, -0.03438915,  0.058654  , -0.00311654,
       -0.11415171,  0.02359393, -0.01524577,  0.01094597,  0.02315544,
       -0.06196125,  0.00262423,  0.12908761, -0.02927948,  0.01771595])

Print statMean.

In [12]:
# one mean per stat column, 'FG' through 'oppPTS'
statMean

array([ 37.67788945,  82.94857621,   0.45416248,   7.6681742 ,
        21.46046901,   0.35549079,  30.01524288,  61.48693467,
         0.48955276,  18.09296482,  23.83366834,   0.75986767,
        10.97889447,  31.65527638,  42.63031826,  22.12512563,
         7.58308208,   4.88425461,  14.41055276,  20.84706868,
       101.12026801,  37.68492462,  82.94907873,   0.4541474 ,
         7.66834171,  21.45862647,   0.35648409,  30.01490787,
        61.48944724,   0.48927973,  18.09564489,  23.83433836,
         0.75940704,  10.97554439,  31.65661642,  42.63350084,
        22.12579564,   7.58442211,   4.88509213,  14.40871022,
        20.84572864, 101.12713568])

Print PCA basis.

In [13]:
# rows - stat category; columns - PCA vectors in decreasing eigenvalue order
topV

array([[-1.25891739e-01,  1.17113673e-01,  4.16749492e-02,
        -5.67605167e-02, -2.09278425e-01,  1.55117145e-01,
        -1.67320966e-01],
       [-2.11566440e-01,  2.07785347e-01, -4.96418860e-02,
        -2.08234093e-01,  3.72990567e-03,  3.01820692e-01,
         2.32981406e-01],
       [-3.52156558e-04,  2.73363084e-04,  7.79591334e-04,
         4.71495547e-04, -2.53032510e-03,  2.02950153e-04,
        -3.32325960e-03],
       [-1.66281818e-01, -1.16306862e-01,  1.21650035e-01,
         1.21116268e-03,  3.46076588e-02,  9.42851387e-02,
        -2.15439778e-02],
       [-4.50657361e-01, -3.00713379e-01,  2.60249293e-01,
         1.17175352e-03,  1.85914548e-01,  1.84664250e-01,
         1.13299422e-01],
       [-2.95931858e-04, -4.20821357e-04,  1.47087155e-03,
         6.03016896e-05, -1.21986296e-03,  1.41160637e-03,
        -2.89117494e-03],
       [ 4.05540896e-02,  2.33413554e-01, -8.01139139e-02,
        -5.82855381e-02, -2.44382843e-01,  6.14080767e-02,
        -1.4757018