In [1]:
# third party libraries
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid
import numpy as np
import pandas as pd
import pickle
import heapq

# personal libraries
from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.dataPrep.importData import *
import mrsc.src.utils as utils

# dennis libraries
from mrsc.src.dataPrep import plotData, annualData
from mrsc.src.predictions import annualSLA, predictionMethods

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
""" Compute mean absolute error (l1-norm) """
def mae(pred, true):
    """Return the mean absolute error between predictions and ground truth.

    Parameters
    ----------
    pred, true : array-like objects supporting ``pred - true``
        Predicted and observed values.

    Returns
    -------
    The result of ``np.mean`` applied to the absolute differences.
    """
    absolute_residuals = np.abs(pred - true)
    return np.mean(absolute_residuals)

def convertMetrics(df, metrics):
    """Divide the given metric columns by games played ('G').

    The input frame is left untouched: the division is applied to a copy,
    so passing a filtered slice of another frame neither triggers a
    chained-assignment warning nor alters the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'G' column and every column named in `metrics`.
    metrics : iterable of str
        Names of the columns to divide by 'G'.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which each listed column holds ``column / G``.
    """
    converted = df.copy()
    # Read the divisor once so every metric is scaled by the original 'G'.
    gamesPlayed = converted['G']
    for metric in metrics:
        converted[metric] = converted[metric] / gamesPlayed
    return converted

def getParamDicts(paramDict, infoDict, featureTypes, labelType):
    """Translate one flat hyperparameter setting into the four SLA dictionaries.

    Parameters
    ----------
    paramDict : dict
        Must contain 'statsWindow', 'statsCom', 'teamWindow', 'teamCom',
        'n', 'rank' and 'project'.
    infoDict : dict
        Updated in place with a 'buffer' entry and returned.
    featureTypes : iterable of str
        Any of 'std', 'mean', 'ewm', 'teammates'; other values are ignored.
    labelType : str
        'mean' or 'ewm'; anything else produces ``{'none': {}}``.

    Returns
    -------
    tuple
        ``(infoDict, featuresDict, labelsDict, modelDict)``.
    """
    # Pull every hyperparameter up front so a missing key fails immediately.
    statsWindow, statsCom, teamWindow, teamCom, n, rank, project = (
        paramDict[key]
        for key in ('statsWindow', 'statsCom', 'teamWindow', 'teamCom',
                    'n', 'rank', 'project')
    )

    # Settings for each supported feature type.
    featureSettings = {
        'std': {'window': statsWindow},
        'mean': {'window': statsWindow},
        'ewm': {'window': statsWindow, 'com': statsCom},
        'teammates': {'window': teamWindow, 'n': n, 'com': teamCom},
    }
    featuresDict = {}
    for feature in featureTypes:
        if feature in featureSettings:
            featuresDict[feature] = featureSettings[feature]

    # Settings for each supported label type; unknown types mean "no label transform".
    labelSettings = {
        'mean': {'window': statsWindow},
        'ewm': {'window': statsWindow, 'com': statsCom},
    }
    if labelType in labelSettings:
        labelsDict = {labelType: labelSettings[labelType]}
    else:
        labelsDict = {'none': {}}

    modelDict = dict(rank=rank, project=project)

    # The buffer is the larger of the two windows.
    infoDict['buffer'] = np.max([statsWindow, teamWindow])

    return infoDict, featuresDict, labelsDict, modelDict

def getPlayerDF(player, value, metric='PTS'):
    """Wrap a single value in a 1x1 frame labelled by player and metric.

    Parameters
    ----------
    player : label used as the only column name.
    value : anything ``pd.DataFrame`` accepts that yields exactly one row
        and one column (both axes are assigned one-element labels).
    metric : label used as the only index entry (default 'PTS').

    Returns
    -------
    pandas.DataFrame
        Frame with column ``player`` and index ``metric``.
    """
    frame = pd.DataFrame(value)
    # columns first, then index - same order as the labels are validated
    frame.columns, frame.index = [player], [metric]
    return frame

def getErrorDF(dfPred, dfTrue):
    """Build a per-player table of true value, prediction and signed error.

    Both inputs must be single-row frames with matching labels (one column
    per player), because each is relabelled with a one-element index.
    The inputs themselves are not modified: the relabelling is applied to
    copies, so callers can keep concatenating onto the frames they passed.

    Parameters
    ----------
    dfPred : pandas.DataFrame
        Predicted values.
    dfTrue : pandas.DataFrame
        Observed values.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with columns 'True', 'Pred' and 'Error'
        (``dfPred - dfTrue``), rounded to two decimals.
    """
    # Compute the error while both frames still share their original index.
    errorRow = dfPred - dfTrue
    trueRow = dfTrue.copy()
    predRow = dfPred.copy()

    trueRow.index = ['True']
    predRow.index = ['Pred']
    errorRow.index = ['Error']
    return pd.concat([trueRow, predRow, errorRow]).T.round(2)

# Adding 2018 Data

In [3]:
# Date window used to select the games that make up the "2018" season row.
d1 = '2017-09-01'
d2 = '2018-06-01'
df = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")
# NOTE(review): gmDate is compared against string bounds; this is only a
# chronological filter if gmDate holds 'YYYY-MM-DD' text - confirm.
df_recent = df[(df.gmDate >= d1) & (df.gmDate <= d2)]

# map the box-score column names onto the names used by the annual data
colname_dict = {'playPTS': 'PTS', 'playAST': 'AST', 'playTO':'TOV',
                'playFG%': 'FG%','playFT%':'FT%','play3PM':'3P',
                'playTRB':'TRB','playSTL':'STL','playBLK':'BLK',
                'teamAbbr': 'Tm', 'playPos': 'Pos', 'playMin': 'MP',
                'playDispNm': 'Player'}
df_recent = df_recent.rename(columns=colname_dict)

# take totals (counting stats are summed, percentages are averaged per game)
sumColumns = ['PTS', 'AST', 'TOV', '3P', 'TRB', 'STL', 'BLK', 'MP']
avgColumns = ['FG%', 'FT%']
d = {sumColumn: 'sum' for sumColumn in sumColumns}
dtemp = {avgColumn: 'mean' for avgColumn in avgColumns}
d.update(dtemp)

# Count games in the SAME aggregation as the totals so each player's 'G'
# is tied to that player's row, instead of attaching the result of a second
# groupby by position.
aggSpec = dict(d)
aggSpec['gmDate'] = 'count'
df_recent = (
    df_recent
    .groupby(['Player'], as_index=False)
    .agg(aggSpec)
    .rename(columns={'gmDate': 'G'})
)
df_recent['Year'] = 2018

# fix accented characters
df_recent.Player = df_recent.Player.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')

# column order: Player, totals, averages, Year, games played
df_recent = df_recent[['Player'] + sumColumns + avgColumns + ['Year', 'G']]

# games played per player, in the same row order as df_recent
numGmsPlayed = df_recent['G'].values

# PARAMETERS

In [4]:
""" USER PARAMETERS """
# Individual settings; min_games is also used directly to filter dfAnnual.
starting_year, min_games, min_years = 1970, 30, 2
validate_year, pred_interval = 2017, 1

# Positional parameter list handed to the annualData helpers.
# It must stay a mutable list: index 3 (the validation year) is reassigned later.
params = list((starting_year, min_games, min_years, validate_year, pred_interval))

# metrics predicted in the RSC section
predMetrics = ['PTS_G']

# SLA Predictions

In [5]:
# load annual dataframe
# Inputs: the user parameter list and the aggregated 2018 rows (df_recent).
# NOTE(review): how createAnnualData combines them is not visible in this
# notebook - presumably it appends df_recent to the historical table; confirm.
dfAnnual = annualData.createAnnualData(params, df_recent)

In [6]:
# keep only the rows with at least `min_games` games played
dfAnnual = dfAnnual[dfAnnual.G >= min_games]

In [7]:
# Columns that convertMetrics divides by games played ('G').
metrics = ['PTS', 'MP', '3P', '3PA', 'AST', 'STL', 'BLK', 'TOV', 'FT', 'FTA']

# dfAnnual is a boolean-filtered slice at this point, so pass an explicit copy:
# column assignments then never target a slice of another frame.
# NOTE: this cell is not idempotent - running it twice divides by 'G' twice.
# Re-run from the createAnnualData cell if it has to be repeated.
dfAnnual = convertMetrics(dfAnnual.copy(), metrics)

trainYear = 2015
cvYear = 2016
testYear = 2017

# Each split is cumulative: all rows up to and including the given year.
dfTrain = dfAnnual[dfAnnual.Year <= trainYear]
dfCV = dfAnnual[dfAnnual.Year <= cvYear]
dfTest = dfAnnual[dfAnnual.Year <= testYear]

In [169]:
# candidate values for the feature hyperparameters
statsWindow, statsCom = np.array([2]), np.array([0.3])
teamWindow, teamCom = np.array([2]), np.array([0.3])
n = np.array([1])

# candidate values for the model hyperparameters
rank = np.arange(3)
project = [True, False]

In [202]:
# metric being predicted, plus the one-element list form used by the loops
metric = 'PTS'
metrics = [metric]

# feature / label transforms understood by getParamDicts
labelType = 'none'
featureTypes = ['std', 'mean', 'ewm', 'teammates']

In [203]:
# every hyperparameter with its candidate values
hyperparamDict = dict(
    statsWindow=statsWindow, statsCom=statsCom,
    teamWindow=teamWindow, teamCom=teamCom,
    n=n,
    rank=rank, project=project,
)
# expand into the full list of combinations and show how many there are
paramGrid = [*ParameterGrid(hyperparamDict)]
len(paramGrid)

6

In [210]:
# alternative player list:
#players = ['Russell Westbrook', 'LeBron James', 'Klay Thompson', 'Stephen Curry', 'Kevin Durant', 'James Harden']
players = ['Carmelo Anthony']
# best hyperparameters per player; each starts as its own empty dict
players_optParamDict = dict((name, {}) for name in players)

In [211]:
# Shared SLA state. After annualSLA.trainSLA runs, the 'model', 'features'
# and 'labels' entries are read back from this dict below.
slaDict = {'type': 'linear', 'params': {}}

# Grid search: for every player, train on dfTrain with each hyperparameter
# setting, score on dfCV with mean absolute error, and keep the best setting.
for player in players:
    # create info and SLA dictionaries
    infoDict = {'player': player, 'metric': metric}

    # initialize optimality conditions
    optParamDict = paramGrid[0]
    error = float('inf')

    # iterate through parameters
    for paramDict in paramGrid:
        print(paramDict)

        # create parameter dictionaries
        infoDict, featuresDict, labelsDict, modelDict = getParamDicts(paramDict, infoDict, featureTypes, labelType)

        # train
        dataDict = {'df': dfTrain}
        annualSLA.trainSLA(infoDict, dataDict, featuresDict, labelsDict, modelDict, slaDict)
        predTrain = slaDict['model'].predict(slaDict['features'])
        trueTrain = slaDict['labels']

        # test
        dataDict = {'df': dfCV}
        pred, true = annualSLA.testSLA(infoDict, dataDict, featuresDict, labelsDict, modelDict, slaDict)

        # compute error ('<=' means that on a tie the later setting wins)
        errorTemp = mae(pred, true)
        if errorTemp <= error:
            error = errorTemp
            optParamDict = paramDict

        # presentation
        dfPred = getPlayerDF(player, pred, metric)
        dfTrue = getPlayerDF(player, true, metric)
        df = getErrorDF(dfPred, dfTrue)
        print(df)
        print()

        # plot
        #preds = np.append(predTrain, pred)
        #true = np.append(trueTrain, true)
        #seriesDict = {'SLA': {'data': preds}}
        #trueDict = {'data': true}
        #plotData.plotAnnual(seriesDict, trueDict, player, infoDict['buffer'])

    # update player optimal hyperparameters
    players_optParamDict[player] = optParamDict

    # display results
    print("{}: {}".format(player, optParamDict))
    # Built-in round() handles both numpy floats and the plain float('inf')
    # that remains when no setting produced a comparable error (e.g. all NaN).
    print("Error = {}".format(round(error, 2)))
    print()

{'n': 1, 'project': True, 'rank': 0, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWindow': 2}
                  True   Pred  Error
Carmelo Anthony  21.85  25.73   3.88

{'n': 1, 'project': True, 'rank': 1, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWindow': 2}
                  True   Pred  Error
Carmelo Anthony  21.85  24.47   2.63

{'n': 1, 'project': True, 'rank': 2, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWindow': 2}
                  True   Pred  Error
Carmelo Anthony  21.85  25.84   3.99

{'n': 1, 'project': False, 'rank': 0, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWindow': 2}
                  True   Pred  Error
Carmelo Anthony  21.85  25.73   3.88

{'n': 1, 'project': False, 'rank': 1, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWindow': 2}
                  True   Pred  Error
Carmelo Anthony  21.85  24.47   2.63

{'n': 1, 'project': False, 'rank': 2, 'statsCom': 0.3, 'statsWindow': 2, 'teamCom': 0.3, 'teamWind

In [212]:
# Final evaluation: with each player's selected hyperparameters, train on
# dfCV and predict on dfTest, then collect predictions and true values with
# one column per player and one row per metric.
allPred = pd.DataFrame()
allTrue = pd.DataFrame()

for player in players:
    playerPred = pd.DataFrame()
    playerTrue = pd.DataFrame()

    for metric in metrics:
        # create info and SLA dictionaries
        infoDict = {'player': player, 'metric': metric}

        # hyperparameters selected for this player by the search above
        optParamDict = players_optParamDict[player]

        # create parameter dictionaries
        infoDict, featuresDict, labelsDict, modelDict = getParamDicts(optParamDict, infoDict, featureTypes, labelType)

        # train
        dataDict = {'df': dfCV}
        annualSLA.trainSLA(infoDict, dataDict, featuresDict, labelsDict, modelDict, slaDict)

        # test
        dataDict = {'df': dfTest}
        pred, true = annualSLA.testSLA(infoDict, dataDict, featuresDict, labelsDict, modelDict, slaDict)

        # presentation
        dfPred = getPlayerDF(player, pred, metric)
        playerPred = pd.concat([playerPred, dfPred], axis=0)
        dfTrue = getPlayerDF(player, true, metric)
        playerTrue = pd.concat([playerTrue, dfTrue], axis=0)

    allPred = pd.concat([allPred, playerPred], axis=1)
    allTrue = pd.concat([allTrue, playerTrue], axis=1)

# Build the comparison table once, after every player has been added.
# Copies are passed so allPred / allTrue keep their metric index whatever
# getErrorDF does to its arguments.
df = getErrorDF(allPred.copy(), allTrue.copy())
df

Unnamed: 0,True,Pred,Error
Carmelo Anthony,22.42,22.82,0.4


In [215]:
# Inspect the hyperparameters left in optParamDict by the last loop iteration
# (i.e. those of the last player processed).
optParamDict

{'n': 1,
 'project': False,
 'rank': 1,
 'statsCom': 0.3,
 'statsWindow': 2,
 'teamCom': 0.3,
 'teamWindow': 2}

In [216]:
# Inspect the 'features' entry stored in slaDict by the most recent SLA run.
# NOTE(review): presumably the training feature matrix - confirm in annualSLA.
slaDict['features']

array([[ 1.3675461 , 21.13067143, 21.23263582, 14.71947077],
       [ 1.50182048, 23.20541512, 23.31739102, 16.16472201],
       [ 1.95846779, 30.26131205, 30.40733564, 21.07980807],
       [ 1.61889375, 25.01437559, 25.13508049, 17.42483061],
       [ 1.55440616, 24.01794409, 24.13384079, 16.73072373],
       [ 1.79764664, 27.77638013, 27.91041286, 19.34882271],
       [ 1.8249443 , 28.19817059, 28.33423865, 19.64263886],
       [ 1.4543422 , 22.47180342, 22.58023934, 15.65369348],
       [ 1.7276522 , 26.69485944, 26.82367339, 18.59544334],
       [ 1.65262208, 25.53552966, 25.65874935, 17.78786271],
       [ 1.57113642, 24.276452  , 24.39359611, 16.91079845]])

In [217]:
# Inspect the 'test_feature' entry stored in slaDict by the most recent SLA run.
# NOTE(review): presumably the feature vector used for the test prediction - confirm.
slaDict['test_feature']

array([ 1.15138889, 22.99861111, 22.27899306, 16.61898396])

In [218]:
# Inspect the coefficients of the underlying fitted model held in slaDict.
slaDict['model'].model.coef_

array([0.02589276, 0.40008254, 0.40201311, 0.27869456])

In [219]:
# All annual rows for `player`. This relies on the loop variable left behind
# by the preceding `for player in players` loop (the last player processed).
dfAnnual[dfAnnual.Player==player]

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,PF,PTS,year_start,year_end,position,height,weight,birth_date,college,year_count
10517,16500.0,2004,Carmelo Anthony,SF,19.0,DEN,82.0,82.0,36.52439,17.6,...,225.0,21.036585,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,0
10958,17092.0,2005,Carmelo Anthony,SF,20.0,DEN,75.0,75.0,34.773333,16.7,...,229.0,20.773333,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,1
11419,17675.0,2006,Carmelo Anthony,SF,21.0,DEN,80.0,80.0,36.7625,22.0,...,229.0,26.525,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,2
11874,18239.0,2007,Carmelo Anthony,SF,22.0,DEN,65.0,65.0,38.246154,22.1,...,203.0,28.938462,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,3
12334,18761.0,2008,Carmelo Anthony,SF,23.0,DEN,77.0,77.0,36.441558,21.1,...,253.0,25.688312,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,4
12784,19359.0,2009,Carmelo Anthony,SF,24.0,DEN,66.0,66.0,34.5,19.0,...,195.0,22.787879,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,5
13222,19937.0,2010,Carmelo Anthony,SF,25.0,DEN,69.0,69.0,38.173913,22.2,...,225.0,28.15942,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,6
13665,20521.0,2011,Carmelo Anthony,SF,26.0,TOT,77.0,77.0,35.727273,21.7,...,224.0,25.584416,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,7
14115,21143.0,2012,Carmelo Anthony,SF,27.0,NYK,55.0,55.0,34.109091,21.1,...,156.0,22.636364,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,8
14590,21701.0,2013,Carmelo Anthony,PF,28.0,NYK,67.0,67.0,37.044776,24.8,...,205.0,28.656716,2004,2018.0,F,6-8,240.0,"May 29, 1984",Syracuse University,9


# RSC Predictions

In [None]:
"""
setup
"""
# user-tunable inputs
donor_window_type, normalize_metric = 'sliding', None
threshold = 0.98
helper_metrics = list()

# derived configuration
donorSetup = [normalize_metric, donor_window_type, True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"
verbose = False
# one list per predicted metric: the metric itself followed by the helper metrics
metrics = [[predMetric, *helper_metrics] for predMetric in predMetrics]

In [None]:
# create donor / target for validation
# Set the prediction year (params[3]) explicitly from validate_year rather
# than incrementing it in place, so re-running this cell always uses the
# same two years instead of drifting by one on every execution.
params[3] = validate_year
donor_v, allPivotedTableDict_v, _ = annualData.createTargetDonors(params, dfAnnual)

# create donor / target for testing (prediction year = validation year + 1)
params[3] = validate_year + 1
donor, allPivotedTableDict, targetPlayers = annualData.createTargetDonors(params, dfAnnual)