In [5]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    """Turn the raw ``stats`` frame into one pivoted table per metric.

    Three groups of columns are handed to the project helpers
    ``getMetricsPerGameDict``, ``getMetricsPerCentDict`` and
    ``getMetricsWeightedDict``; their results are merged into a single
    dictionary (later groups win on key clashes) and passed to
    ``getPivotedTableDict``.

    Returns:
        (allPivotedTableDict, allMetrics): the pivoted dictionary and the list
        of metric keys in merge order.
    """
    per_game = getMetricsPerGameDict(stats, ["PTS","AST","TOV","TRB","STL","BLK","3P"])
    per_cent = getMetricsPerCentDict(stats, ["FG","FT"])
    weighted = getMetricsWeightedDict(stats, ["PER"])

    # Merge in the same order as before: per-game, then percentage, then weighted.
    merged = {}
    for group in (per_game, per_cent, weighted):
        merged.update(group)

    return getPivotedTableDict(merged), list(merged)

def getActivePlayers(stats, year, buffer):
    """List the players that have rows in ``year`` and in each of the
    ``buffer`` years immediately before it.

    With ``buffer == 0`` the names come back in order of first appearance;
    otherwise the order is that of a set intersection (unspecified).
    """
    def names_in(season):
        # unique player names recorded for one season
        return stats.loc[stats.Year == season, "Player"].unique()

    active = list(names_in(year))
    for offset in range(1, buffer + 1):
        active = list(set(active) & set(names_in(year - offset)))
    return active

def getTopPlayers(stats, year, metric, n):
    """Return the ``n`` players with the highest mean ``metric`` in ``year``.

    Rows for ``year`` are averaged per player (so a player with several rows
    in that year counts once) and ranked by ``metric`` in descending order.

    Args:
        stats: DataFrame with at least ``Player``, ``Year`` and ``metric`` columns.
        year: season to rank.
        metric: name of the numeric column to rank by.
        n: maximum number of players to return.

    Returns:
        One-column DataFrame (``Player``) with at most ``n`` rows, best first,
        indexed 0..k-1.
    """
    season = stats[stats.Year == year]
    # numeric_only=True keeps the old "silently drop text columns" behaviour;
    # without it pandas >= 2.0 raises TypeError as soon as a string column
    # (team, position, ...) is present.
    per_player = season.groupby("Player").mean(numeric_only=True).reset_index()
    # No second year filter is needed: every remaining row already belongs to `year`.
    ranked = per_player.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval, player_name=None):
    """Baseline forecast: predict each metric as the mean of the target's history.

    Args:
        target: object whose ``concat(metrics_to_use)`` returns a pair; the
            first element is a frame with the metrics laid side by side.
        metrics_to_use: metric names; used as the row labels of both results.
        pred_interval: number of trailing columns per metric that are held out
            from the history.
        player_name: label for the prediction column. When omitted, the
            notebook-level variable ``playerName`` is used, which is what this
            function silently relied on before; pass it explicitly in new code.

    Returns:
        (true, pred): ``true`` is the transposed output of
        ``utils.get_postint_data`` re-indexed by metric; ``pred`` has one row
        per metric and a single column named after the player.
    """
    if player_name is None:
        # Legacy fallback to the global the original implementation read.
        player_name = playerName

    target_data, _ = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1] / num_k - pred_interval)
    # NOTE(review): the window passed to the utils helpers ends at
    # interv_index + 1 regardless of pred_interval, so it looks like only the
    # first held-out step is compared -- confirm this is intended.
    total_index = int(interv_index + 1)

    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use

    # predictions: per metric, average that metric's slice of the history columns
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = [
        history.iloc[:, k * interv_index:(k + 1) * interv_index].mean(axis=1).to_list()
        for k in range(num_k)
    ]

    pred = pd.DataFrame(pred, index=metrics_to_use, columns=[player_name])
    return true, pred

def getR2(true, pred, bench):
    """Row-wise R^2 of ``pred`` against ``true`` relative to a baseline ``bench``.

    For every row: ``1 - sum((true - pred)^2) / sum((true - bench)^2)``.
    The three frames are compared positionally (via ``.values``).

    Returns:
        One-column DataFrame indexed like ``true`` and named after the first
        column of ``pred``.
    """
    def row_sse(reference):
        # sum of squared differences between `true` and `reference`, per row
        squared = (true.values - reference.values) ** 2
        return pd.DataFrame(squared, index=true.index).sum(axis=1)

    residual = row_sse(pred)
    total = row_sse(bench)
    score = 1 - residual / total
    return score.to_frame(name=pred.columns.values[0])

In [10]:
def getDictionaryGameByGame(data, metrics):
    """Build, per metric, a player-by-game-number table.

    Each metric is pivoted to players (``playDispNm``) by dates (``gmDate``);
    every row is then packed to the left by dropping its missing values, so
    the resulting columns are 0, 1, 2, ... and rows with fewer entries are
    padded with NaN on the right.

    Returns:
        dict mapping each metric name to its packed DataFrame.
    """
    def pack_left(row):
        # keep only the observed values, re-indexed from 0
        return pd.Series(row.dropna().values)

    tables = {}
    for metric in metrics:
        by_date = pd.pivot_table(data, values=metric, index="playDispNm", columns="gmDate")
        tables[metric] = by_date.apply(pack_left, axis=1).fillna(np.nan)
    return tables

def getMonthlyData(data, metrics):
    """Build, per metric, a player-by-month table of monthly means.

    Rows of ``data`` are grouped by player (``playDispNm``) and calendar month
    of the ``date`` column, averaged, and pivoted to one row per player and
    one column per month. Columns are renumbered 0..k-1.

    Args:
        data: DataFrame with ``date`` (datetime), ``playDispNm`` and the
            numeric ``metrics`` columns. It is not modified.
        metrics: names of the columns to tabulate.

    Returns:
        (my_dict, cols): ``my_dict`` maps each metric to its pivoted table;
        ``cols`` holds the month-end labels of the last table built (an empty
        DatetimeIndex when ``metrics`` is empty).
    """
    df = data.copy()
    df.index = df.date

    # "ME" is the current month-end alias; older pandas only understands "M".
    try:
        month_end = pd.Grouper(freq="ME")
    except ValueError:
        month_end = pd.Grouper(freq="M")

    # numeric_only=True drops text/datetime columns instead of raising on
    # pandas >= 2.0 (earlier versions dropped them silently).
    df_grouped = df.groupby(by=[df.playDispNm, month_end]).mean(numeric_only=True)

    my_dict = {}
    cols = pd.DatetimeIndex([], name="date")
    for metric in metrics:
        # Pivot the requested metric. This used to be hard-coded to 'playPTS',
        # so every key ended up holding the points table.
        df_pivoted = pd.pivot_table(df_grouped, values=metric, index='playDispNm', columns="date")
        cols = df_pivoted.columns
        df_pivoted.columns = range(df_pivoted.shape[1])
        my_dict[metric] = df_pivoted
    return my_dict, cols

# Game by Game

In [7]:
"""
import data
"""
print("*** importing data ***")
data = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")

# Wider list of box-score columns that could serve as metrics (kept for reference):
# metrics = ['playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK',
#        'playPF', 'playFGA', 'playFGM', 'playFG%', 'play2PA', 'play2PM',
#        'play2P%', 'play3PA', 'play3PM', 'play3P%', 'playFTA', 'playFTM',
#        'playFT%', 'playORB', 'playDRB', 'playTRB']

# Metrics actually used in this section.
metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']

# Combine the gmDate and gmTime text columns into a single datetime column
# named "date" and prepend it to the frame.
date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
data = pd.concat([date_col,data], axis=1)

# Inline predecessor of getDictionaryGameByGame (same logic, now a function):
# data_dict = {}
# for i in range(len(metrics)):
#     data_pivot = pd.pivot_table(data, values=metrics[i], index="playDispNm", columns = "gmDate")
#     shifted_df = data_pivot.apply(lambda x: pd.Series(x.dropna().values), axis=1).fillna(np.nan)
#     data_dict.update({metrics[i]: shifted_df})

print("*** preparing data ***")

pred_date = pd.to_datetime('2012-12-01') # cut-off: rows dated up to here are treated as known history
pred_interval_time = pd.Timedelta("5 day") # length of the prediction window that follows the cut-off


########### Donor ##########
# donor pool: rows dated on or before the cut-off
donor_data = data[data.date <= pred_date]
donor_dict = getDictionaryGameByGame(donor_data, metrics)
donor = Donor(donor_dict)

########### Target ##########
# target tables: history plus the prediction window
target_data = data[data.date <= pred_date + pred_interval_time]
target_dict = getDictionaryGameByGame(target_data, metrics)

""" target """
# rows inside the prediction window; the players appearing here form the evaluation list
data_pred = data[(data.date > pred_date)&(data.date <= pred_date+pred_interval_time)]
# getDictionaryGameByGame(data_pred, ['playPTS'])['playPTS']
allPlayers = list(data_pred.playDispNm.unique())
allPlayers.sort()
print(len(allPlayers))

*** importing data ***
*** preparing data ***
340


In [8]:
"""
experiment setup
"""
# overall setup -- these values are forwarded unchanged to mRSC.fit_threshold
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']
# one single-metric group per fit
metrics_list = [[x] for x in metrics]

"""
experiment
"""
print("1 metric at once")

# Per-player frames are collected and concatenated once after the loop.
# Growing a DataFrame with pd.concat inside the loop is quadratic, and
# concatenating onto an empty DataFrame can upcast the result to object dtype.
all_pred_frames = []
all_true_frames = []
for playerName in allPlayers:
    target = Target(playerName, target_dict)

    # rows the player has on or before the cut-off; whatever remains of the
    # target's length is the number of steps to predict
    interv_index = donor_data[donor_data.playDispNm == playerName].shape[0]
    pred_interval = target.total_index - interv_index
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    # skip players for whom fewer than 5 donor rows are available
    if (donor.concat([metrics[0]], target.total_index, method = "sliding").shape[0] <5):
        continue

    # column labels "<player> <step>" shared by predictions and ground truth
    step_labels = [playerName +" "+ str(a) for a in range(pred_interval)]
    pred_parts = []
    true_parts = []
    for metric_group in metrics_list:
        mrsc.fit_threshold(metric_group, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = step_labels
        true.columns = step_labels
        pred_parts.append(pred)
        true_parts.append(true)
    # stack the per-metric results of this player
    player_pred = pd.concat(pred_parts, axis=0)
    player_true = pd.concat(true_parts, axis=0)
    all_pred_frames.append(player_pred)
    all_true_frames.append(player_true)

# pd.concat refuses an empty list, so fall back to an empty frame when every
# player was skipped
all_pred = pd.concat(all_pred_frames, axis=1) if all_pred_frames else pd.DataFrame()
all_true = pd.concat(all_true_frames, axis=1) if all_true_frames else pd.DataFrame()

###################
# print(all_pred)
print(all_pred.shape)
# entries whose true value is 0 are excluded from MAPE (they become NaN and
# are skipped by mean)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once
(9, 461)
*** MAPE ***
playPTS     2.498844
playAST     6.522290
playTO      3.966629
playFG%     2.373025
playFT%     3.423726
play3PM     1.932879
playTRB     2.813439
playSTL    11.543759
playBLK     2.417315
dtype: float64
MAPE for all:  4.302826660025696

*** RMSE ***
playPTS    33.532858
playAST    69.567546
playTO     12.630808
playFG%     3.431900
playFT%     5.919400
play3PM     4.900528
playTRB    24.776438
playSTL    45.193649
playBLK     3.999013
dtype: float64
RMSE for all:  22.66134902876346


# Monthly?

In [11]:
"""
import data
"""
print("*** importing data ***")
data = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']

# Combine the gmDate and gmTime text columns into a single datetime column
# named "date" and prepend it to the frame.
date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
data = pd.concat([date_col,data], axis=1)

print("*** preparing data ***")

pred_date = pd.to_datetime('2013-10-31') # cut-off: rows dated up to here are treated as known history
pred_interval = 3
# Prediction window = pred_interval blocks of 30 days after the cut-off.
# NOTE(review): getMonthlyData buckets by calendar month, so a 90-day window
# does not line up with month boundaries and the last bucket can be partial.
pred_interval_time = pd.Timedelta("30 days") * pred_interval


########### Donor ##########
# donor pool: rows dated on or before the cut-off, aggregated per month
donor_data = data[data.date <= pred_date]
donor_dict, cols_donor = getMonthlyData(donor_data, metrics)
donor = Donor(donor_dict)

########### Target ##########
# target tables: history plus the prediction window, aggregated per month
target_data = data[data.date <= pred_date + pred_interval_time]
target_dict, cols_target = getMonthlyData(target_data, metrics)

# show which month-end labels each table covers
print()
print("targte columns")
print(cols_target)
print("donor columns")
print(cols_donor)

*** importing data ***
*** preparing data ***

targte columns
DatetimeIndex(['2012-10-31', '2012-11-30', '2012-12-31', '2013-01-31',
               '2013-02-28', '2013-03-31', '2013-04-30', '2013-10-31',
               '2013-11-30', '2013-12-31', '2014-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)
donor columns
DatetimeIndex(['2012-10-31', '2012-11-30', '2012-12-31', '2013-01-31',
               '2013-02-28', '2013-03-31', '2013-04-30', '2013-10-31'],
              dtype='datetime64[ns]', name='date', freq=None)


In [14]:
"""
experiment setup
"""
# overall setup -- these values are forwarded unchanged to mRSC.fit_threshold
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']
# one single-metric group per fit
metrics_list = [[x] for x in metrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
# Single-player debugging run; the loop over all players is disabled.
# NOTE(review): allPlayers is not assigned by the monthly cells above this
# one -- the value used here is whatever an earlier cell left in the kernel.
# for playerName in playerNames:
playerName = allPlayers[123]
# print(playerName)
# print("*** year - year_count matching for this player")
# a = df_year[df_year.index == playerName]
# print(a.dropna(axis=1))

target = Target(playerName, target_dict)
# print("*** target - total index: ", target.total_index)
# print(target.concat(metrics_list[1]))

# NOTE(review): interv_index counts the player's per-game rows in donor_data,
# while target_dict was built by getMonthlyData (one column per month). The
# two are in different units, and this assignment also overwrites the
# pred_interval configured in the data-preparation cell. Confirm what
# target.total_index measures before trusting pred_interval here. As recorded
# in this cell's output, the run stops with "Donor pool size too small".
interv_index = donor_data[donor_data.playDispNm == playerName].shape[0]
pred_interval = target.total_index - interv_index
mrsc = mRSC(donor, target, pred_interval, probObservation=1)

# if (donor.concat([metrics[0]], target.total_index, method = "sliding").shape[0] <5):
#     continue

# fit and predict one metric at a time, stacking the results row-wise
player_pred = pd.DataFrame()
player_true = pd.DataFrame()
for i in range(len(metrics_list)):
    mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
    pred = mrsc.predict()
    true = mrsc.getTrue()
    # label columns "<player> <step>"
    pred.columns = [playerName +" "+ str(a) for a in range(pred_interval)]
    true.columns = [playerName +" "+ str(a) for a in range(pred_interval)]
    player_pred = pd.concat([player_pred, pred], axis=0)
    player_true = pd.concat([player_true, true], axis=0)
all_pred = pd.concat([all_pred, player_pred], axis=1)
all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
# entries whose true value is 0 are excluded from MAPE (they become NaN and
# are skipped by mean)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once


Exception: Donor pool size too small. Donor pool size: Gustavo Ayon(0, 9)

# Benchmark

In [78]:
# metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G","TRB_G","STL_G","BLK_G"]
# NOTE(review): allMetrics, allPivotedTableDict and playerNames are not
# assigned anywhere in the visible notebook -- presumably allMetrics and
# allPivotedTableDict come from a prepareData(...) call; confirm and add the
# defining cell so that Restart & Run All works.
metrics_to_use = allMetrics

print("Algo: outputs the mean of the player's history")
print("-----")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
# The loop variable must be called playerName: getBenchmark reads that
# notebook-level name to label its prediction column.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)
    
    # one column per player, aligned on the metric index
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
# entries whose true value is 0 are excluded from MAPE (they become NaN and
# are skipped by mean)
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())    

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.413246
AST_G    0.387396
TOV_G    0.400230
TRB_G    0.275911
STL_G    0.320220
BLK_G    0.454082
3P_G     0.533344
FG%      0.063127
FT%      0.061047
PER_w    0.202120
dtype: float64
MAPE for all:  0.306121947982417

*** RMSE ***
PTS_G    4.409235
AST_G    1.131206
TOV_G    0.564801
TRB_G    1.450656
STL_G    0.274086
BLK_G    0.235605
3P_G     0.466516
FG%      0.036495
FT%      0.057969
PER_w    3.312910
dtype: float64
RMSE for all:  1.1939480121133492


In [33]:
# For every metric, list the ten columns with the largest MAPE.
for metric in allMetrics:
    print()
    print(metric)
    ranked = mape.loc[metric, :].sort_values(ascending=False)
    print(ranked.head(10))


PTS_G
Tyler Hansbrough 0    2.60442
Kirk Hinrich 0        2.54157
Nick Collison 0       2.40373
Tayshaun Prince 0     1.82742
Ty Lawson 0           1.48293
Jason Thompson 0      1.42104
Brandon Jennings 0    1.19949
Steve Blake 0         1.19918
Jonas Jerebko 0       1.19298
Beno Udrih 0          1.15311
Name: PTS_G, dtype: object

AST_G
Anthony Morrow 0       7.07181
Tyler Hansbrough 0     5.93413
Nick Young 0            3.7168
Bismack Biyombo 0      3.34997
Wesley Johnson 0       2.60229
DeAndre Jordan 0       2.04812
Enes Kanter 0          2.00037
Richard Jefferson 0    1.92116
C.J. Miles 0           1.85656
Anthony Tolliver 0     1.66044
Name: AST_G, dtype: object

TOV_G
Anthony Morrow 0       5.14906
Tyler Hansbrough 0     3.34793
Anthony Tolliver 0     2.25923
Jason Thompson 0       2.25143
Tayshaun Prince 0      2.21373
Nick Young 0           1.68462
Kirk Hinrich 0         1.34559
Wayne Ellington 0      1.31643
Richard Jefferson 0    1.18031
Gary Neal 0            1.16256
Name:

In [717]:
# Summary of the current all_pred / all_true, with rows ordered like allMetrics.
print(all_pred.shape)
# entries whose true value is 0 are excluded from MAPE (they become NaN and
# are skipped by mean)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1).reindex(allMetrics))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse.reindex(allMetrics))
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
# NOTE(review): all_R2 is not assigned anywhere in the visible notebook; the
# cell that built it (presumably with getR2) needs to be restored for this to
# run on a fresh kernel.
print(all_R2.mean(axis=1).reindex(allMetrics))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))
##############################################################

(10, 99)
*** MAPE ***
PTS_G    0.274777
AST_G    0.537414
TOV_G    0.309435
TRB_G    0.181637
STL_G    0.231816
BLK_G    0.374646
3P_G     1.778391
FG%      0.071516
FT%      0.099925
PER_w    0.190299
dtype: float64
MAPE for all:  0.38260637091850613

*** RMSE ***
PTS_G    5.052958
AST_G    1.798591
TOV_G    0.700045
TRB_G    1.167430
STL_G    0.234047
BLK_G    0.190006
3P_G     0.780453
FG%      0.051968
FT%      0.093497
PER_w    4.362244
dtype: float64
RMSE for all:  1.4431239627611094

*** R2 ***
PTS_G    -65.290399
AST_G   -178.406691
TOV_G    -68.639120
TRB_G      0.755287
STL_G      0.998910
BLK_G     -6.225443
3P_G    -116.758836
FG%        0.999972
FT%      -24.373353
PER_w     -3.446957
dtype: float64
R2 for all:  -46.03866302248279


# Monthly

In [12]:
def getMonthlyData(data, metrics):
    """Build, per metric, a player-by-month table of monthly means.

    Rows of ``data`` are grouped by player (``playDispNm``) and calendar month
    of the ``date`` column, averaged, and pivoted to one row per player and
    one column per month. Columns are renumbered 0..k-1.

    Args:
        data: DataFrame with ``date`` (datetime), ``playDispNm`` and the
            numeric ``metrics`` columns. It is not modified.
        metrics: names of the columns to tabulate.

    Returns:
        (my_dict, cols): ``my_dict`` maps each metric to its pivoted table;
        ``cols`` holds the month-end labels of the last table built (an empty
        DatetimeIndex when ``metrics`` is empty).
    """
    df = data.copy()
    df.index = df.date

    # "ME" is the current month-end alias; older pandas only understands "M".
    try:
        month_end = pd.Grouper(freq="ME")
    except ValueError:
        month_end = pd.Grouper(freq="M")

    # numeric_only=True drops text/datetime columns instead of raising on
    # pandas >= 2.0 (earlier versions dropped them silently).
    df_grouped = df.groupby(by=[df.playDispNm, month_end]).mean(numeric_only=True)

    my_dict = {}
    cols = pd.DatetimeIndex([], name="date")
    for metric in metrics:
        # Pivot the requested metric. This used to be hard-coded to 'playPTS',
        # so every key ended up holding the points table.
        df_pivoted = pd.pivot_table(df_grouped, values=metric, index='playDispNm', columns="date")
        cols = df_pivoted.columns
        df_pivoted.columns = range(df_pivoted.shape[1])
        my_dict[metric] = df_pivoted
    return my_dict, cols

In [13]:
"""
import data
"""
print("*** importing data ***")
data = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']

# Combine the gmDate and gmTime text columns into a single datetime column
# named "date" and prepend it to the frame.
date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
data = pd.concat([date_col,data], axis=1)

print("*** preparing data ***")

pred_date = pd.to_datetime('2013-10-31') # cut-off: rows dated up to here are treated as known history
pred_interval = 3
# Prediction window = pred_interval blocks of 30 days after the cut-off.
# NOTE(review): getMonthlyData buckets by calendar month, so a 90-day window
# does not line up with month boundaries and the last bucket can be partial.
pred_interval_time = pd.Timedelta("30 days") * pred_interval


########### Donor ##########
# donor pool: rows dated on or before the cut-off, aggregated per month
donor_data = data[data.date <= pred_date]
donor_dict, cols_donor = getMonthlyData(donor_data, metrics)
donor = Donor(donor_dict)

########### Target ##########
# target tables: history plus the prediction window, aggregated per month
target_data = data[data.date <= pred_date + pred_interval_time]
target_dict, cols_target = getMonthlyData(target_data, metrics)

# show which month-end labels each table covers
print()
print("targte columns")
print(cols_target)
print("donor columns")
print(cols_donor)

*** importing data ***


*** preparing data ***

targte columns
DatetimeIndex(['2012-10-31', '2012-11-30', '2012-12-31', '2013-01-31',
               '2013-02-28', '2013-03-31', '2013-04-30', '2013-10-31',
               '2013-11-30', '2013-12-31', '2014-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)
donor columns
DatetimeIndex(['2012-10-31', '2012-11-30', '2012-12-31', '2013-01-31',
               '2013-02-28', '2013-03-31', '2013-04-30', '2013-10-31'],
              dtype='datetime64[ns]', name='date', freq=None)


In [15]:
""" target """
# Rows that fall inside the prediction window (after the cut-off, up to and
# including its end), and the alphabetically sorted players appearing in them.
in_window = (data.date > pred_date) & (data.date <= pred_date + pred_interval_time)
data_pred = data[in_window]
allPlayers = sorted(data_pred.playDispNm.unique())
print(len(allPlayers))

452


In [16]:
# Number of players in the monthly target table that also appear in playerNames.
target_dict['playPTS'].index.isin(playerNames).sum()

95

In [20]:
"""
experiment setup
"""
# overall setup -- these values are forwarded unchanged to mRSC.fit_threshold
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']
# one single-metric group per fit
metrics_list = [[x] for x in metrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
# Single-player debugging run; the loop over all players is disabled.
# NOTE(review): playerNames is not assigned anywhere in the visible notebook;
# it has to be left in the kernel by some other cell.
# for playerName in playerNames:
playerName = playerNames[0]
# print(playerName)
# print("*** year - year_count matching for this player")
# a = df_year[df_year.index == playerName]
# print(a.dropna(axis=1))

target = Target(playerName, target_dict)
# print("*** target - total index: ", target.total_index)
# print(target.concat(metrics_list[1]))

# NOTE(review): interv_index counts the player's per-game rows in donor_data,
# while target_dict was built by getMonthlyData (one column per month). The
# two are in different units, and this assignment also overwrites the
# pred_interval configured in the data-preparation cell. Confirm what
# target.total_index measures before trusting pred_interval here. As recorded
# in this cell's output, the run stops with "Donor pool size too small".
interv_index = donor_data[donor_data.playDispNm == playerName].shape[0]
pred_interval = target.total_index - interv_index
mrsc = mRSC(donor, target, pred_interval, probObservation=1)

# if (donor.concat([metrics[0]], target.total_index, method = "sliding").shape[0] <5):
#     continue

# fit and predict one metric at a time, stacking the results row-wise
player_pred = pd.DataFrame()
player_true = pd.DataFrame()
for i in range(len(metrics_list)):
    mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
    pred = mrsc.predict()
    true = mrsc.getTrue()
    # label columns "<player> <step>"
    pred.columns = [playerName +" "+ str(a) for a in range(pred_interval)]
    true.columns = [playerName +" "+ str(a) for a in range(pred_interval)]
    player_pred = pd.concat([player_pred, pred], axis=0)
    player_true = pd.concat([player_true, true], axis=0)
all_pred = pd.concat([all_pred, player_pred], axis=1)
all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
# entries whose true value is 0 are excluded from MAPE (they become NaN and
# are skipped by mean)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once


Exception: Donor pool size too small. Donor pool size: Tony Allen(0, 11)