In [6]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product
from statsmodels.tsa.arima_model import ARMA

def prepareData(stats):
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG","FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

#     metricsWeightedColNames = ["PER"]
#     metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

#     allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
    allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict}
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())
    return allPivotedTableDict, allMetrics

def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def getTopPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval):    
    target_data, nanIndex = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    total_index = int(interv_index + 1)
    
    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use
    
    # predictions
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = []
    for i in range(num_k):
        pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

    pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
    return true, pred

def getR2(true, pred, bench):
    ss_res = pd.DataFrame((true.values - pred.values)**2, index=true.index).sum(axis=1)
    ss_tot = pd.DataFrame((true.values - bench.values)**2, index=true.index).sum(axis=1)
    return (1-ss_res/ss_tot).to_frame(name = pred.columns.values[0])

In [7]:
"""
import data
"""
pred_year = 2015 # the year that we are living in
pred_interval = 1 # we are making predictions for pred_year+1 and +2
min_games = 40 # keeps ~72.3%(10096/13963) of the full data

print("*** importing data ***")
### 1. Players
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980

### 2. Stats
stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
# remove multiple rows for the same [Year, Player]
totals = stats[stats.Tm == "TOT"]
duplicates_removed = stats.drop_duplicates(subset=["Year","Player"], keep=False)
stats = pd.concat([duplicates_removed, totals], axis=0).sort_values("Unnamed: 0")

# fix the name* issue
stats = stats.replace('\*','',regex=True)
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# minimum number of games
stats = stats[stats.G >= min_games]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

print("*** preparing data ***")

########### Donor ##########
# filter stats by the year
stats_donor = stats[stats.Year <= pred_year]
allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# filter stats by the year
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

# just to debug
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

*** importing data ***
*** preparing data ***


In [8]:
"""
targets
"""
activePlayers = getActivePlayers(stats, pred_year+1, buffer=4)
topPlayers = getTopPlayers(stats, pred_year, 'PTS', 300)
topPlayers = topPlayers.values.flatten().tolist()

print("non-rookie active players in 2016: ", len(activePlayers))

# first group
print("***** First Group *****")
target_names_1 = list(set(activePlayers) & set(topPlayers))
max_total_index = 18
print("* not sufficient donor pool:")
for playerName in target_names_1:
    target = Target(playerName, allPivotedTableDict)
    if (target.total_index > max_total_index):
        target_names_1.remove(playerName)
        print(playerName)
print("* total number of target players: ", len(target_names_1))

# second group
print()
print("***** Second Group *****")
target_names_2 = activePlayers
max_total_index = 18
print("* not sufficient donor pool:")
for playerName in target_names_2:
    target = Target(playerName, allPivotedTableDict)
    if (target.total_index > max_total_index):
        target_names_2.remove(playerName)
        print(playerName)
print("* total number of target players: ", len(target_names_2))

##############
# final
playerNames = target_names_2

non-rookie active players in 2016:  136
***** First Group *****
* not sufficient donor pool:
Tim Duncan
* total number of target players:  134

***** Second Group *****
* not sufficient donor pool:
Tim Duncan
* total number of target players:  135


In [9]:
for year in range(2008,2018,1):
    print(year)
    print("mean   : ",stats[stats.Year == year].G.mean())
    print("median : ",stats[stats.Year == year].G.median())
    print()

2008
mean   :  69.50157728706624
median :  73.0

2009
mean   :  68.50621118012423
median :  71.0

2010
mean   :  68.6532507739938
median :  72.0

2011
mean   :  67.37611940298507
median :  71.0

2012
mean   :  57.075085324232084
median :  59.0

2013
mean   :  67.72403560830861
median :  72.0

2014
mean   :  67.54166666666667
median :  71.0

2015
mean   :  66.2507204610951
median :  68.0

2016
mean   :  68.09198813056379
median :  72.0

2017
mean   :  67.75811209439529
median :  71.0



In [10]:
metrics_to_use = allMetrics

print("Algo: outputs the mean of the player's history")
print("-----")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in topPlayers:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)
    
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

# rmse = utils.rmse_2d(true_all, pred_all)
# print()
# print("*** RMSE ***")
# print(rmse)
# print("RMSE for all: ", rmse.mean())    

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.408472
AST_G    0.399391
TOV_G    0.412920
TRB_G    0.297137
STL_G    0.331118
BLK_G    0.487898
3P_G     0.536412
FG%      0.065474
FT%      0.064764
dtype: float64
MAPE for all:  0.3289565653729327


In [6]:
mape

Unnamed: 0,James Harden,Stephen Curry,Russell Westbrook,LeBron James,Damian Lillard,Klay Thompson,LaMarcus Aldridge,Anthony Davis,Kyrie Irving,Chris Paul,...,Pablo Prigioni,Nick Collison,Tyler Hansbrough,Ian Mahinmi,Matt Bonner,Shabazz Napier,Udonis Haslem,Lou Amundson,Luke Babbitt,P.J. Hairston
PTS_G,0.323685,0.290152,0.08687,0.082032,0.192454,0.219412,0.07451,0.194001,0.062686,0.044315,...,0.550734,2.017572,2.028906,0.531092,0.663095,0.386555,1.004249,0.139634,0.37482,0.091382
AST_G,0.406868,0.047059,0.317354,0.022286,0.10978,0.128258,0.302974,0.168662,0.203122,0.003152,...,0.366444,0.218293,1.237524,0.79113,0.089256,0.44532,0.221665,0.741866,0.630889,0.189474
TOV_G,0.405381,0.032087,0.143363,0.029843,0.171034,0.025648,0.204931,0.258161,0.2321,0.08335,...,0.178807,0.175373,1.961325,0.412608,1.68159,0.898977,0.551616,0.416551,0.017402,0.213008
TRB_G,0.301042,0.232596,0.321604,0.038994,0.068204,0.186081,0.021386,0.077823,0.199589,0.040462,...,0.040129,0.880897,1.204242,0.441473,0.986282,1.215686,0.763083,0.226056,0.328177,0.224521
STL_G,0.167054,0.189723,0.157466,0.246367,0.102251,0.278218,0.568505,0.035129,0.326993,0.138545,...,0.142474,1.066091,0.868301,0.47903,1.718478,0.875533,0.595735,0.109119,0.061983,0.0375
BLK_G,0.350711,0.096629,0.112916,0.23239,0.324913,0.147828,0.074926,0.221964,0.014569,0.431197,...,0.393765,1.02286,0.348222,0.377052,0.720073,3.313725,0.558646,0.176961,0.20492,0.733333
3P_G,0.320843,0.429161,0.320383,0.20074,0.202524,0.267248,,0.974115,0.120659,0.345439,...,0.766546,,0.940541,,0.661989,0.363899,1.0,,0.072279,0.026667
FG%,0.010939,0.06664,0.051382,0.042238,0.023413,0.056886,0.050017,0.062377,0.014512,0.022124,...,0.171012,0.179957,0.019769,0.119714,0.130982,0.127902,0.099202,0.152312,0.007378,0.089963
FT%,0.015494,0.004238,0.007038,0.019212,0.036341,0.031723,0.079648,0.031821,0.024775,0.039253,...,0.009648,0.038182,0.262549,0.002459,0.056554,0.071429,0.058469,0.042992,0.015623,0.102222
PER_w,0.178524,0.307302,0.186335,0.01,0.163664,0.157258,0.096726,0.053333,0.060302,0.026336,...,0.248366,0.719008,0.1824,0.291566,0.354455,0.03125,0.210377,0.074153,0.086207,0.304348


# Fixed vs. Sliding

In [7]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"fixed", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in targetNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

1 metric at once
(10, 135)
*** MAPE ***
PTS_G    0.277258
AST_G    0.303469
TOV_G    0.280261
TRB_G    0.234877
STL_G    0.268088
BLK_G    0.368621
3P_G     0.985453
FG%      0.061605
FT%      0.061859
PER_w    0.175964
dtype: float64
MAPE for all:  0.2886828514130085

*** RMSE ***
PTS_G    3.068593
AST_G    0.936258
TOV_G    0.420056
TRB_G    1.398939
STL_G    0.235342
BLK_G    0.194938
3P_G     0.897590
FG%      0.036506
FT%      0.059223
PER_w    2.894064
dtype: float64
RMSE for all:  1.0141509818593006

*** R2 ***
PTS_G    -67.626844
AST_G    -81.817762
TOV_G   -531.174350
TRB_G    -16.179920
STL_G    -45.524027
BLK_G    -10.203595
3P_G    -173.168627
FG%     -404.627689
FT%     -318.188425
PER_w     -5.609394
dtype: float64
R2 for all:  -165.4120631543345

*** edited R2 ***
PTS_G    0.425976
AST_G    0.446762
TOV_G    0.422491
TRB_G    0.394274
STL_G    0.343459
BLK_G    0.351169
3P_G     0.263494
FG%      0.268233
FT%      0.265458
PER_w    0.324021
dtype: float64
R2 for all:  0.

In [15]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

1 metric at once
(10, 135)
*** MAPE ***
PTS_G    0.283164
AST_G    0.296557
TOV_G    0.276936
TRB_G    0.220741
STL_G    0.270618
BLK_G    0.375914
3P_G     0.633310
FG%      0.063463
FT%      0.061988
PER_w    0.177502
dtype: float64
MAPE for all:  0.25906592051948624

*** RMSE ***
PTS_G    3.113604
AST_G    0.932696
TOV_G    0.419533
TRB_G    1.362510
STL_G    0.239959
BLK_G    0.203485
3P_G     0.490664
FG%      0.038089
FT%      0.058718
PER_w    3.146666
dtype: float64
RMSE for all:  1.0005924407133127

*** R2 ***
PTS_G    -87.998178
AST_G   -106.237513
TOV_G   -383.405825
TRB_G    -11.313574
STL_G    -39.888154
BLK_G    -10.675517
3P_G     -55.414178
FG%     -437.338500
FT%       -1.328307
PER_w    -92.268186
dtype: float64
R2 for all:  -122.58679314916817

*** edited R2 ***
PTS_G    0.415672
AST_G    0.434127
TOV_G    0.423864
TRB_G    0.341587
STL_G    0.348934
BLK_G    0.330662
3P_G     0.310874
FG%      0.313764
FT%      0.067343
PER_w    0.316708
dtype: float64
R2 for all:  

# Grouping

In [269]:
# check if grouping with n=2 would work
"""
experiment setup
"""
# overall setup
donorSetup= [None,"Sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
result = pd.DataFrame(index=allMetrics, columns=allMetrics)
for m1 in range(len(allMetrics)):
    for m2 in range(len(allMetrics)):
        metric1 = allMetrics[m1]
        metric2 = allMetrics[m2]
        metrics = [metric1, metric2]
        metrics_list = [metrics]
        print()
        print("***************************************")
        print(metrics)

        all_pred = pd.DataFrame()
        all_true = pd.DataFrame()
        all_bench = pd.DataFrame()
        all_R2 = pd.DataFrame()
        for playerName in playerNames:
            target = Target(playerName, allPivotedTableDict)

            # benchmark
            true, benchmark = getBenchmark(target, metrics, pred_interval)

            # prediction
            mrsc = mRSC(donor, target, pred_interval, probObservation=1)
            player_pred = pd.DataFrame()
            player_true = pd.DataFrame()
            for i in range(len(metrics_list)):
                mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
                pred = mrsc.predict()
                true = mrsc.getTrue()
                pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
                true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
                player_pred = pd.concat([player_pred, pred], axis=0)
                player_true = pd.concat([player_true, true], axis=0)
            all_pred = pd.concat([all_pred, player_pred], axis=1)
            all_true = pd.concat([all_true, player_true], axis=1)
            all_bench = pd.concat([all_bench, benchmark], axis=1)

            R2 = getR2(player_true, player_pred, benchmark)
            all_R2 = pd.concat([all_R2, R2], axis=1)

        ##################
#         print(all_pred.shape)
        mask = (all_true !=0 )
        mape = np.abs(all_pred - all_true) / all_true[mask]
        print("*** MAPE ***")
        print(mape.mean(axis=1))
        result.iloc[m1,m2] = mape.mean(axis=1).values[0]
print(result)
diff = (result.T - mape.mean(axis=1).values).T


***************************************
['PTS_G', 'PTS_G']
*** MAPE ***
PTS_G    0.285066
PTS_G    0.285066
dtype: float64

***************************************
['PTS_G', 'AST_G']
*** MAPE ***
PTS_G    0.283337
AST_G    0.409922
dtype: float64

***************************************
['PTS_G', 'TOV_G']
*** MAPE ***
PTS_G    0.285706
TOV_G    0.393103
dtype: float64

***************************************
['PTS_G', 'TRB_G']
*** MAPE ***
PTS_G    0.300432
TRB_G    0.244106
dtype: float64

***************************************
['PTS_G', 'STL_G']
*** MAPE ***
PTS_G    0.284957
STL_G    0.421654
dtype: float64

***************************************
['PTS_G', 'BLK_G']
*** MAPE ***
PTS_G    0.285237
BLK_G    1.371677
dtype: float64

***************************************
['PTS_G', '3P_G']
*** MAPE ***
PTS_G    0.285382
3P_G     2.160642
dtype: float64

***************************************
['PTS_G', 'FG%']
*** MAPE ***
PTS_G    0.285094
FG%      0.337157
dtype: float64

**********

*** MAPE ***
3P_G    0.620813
3P_G    0.620813
dtype: float64

***************************************
['3P_G', 'FG%']
*** MAPE ***
3P_G    0.609159
FG%     0.063301
dtype: float64

***************************************
['3P_G', 'FT%']
*** MAPE ***
3P_G    0.519625
FT%     0.061411
dtype: float64

***************************************
['3P_G', 'PER_w']
*** MAPE ***
3P_G     2.444566
PER_w    0.177107
dtype: float64

***************************************
['FG%', 'PTS_G']
*** MAPE ***
FG%      0.337157
PTS_G    0.285094
dtype: float64

***************************************
['FG%', 'AST_G']
*** MAPE ***
FG%      0.540236
AST_G    0.300862
dtype: float64

***************************************
['FG%', 'TOV_G']
*** MAPE ***
FG%      0.347302
TOV_G    0.280622
dtype: float64

***************************************
['FG%', 'TRB_G']
*** MAPE ***
FG%      0.362217
TRB_G    0.230544
dtype: float64

***************************************
['FG%', 'STL_G']
*** MAPE ***
FG%      0.062644


In [290]:
diff = (result.T - mape.mean(axis=1).values).T
diff[diff<0].astype(float).round(3)

Unnamed: 0,PTS_G,AST_G,TOV_G,TRB_G,STL_G,BLK_G,3P_G,FG%,FT%,PER_w
PTS_G,,-0.002,,,-0.0,,,,,
AST_G,,,,,,,-0.003,,-0.005,
TOV_G,,-0.017,,,,,-0.018,,-0.012,
TRB_G,,,,-0.0,,,,,,
STL_G,,-0.003,,,-0.0,-0.009,,,-0.001,
BLK_G,,,,,-0.006,,,-0.003,-0.004,
3P_G,,,,,,-0.018,-0.0,-0.012,-0.101,
FG%,,,,,,,,,,
FT%,,,,,,-0.001,-0.0,,,
PER_w,,,,,,-0.0,,-0.0,,


# Weighting

In [11]:
# metrics_list = [['PTS_G'],
#  ['AST_G'],
#  ['TOV_G'],
#  ['TRB_G'],
#  ['STL_G'],
#  ['BLK_G'],
#  ['3P_G'],
#  ['FG%'],
#  ['FT%','3P_G'],
#  ['PER_w']]

metrics_list = [[x] for x in allMetrics]

In [12]:
"""
experiment setup
"""
print("variance")
# overall setup
donorSetup= ["variance","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

variance
(9, 135)
*** MAPE ***
PTS_G    0.281461
AST_G    0.293750
TOV_G    0.275789
TRB_G    0.218020
STL_G    0.272111
BLK_G    0.423376
3P_G     0.561544
FG%      0.063466
FT%      0.062672
dtype: float64
MAPE for all:  0.26623736398924336

*** RMSE ***
PTS_G    3.097333
AST_G    0.932871
TOV_G    0.417253
TRB_G    1.331592
STL_G    0.233779
BLK_G    0.192403
3P_G     0.451468
FG%      0.038099
FT%      0.059288
dtype: float64
RMSE for all:  0.7504539210532833

*** R2 ***
PTS_G    -87.382284
AST_G   -108.104417
TOV_G   -330.539819
TRB_G    -10.512328
STL_G    -37.874064
BLK_G    -14.294298
3P_G     -48.952823
FG%     -412.376556
FT%       -2.733959
dtype: float64
R2 for all:  -116.97450539355762

*** edited R2 ***
PTS_G    0.416495
AST_G    0.445314
TOV_G    0.423021
TRB_G    0.349780
STL_G    0.353290
BLK_G    0.344232
3P_G     0.328416
FG%      0.313495
FT%      0.062983
dtype: float64
R2 for all:  0.3364757449761273


In [17]:
"""
experiment setup
"""

print("variance")
# overall setup
donorSetup= ["variance","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        
        # ARMA
        data = mrsc.target_data.T.ewm(com=0.5).mean().T.values.flatten()
        data = data[:-1]
        ewm = data[-1]
#         if (np.sum(data != 0)==0):
#             pred_arima = 0
#         else:
#             model = ARMA(data, order=(1, 1))
#             model_fit = model.fit(disp=False)
#             pred_arma = model_fit.predict(len(data), len(data))
            
        pred = 0.5*pred + 0.5*ewm
        
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

variance
(9, 135)
*** MAPE ***
PTS_G    0.273533
AST_G    0.286505
TOV_G    0.269199
TRB_G    0.201424
STL_G    0.258114
BLK_G    0.389951
3P_G     0.475659
FG%      0.058403
FT%      0.058346
dtype: float64
MAPE for all:  0.24742384009865614

*** RMSE ***
PTS_G    2.890715
AST_G    0.892749
TOV_G    0.389108
TRB_G    1.127958
STL_G    0.215299
BLK_G    0.175917
3P_G     0.384976
FG%      0.034376
FT%      0.055875
dtype: float64
RMSE for all:  0.6852192845635654

*** R2 ***
PTS_G    -40.855683
AST_G    -55.793611
TOV_G    -76.973691
TRB_G     -8.000855
STL_G    -26.738370
BLK_G     -7.733521
3P_G     -58.873472
FG%     -253.022922
FT%     -763.750862
dtype: float64
R2 for all:  -143.52699854766627

*** edited R2 ***
PTS_G    0.445671
AST_G    0.451657
TOV_G    0.431861
TRB_G    0.420058
STL_G    0.392972
BLK_G    0.355854
3P_G     0.375517
FG%      0.307453
FT%      0.276282
dtype: float64
R2 for all:  0.3833542846671866


In [24]:
"""
experiment setup
"""
print("variance")
# overall setup
donorSetup= ["variance","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        
        try:
            # ARMA
            data = mrsc.target_data.T.ewm(com=0.5).mean().T.values.flatten()
            data = data[:-1]
            if (np.sum(data != 0)==0):
                pred_arima = 0
            else:
                model = ARMA(data, order=(1, 1))
                model_fit = model.fit(disp=False)
                pred_arma = model_fit.predict(len(data), len(data))

            pred = 0.5*pred + 0.5*pred_arma
        except:
            pass
        
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

variance


  return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs)/nobs
  invmacoefs = -np.log((1-macoefs)/(1+macoefs))
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  **kwargs)).imag/2./hess[i, j]
  **kwargs)).imag/2./hess[i, j]
  Z_mat, R_mat, T_mat)
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


  invarcoefs = 2*np.arctanh(params)
  newparams = np.tanh(params/2)
  tmp = np.tanh(params/2)
  Z_mat.astype(complex), R_mat, T_mat)
  Z_mat.astype(complex), R_mat, T_mat)


  invmacoefs = -np.log((1-macoefs)/(1+macoefs))
  **kwargs)).imag/2./hess[i, j]




(10, 135)
*** MAPE ***
PTS_G    0.270250
AST_G    0.302053
TOV_G    0.277848
TRB_G    0.212243
STL_G    0.269130
BLK_G    0.427215
3P_G     0.566596
FG%      0.061274
FT%      0.062858
PER_w    0.180152
dtype: float64
MAPE for all:  0.25715697633928564

*** RMSE ***
PTS_G    3.027305
AST_G    0.942371
TOV_G    0.409833
TRB_G    1.291124
STL_G    0.231220
BLK_G    0.184358
3P_G     0.762032
FG%      0.037117
FT%      0.060271
PER_w    3.083536
dtype: float64
RMSE for all:  1.0029166643624854

*** R2 ***
PTS_G    -85.024948
AST_G    -94.149912
TOV_G   -117.050360
TRB_G     -9.700171
STL_G    -20.229141
BLK_G    -13.013422
3P_G           -inf
FG%      -27.646854
FT%       -2.681711
PER_w    -63.584022
dtype: float64
R2 for all:  -inf

*** edited R2 ***
PTS_G    0.427767
AST_G    0.443109
TOV_G    0.421274
TRB_G    0.373757
STL_G    0.356683
BLK_G    0.348515
3P_G     0.310318
FG%      0.335834
FT%      0.139833
PER_w    0.323508
dtype: float64
R2 for all:  0.34787898029787556


In [44]:
all_R2.T['PTS_G'].sort_values()[:20]

Lance Stephenson 0    -8227.786894
Derrick Williams 0    -1872.882037
Timofey Mozgov 0       -886.454349
Amir Johnson 0         -231.173442
Evan Turner 0          -111.967003
Courtney Lee 0          -48.857270
Chandler Parsons 0      -32.060051
Markieff Morris 0       -27.489721
Marvin Williams 0       -15.076085
Kris Humphries 0        -12.613857
Chris Paul 0            -10.463894
Chris Bosh 0             -8.134807
Pau Gasol 0              -6.363354
LeBron James 0           -3.911438
Omer Asik 0              -3.804535
Marco Belinelli 0        -3.581063
Gary Neal 0              -3.044504
LaMarcus Aldridge 0      -2.962344
Darren Collison 0        -2.693599
Bismack Biyombo 0        -2.597664
Name: PTS_G, dtype: float64

In [56]:
(all_R2 <0).sum(axis=1)

PTS_G    51
AST_G    51
TOV_G    50
TRB_G    64
STL_G    54
BLK_G    60
3P_G     70
FG%      70
FT%      69
PER_w    67
dtype: int64

In [59]:
50/135

0.37037037037037035

In [38]:
51/135

0.37777777777777777

In [325]:
"""
experiment setup
"""
print("normalize")
# overall setup
donorSetup= ["normalize","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

normalize
(10, 134)
*** MAPE ***
PTS_G     0.473115
AST_G     0.382205
TOV_G     0.398554
TRB_G     0.326884
STL_G     0.747976
BLK_G     0.641573
3P_G      1.034586
FG%       0.199636
FT%      11.435996
PER_w     0.539404
dtype: float64
MAPE for all:  1.6179340352100255

*** RMSE ***
PTS_G     4.930995
AST_G     1.202499
TOV_G     0.591635
TRB_G     1.817813
STL_G     0.637192
BLK_G     0.351938
3P_G      0.742832
FG%       0.133928
FT%      24.627570
PER_w    10.199659
dtype: float64
RMSE for all:  4.523606149794006

*** R2 ***
PTS_G   -229.886493
AST_G      0.977291
TOV_G      0.995292
TRB_G    -15.726024
STL_G      0.994958
BLK_G      0.997905
3P_G       0.992850
FG%        0.999770
FT%       -6.410267
PER_w    -97.628441
dtype: float64
R2 for all:  -34.36931588960094

*** edited R2 ***
PTS_G    0.302402
AST_G    0.977291
TOV_G    0.995292
TRB_G    0.834267
STL_G    0.994958
BLK_G    0.997905
3P_G     0.992850
FG%      0.999770
FT%      0.804334
PER_w    0.282870
dtype: float64
R2 