In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG","FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())
    return allPivotedTableDict, allMetrics

def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def getTopPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval):    
    target_data, nanIndex = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    total_index = int(interv_index + 1)
    
    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use
    
    # predictions
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = []
    for i in range(num_k):
        pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

    pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
    return true, pred

def getR2(true, pred, bench):
    ss_res = pd.DataFrame((true.values - pred.values)**2, index=true.index).sum(axis=1)
    ss_tot = pd.DataFrame((true.values - bench.values)**2, index=true.index).sum(axis=1)
    return (1-ss_res/ss_tot).to_frame(name = pred.columns.values[0])

In [2]:
"""
import data
"""
pred_year = 2015 # the year that we are living in
pred_interval = 1 # we are making predictions for pred_year+1 and +2
min_games = 1 # keeps ~72.3%(10096/13963) of the full data

print("*** importing data ***")
### 1. Players
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980

### 2. Stats
stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
# remove multiple rows for the same [Year, Player]
totals = stats[stats.Tm == "TOT"]
duplicates_removed = stats.drop_duplicates(subset=["Year","Player"], keep=False)
stats = pd.concat([duplicates_removed, totals], axis=0).sort_values("Unnamed: 0")

# fix the name* issue
stats = stats.replace('\*','',regex=True)
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# minimum number of games
stats = stats[stats.G >= min_games]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

print("*** preparing data ***")

########### Donor ##########
# filter stats by the year
stats_donor = stats[stats.Year <= pred_year]
allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# filter stats by the year
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

# just to debug
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

*** importing data ***
*** preparing data ***


In [268]:
"""
targets
"""
activePlayers = getActivePlayers(stats, pred_year+1, buffer=4)
topPlayers = getTopPlayers(stats, pred_year, 'PTS', 300)
topPlayers = topPlayers.values.flatten().tolist()

print("non-rookie active players in 2016: ", len(activePlayers))

# first group
print("***** First Group *****")
target_names_1 = list(set(activePlayers) & set(topPlayers))
max_total_index = 18
print("* not sufficient donor pool:")
for playerName in target_names_1:
    target = Target(playerName, allPivotedTableDict)
    if (target.total_index > max_total_index):
        target_names_1.remove(playerName)
        print(playerName)
print("* total number of target players: ", len(target_names_1))

# second group
print()
print("***** Second Group *****")
target_names_2 = activePlayers
max_total_index = 18
print("* not sufficient donor pool:")
for playerName in target_names_2:
    target = Target(playerName, allPivotedTableDict)
    if (target.total_index > max_total_index):
        target_names_2.remove(playerName)
        print(playerName)
print("* total number of target players: ", len(target_names_2))

##############
# final
targetNames = target_names_2

non-rookie active players in 2016:  136
***** First Group *****
* not sufficient donor pool:
Tim Duncan
* total number of target players:  134

***** Second Group *****
* not sufficient donor pool:
Tim Duncan
* total number of target players:  135


In [3]:
for year in range(2008,2018,1):
    print(year)
    print("mean   : ",stats[stats.Year == year].G.mean())
    print("median : ",stats[stats.Year == year].G.median())
    print()

2008
mean   :  55.2219730941704
median :  65.0

2009
mean   :  55.55125284738041
median :  63.0

2010
mean   :  56.26146788990825
median :  65.0

2011
mean   :  55.77752808988764
median :  62.0

2012
mean   :  43.54797441364605
median :  49.0

2013
mean   :  55.108932461873636
median :  61.0

2014
mean   :  53.80128205128205
median :  62.0

2015
mean   :  53.16804979253112
median :  61.0

2016
mean   :  54.850107066381156
median :  62.0

2017
mean   :  54.00421052631579
median :  63.0



In [265]:
metrics_to_use = allMetrics

print("Algo: outputs the mean of the player's history")
print("-----")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)
    
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

# rmse = utils.rmse_2d(true_all, pred_all)
# print()
# print("*** RMSE ***")
# print(rmse)
# print("RMSE for all: ", rmse.mean())    

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.411294
AST_G    0.386515
TOV_G    0.399897
TRB_G    0.274153
STL_G    0.319638
BLK_G    0.447698
3P_G     0.535251
FG%      0.062816
FT%      0.061233
PER_w    0.201826
dtype: float64
MAPE for all:  0.30505741758293986


# Fixed vs. Sliding

In [295]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"fixed", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

1 metric at once
(10, 134)
*** MAPE ***
PTS_G    0.277280
AST_G    0.303132
TOV_G    0.281037
TRB_G    0.232480
STL_G    0.268172
BLK_G    0.371174
3P_G     0.897202
FG%      0.060781
FT%      0.061157
PER_w    0.174422
dtype: float64
MAPE for all:  0.2810383781032388

*** RMSE ***
PTS_G    3.078215
AST_G    0.939275
TOV_G    0.421465
TRB_G    1.394864
STL_G    0.235441
BLK_G    0.195664
3P_G     0.874437
FG%      0.035825
FT%      0.058761
PER_w    2.891542
dtype: float64
RMSE for all:  1.012548902251213

*** R2 ***
PTS_G   -68.137314
AST_G     0.989566
TOV_G     0.997875
TRB_G    -4.560719
STL_G     0.999339
BLK_G     0.999462
3P_G      0.990676
FG%       0.999979
FT%       0.999935
PER_w   -14.244623
dtype: float64
R2 for all:  -7.996582330214866

*** edited R2 ***
PTS_G    0.423362
AST_G    0.989566
TOV_G    0.997875
TRB_G    0.856446
STL_G    0.999339
BLK_G    0.999462
3P_G     0.990676
FG%      0.999979
FT%      0.999935
PER_w    0.603814
dtype: float64
R2 for all:  0.88604549089

In [294]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

1 metric at once
(10, 134)
*** MAPE ***
PTS_G    0.285066
AST_G    0.296102
TOV_G    0.277888
TRB_G    0.221688
STL_G    0.271170
BLK_G    0.378055
3P_G     0.620813
FG%      0.062568
FT%      0.061464
PER_w    0.176787
dtype: float64
MAPE for all:  0.2583693415639397

*** RMSE ***
PTS_G    3.125182
AST_G    0.935674
TOV_G    0.420983
TRB_G    1.367314
STL_G    0.240406
BLK_G    0.204238
3P_G     0.490811
FG%      0.037340
FT%      0.058443
PER_w    3.152154
dtype: float64
RMSE for all:  1.003254594593438

*** R2 ***
PTS_G   -88.662326
AST_G     0.988888
TOV_G     0.997892
TRB_G    -1.950227
STL_G     0.999319
BLK_G     0.999350
3P_G      0.997055
FG%       0.999977
FT%       0.999936
PER_w   -10.868037
dtype: float64
R2 for all:  -9.449817350209434

*** edited R2 ***
PTS_G    0.411329
AST_G    0.988888
TOV_G    0.997892
TRB_G    0.836763
STL_G    0.999319
BLK_G    0.999350
3P_G     0.997055
FG%      0.999977
FT%      0.999936
PER_w    0.569363
dtype: float64
R2 for all:  0.87998723292

# Grouping

In [269]:
# check if grouping with n=2 would work
"""
experiment setup
"""
# overall setup
donorSetup= [None,"Sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
result = pd.DataFrame(index=allMetrics, columns=allMetrics)
for m1 in range(len(allMetrics)):
    for m2 in range(len(allMetrics)):
        metric1 = allMetrics[m1]
        metric2 = allMetrics[m2]
        metrics = [metric1, metric2]
        metrics_list = [metrics]
        print()
        print("***************************************")
        print(metrics)

        all_pred = pd.DataFrame()
        all_true = pd.DataFrame()
        all_bench = pd.DataFrame()
        all_R2 = pd.DataFrame()
        for playerName in playerNames:
            target = Target(playerName, allPivotedTableDict)

            # benchmark
            true, benchmark = getBenchmark(target, metrics, pred_interval)

            # prediction
            mrsc = mRSC(donor, target, pred_interval, probObservation=1)
            player_pred = pd.DataFrame()
            player_true = pd.DataFrame()
            for i in range(len(metrics_list)):
                mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
                pred = mrsc.predict()
                true = mrsc.getTrue()
                pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
                true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
                player_pred = pd.concat([player_pred, pred], axis=0)
                player_true = pd.concat([player_true, true], axis=0)
            all_pred = pd.concat([all_pred, player_pred], axis=1)
            all_true = pd.concat([all_true, player_true], axis=1)
            all_bench = pd.concat([all_bench, benchmark], axis=1)

            R2 = getR2(player_true, player_pred, benchmark)
            all_R2 = pd.concat([all_R2, R2], axis=1)

        ##################
#         print(all_pred.shape)
        mask = (all_true !=0 )
        mape = np.abs(all_pred - all_true) / all_true[mask]
        print("*** MAPE ***")
        print(mape.mean(axis=1))
        result.iloc[m1,m2] = mape.mean(axis=1).values[0]
print(result)
diff = (result.T - mape.mean(axis=1).values).T


***************************************
['PTS_G', 'PTS_G']
*** MAPE ***
PTS_G    0.285066
PTS_G    0.285066
dtype: float64

***************************************
['PTS_G', 'AST_G']
*** MAPE ***
PTS_G    0.283337
AST_G    0.409922
dtype: float64

***************************************
['PTS_G', 'TOV_G']
*** MAPE ***
PTS_G    0.285706
TOV_G    0.393103
dtype: float64

***************************************
['PTS_G', 'TRB_G']
*** MAPE ***
PTS_G    0.300432
TRB_G    0.244106
dtype: float64

***************************************
['PTS_G', 'STL_G']
*** MAPE ***
PTS_G    0.284957
STL_G    0.421654
dtype: float64

***************************************
['PTS_G', 'BLK_G']
*** MAPE ***
PTS_G    0.285237
BLK_G    1.371677
dtype: float64

***************************************
['PTS_G', '3P_G']
*** MAPE ***
PTS_G    0.285382
3P_G     2.160642
dtype: float64

***************************************
['PTS_G', 'FG%']
*** MAPE ***
PTS_G    0.285094
FG%      0.337157
dtype: float64

**********

*** MAPE ***
3P_G    0.620813
3P_G    0.620813
dtype: float64

***************************************
['3P_G', 'FG%']
*** MAPE ***
3P_G    0.609159
FG%     0.063301
dtype: float64

***************************************
['3P_G', 'FT%']
*** MAPE ***
3P_G    0.519625
FT%     0.061411
dtype: float64

***************************************
['3P_G', 'PER_w']
*** MAPE ***
3P_G     2.444566
PER_w    0.177107
dtype: float64

***************************************
['FG%', 'PTS_G']
*** MAPE ***
FG%      0.337157
PTS_G    0.285094
dtype: float64

***************************************
['FG%', 'AST_G']
*** MAPE ***
FG%      0.540236
AST_G    0.300862
dtype: float64

***************************************
['FG%', 'TOV_G']
*** MAPE ***
FG%      0.347302
TOV_G    0.280622
dtype: float64

***************************************
['FG%', 'TRB_G']
*** MAPE ***
FG%      0.362217
TRB_G    0.230544
dtype: float64

***************************************
['FG%', 'STL_G']
*** MAPE ***
FG%      0.062644


In [290]:
diff = (result.T - mape.mean(axis=1).values).T
diff[diff<0].astype(float).round(3)

Unnamed: 0,PTS_G,AST_G,TOV_G,TRB_G,STL_G,BLK_G,3P_G,FG%,FT%,PER_w
PTS_G,,-0.002,,,-0.0,,,,,
AST_G,,,,,,,-0.003,,-0.005,
TOV_G,,-0.017,,,,,-0.018,,-0.012,
TRB_G,,,,-0.0,,,,,,
STL_G,,-0.003,,,-0.0,-0.009,,,-0.001,
BLK_G,,,,,-0.006,,,-0.003,-0.004,
3P_G,,,,,,-0.018,-0.0,-0.012,-0.101,
FG%,,,,,,,,,,
FT%,,,,,,-0.001,-0.0,,,
PER_w,,,,,,-0.0,,-0.0,,


# Weighting

In [None]:
# metrics_list = [['PTS_G'],
#  ['AST_G'],
#  ['TOV_G'],
#  ['TRB_G'],
#  ['STL_G'],
#  ['BLK_G'],
#  ['3P_G'],
#  ['FG%'],
#  ['FT%','3P_G'],
#  ['PER_w']]

metrics_list = [[x] for x in allMetrics]

In [324]:
"""
experiment setup
"""
print("variance")
# overall setup
donorSetup= ["variance","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

variance
(10, 134)
*** MAPE ***
PTS_G    0.283308
AST_G    0.293132
TOV_G    0.276494
TRB_G    0.218923
STL_G    0.272439
BLK_G    0.425095
3P_G     0.558330
FG%      0.062566
FT%      0.062136
PER_w    0.178784
dtype: float64
MAPE for all:  0.2574142131362448

*** RMSE ***
PTS_G    3.108842
AST_G    0.935796
TOV_G    0.418640
TRB_G    1.336256
STL_G    0.234032
BLK_G    0.193097
3P_G     0.452773
FG%      0.037343
FT%      0.059003
PER_w    3.196666
dtype: float64
RMSE for all:  0.9972447301011511

*** R2 ***
PTS_G   -88.041827
AST_G     0.988982
TOV_G     0.997898
TRB_G    -1.653613
STL_G     0.999367
BLK_G     0.999376
3P_G      0.997531
FG%       0.999976
FT%       0.999935
PER_w   -11.083148
dtype: float64
R2 for all:  -9.37955221414564

*** edited R2 ***
PTS_G    0.412167
AST_G    0.988982
TOV_G    0.997898
TRB_G    0.840461
STL_G    0.999367
BLK_G    0.999376
3P_G     0.997531
FG%      0.999976
FT%      0.999935
PER_w    0.563459
dtype: float64
R2 for all:  0.8799152469536123


In [325]:
"""
experiment setup
"""
print("normalize")
# overall setup
donorSetup= ["normalize","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

"""
experiment
"""
all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    
    # benchmark
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

##################
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))

edited_R2 = copy.deepcopy(all_R2)
edited_R2[edited_R2 <0] = 0
print()
print("*** edited R2 ***")
print(edited_R2.mean(axis=1))
print("R2 for all: ", edited_R2.mean().mean())
##############################################################

normalize
(10, 134)
*** MAPE ***
PTS_G     0.473115
AST_G     0.382205
TOV_G     0.398554
TRB_G     0.326884
STL_G     0.747976
BLK_G     0.641573
3P_G      1.034586
FG%       0.199636
FT%      11.435996
PER_w     0.539404
dtype: float64
MAPE for all:  1.6179340352100255

*** RMSE ***
PTS_G     4.930995
AST_G     1.202499
TOV_G     0.591635
TRB_G     1.817813
STL_G     0.637192
BLK_G     0.351938
3P_G      0.742832
FG%       0.133928
FT%      24.627570
PER_w    10.199659
dtype: float64
RMSE for all:  4.523606149794006

*** R2 ***
PTS_G   -229.886493
AST_G      0.977291
TOV_G      0.995292
TRB_G    -15.726024
STL_G      0.994958
BLK_G      0.997905
3P_G       0.992850
FG%        0.999770
FT%       -6.410267
PER_w    -97.628441
dtype: float64
R2 for all:  -34.36931588960094

*** edited R2 ***
PTS_G    0.302402
AST_G    0.977291
TOV_G    0.995292
TRB_G    0.834267
STL_G    0.994958
BLK_G    0.997905
3P_G     0.992850
FG%      0.999770
FT%      0.804334
PER_w    0.282870
dtype: float64
R2 