In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    """Turn the raw season stats into one pivoted table per metric.

    Three groups of columns are passed to three helpers (per-game,
    percentage and weighted metrics); their resulting dictionaries are merged
    in that order and handed to ``getPivotedTableDict``.

    Parameters
    ----------
    stats : season statistics table, forwarded unchanged to the helpers.

    Returns
    -------
    (allPivotedTableDict, allMetrics)
        The result of ``getPivotedTableDict`` on the merged dictionary, and
        the list of keys of the merged dictionary.
        NOTE(review): the keys presumably look like "PTS_G", "FG%", "PER_w"
        (the names used by the experiment cells) -- confirm in importData.
    """
    perGameCols = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    perCentCols = ["FG","FT"]
    weightedCols = ["PER"]

    # Merge order matters only if two helpers emit the same key:
    # later updates win, exactly like {**a, **b, **c}.
    allMetricsDict = {}
    allMetricsDict.update(getMetricsPerGameDict(stats, perGameCols))
    allMetricsDict.update(getMetricsPerCentDict(stats, perCentCols))
    allMetricsDict.update(getMetricsWeightedDict(stats, weightedCols))

    return getPivotedTableDict(allMetricsDict), list(allMetricsDict)

def getActivePlayers(stats, year, buffer):
    """Names of players with a row in `year` and in each of the `buffer` preceding years.

    Parameters
    ----------
    stats : DataFrame with at least the columns ``Player`` and ``Year``.
    year : the most recent season a player must appear in.
    buffer : number of consecutive earlier seasons (year-1 ... year-buffer)
        the player must also appear in. ``0`` (or a negative value) means
        only `year` itself is checked.

    Returns
    -------
    list
        Player names in order of first appearance among the rows of `year`.
        The order is deterministic (the previous set-based intersection
        returned the same names in an arbitrary, run-dependent order).
    """
    active = list(stats.loc[stats.Year == year, "Player"].unique())
    for offset in range(1, buffer + 1):
        earlier = set(stats.loc[stats.Year == (year - offset), "Player"].unique())
        # filter instead of set-intersect so the original ordering survives
        active = [name for name in active if name in earlier]
    return active

def topPlayers(stats, year, metric, n):
    """Return the `n` players with the highest per-player mean of `metric` in `year`.

    Parameters
    ----------
    stats : DataFrame with the columns ``Player``, ``Year``, ``player_id``
        and `metric`; ``player_id`` and `metric` must be numeric.
    year : season to rank.
    metric : name of the column to rank by (descending).
    n : maximum number of players to return.

    Returns
    -------
    DataFrame with the columns ``Player`` and ``player_id`` (at most `n` rows,
    best first). ``player_id`` is the per-player mean of that column.
    """
    season = stats[stats.Year == year]
    # numeric_only=True: recent pandas raises TypeError when asked to average
    # text columns instead of silently dropping them.
    perPlayer = season.groupby('Player').mean(numeric_only=True).reset_index()
    # No second Year filter is needed: every row already belongs to `year`.
    ranked = perPlayer.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player","player_id"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval, playerName=None):
    """Baseline forecast: predict each metric as the mean of the target's history.

    Parameters
    ----------
    target : object whose ``concat(metrics_to_use)`` returns
        ``(target_data, nanIndex)``.
    metrics_to_use : list of metric names; used as the row labels of both
        returned frames.
    pred_interval : number of trailing time steps (per metric) held out from
        the history.
    playerName : optional column label for the prediction frame. When omitted
        the columns of the ground-truth frame are reused, so `pred` and `true`
        always align. (This function used to read a global ``playerName``;
        it no longer depends on notebook state.)

    Returns
    -------
    (true, pred) : two DataFrames indexed by `metrics_to_use`.
    """
    target_data, nanIndex = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    # NOTE(review): only one step after interv_index is used as ground truth,
    # regardless of pred_interval -- confirm this is intended for
    # pred_interval > 1.
    total_index = int(interv_index + 1)

    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use

    # predictions: per metric, average the pre-intervention block of columns
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = [
        history.iloc[:, k*interv_index:(k+1)*interv_index].mean(axis=1).to_list()
        for k in range(num_k)
    ]

    columns = true.columns if playerName is None else [playerName]
    pred = pd.DataFrame(pred, index=metrics_to_use, columns=columns)
    return true, pred

In [2]:
"""
import data
"""
pred_year = 2015 # the year that we are living in
# number of seasons after pred_year included in the target data
# (with 1, the target tables run up to pred_year+1)
pred_interval = 1

print("*** importing data ***")
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
# players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

print("*** preparing data ***")

########### Donor ##########
# donors only see seasons up to (and including) pred_year
stats_donor = stats[stats.Year <= pred_year]
allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# targets additionally keep the seasons that are to be predicted
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

# just to debug: calendar year for every (player, year_count) pair
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

# ---- targets ----
# Players with a row in pred_year+1 and in each of the 4 preceding seasons.
# NOTE(review): the reason for excluding these two players is not recorded
# in the notebook -- please document it.
EXCLUDED_PLAYERS = {"Kevin Garnett", "Kobe Bryant"}
# Filtering (rather than list.remove) does not raise if a name is absent.
playerNames = sorted(
    name
    for name in getActivePlayers(stats, pred_year+1, buffer=4)
    if name not in EXCLUDED_PLAYERS
)

*** importing data ***
*** preparing data ***


In [3]:
metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G","TRB_G","STL_G","BLK_G"]

print("Algo: outputs the mean of the player's history")
print("-----")

# Collect the per-player frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies everything each time.
pred_frames = []
true_frames = []
# Keep the loop variable name `playerName`: other code in this notebook reads
# it from the global namespace.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)

    pred_frames.append(pred)
    true_frames.append(true)

pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
# Entries whose true value is 0 become NaN in the denominator and are
# therefore skipped by mean().
# NOTE(review): the denominator is the signed true value, so a negative true
# value yields a negative term -- confirm whether np.abs(true_all) was meant.
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.575711
AST_G    0.506711
TOV_G    0.596790
PER_w    0.287966
FG%      0.076904
FT%      0.081673
3P_G     0.682272
TRB_G    0.422301
STL_G    0.446710
BLK_G    0.685350
dtype: float64
MAPE for all:  0.4420163650526481

*** RMSE ***
PTS_G    4.691799
AST_G    1.136067
TOV_G    0.617640
PER_w    3.788750
FG%      0.052942
FT%      0.110670
3P_G     0.503330
TRB_G    1.709996
STL_G    0.299601
BLK_G    0.282306
dtype: float64
RMSE for all:  1.3193101146400346


In [19]:
"""
experiment setup
"""
# overall setup
# donorSetup = [weighting, mat_form_method, skipNan (Boolean)]
#   earlier notes list None / "normalize" for weighting and "fixed" for
#   mat_form_method; this cell uses "variance" and "sliding".
donorSetup= ["variance","sliding", True]
# denoiseSetup = [denoise_method, denoise_mat_method]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

# offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G"]
# defMetrics = ["TRB_G","STL_G","BLK_G"]

# one group containing every metric -> a single fit per player
metrics_list = [allMetrics]

# ---- experiment ----
print("10 metrics at once")

# Collect per-player frames and concatenate once after the loop instead of
# re-copying a growing DataFrame on every iteration.
pred_frames = []
true_frames = []
# Keep the loop variable name `playerName`: other code in this notebook reads
# it from the global namespace.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred_parts = []
    player_true_parts = []
    for metrics in metrics_list:
        mrsc.fit_threshold(metrics, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        # label both frames with the player so they align when compared
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred_parts.append(pred)
        player_true_parts.append(true)

    player_pred = pd.concat(player_pred_parts, axis=0) if player_pred_parts else pd.DataFrame()
    player_true = pd.concat(player_true_parts, axis=0) if player_true_parts else pd.DataFrame()
    pred_frames.append(player_pred)
    true_frames.append(player_true)

all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
print(all_pred.shape)
# Entries whose true value is 0 become NaN in the denominator and are
# therefore skipped by mean().
# NOTE(review): the denominator is the signed true value, so a negative true
# value yields a negative term -- confirm whether np.abs(all_true) was meant.
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

10 metrics at once
(10, 228)
*** MAPE ***
PTS_G    0.399555
AST_G    0.564215
TOV_G    0.395530
TRB_G    0.370901
STL_G    0.415048
BLK_G    0.751559
3P_G     1.149345
FG%      0.092531
FT%      0.103185
PER_w    0.260219
dtype: float64
MAPE for all:  0.44184728421625746

*** RMSE ***
PTS_G    4.482597
AST_G    1.206892
TOV_G    0.574428
TRB_G    1.714422
STL_G    0.352951
BLK_G    0.294476
3P_G     0.604472
FG%      0.062385
FT%      0.120527
PER_w    4.329581
dtype: float64
RMSE for all:  1.3742730966944614


In [None]:
"""
experiment setup
"""
# overall setup
# donorSetup = [weighting, mat_form_method, skipNan (Boolean)]
#   earlier notes list None / "normalize" for weighting and "fixed" for
#   mat_form_method; this cell uses "variance_batch" and "sliding".
donorSetup= ["variance_batch","sliding", True]
# denoiseSetup = [denoise_method, denoise_mat_method]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

# offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G"]
# defMetrics = ["TRB_G","STL_G","BLK_G"]

# one group containing every metric -> a single fit per player
metrics_list = [allMetrics]

# ---- experiment ----
print("10 metrics at once")

# Collect per-player frames and concatenate once after the loop instead of
# re-copying a growing DataFrame on every iteration.
pred_frames = []
true_frames = []
# Keep the loop variable name `playerName`: other code in this notebook reads
# it from the global namespace.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred_parts = []
    player_true_parts = []
    for metrics in metrics_list:
        mrsc.fit_threshold(metrics, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        # label both frames with the player so they align when compared
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred_parts.append(pred)
        player_true_parts.append(true)

    player_pred = pd.concat(player_pred_parts, axis=0) if player_pred_parts else pd.DataFrame()
    player_true = pd.concat(player_true_parts, axis=0) if player_true_parts else pd.DataFrame()
    pred_frames.append(player_pred)
    true_frames.append(player_true)

all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
print(all_pred.shape)
# Entries whose true value is 0 become NaN in the denominator and are
# therefore skipped by mean().
# NOTE(review): the denominator is the signed true value, so a negative true
# value yields a negative term -- confirm whether np.abs(all_true) was meant.
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

10 metrics at once


In [18]:
"""
experiment setup
"""
# overall setup
# donorSetup = [weighting, mat_form_method, skipNan (Boolean)];
# no weighting (None) in this run
donorSetup= [None,"sliding", True]
# denoiseSetup = [denoise_method, denoise_mat_method]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

# one single-element group per metric -> one fit per (player, metric)
metrics_list = [[x] for x in allMetrics]

# ---- experiment ----
print("1 metric at once")

# Collect per-player frames and concatenate once after the loop instead of
# re-copying a growing DataFrame on every iteration.
pred_frames = []
true_frames = []
# Keep the loop variable name `playerName`: other code in this notebook reads
# it from the global namespace.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred_parts = []
    player_true_parts = []
    for metrics in metrics_list:
        mrsc.fit_threshold(metrics, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        # label both frames with the player so they align when compared
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred_parts.append(pred)
        player_true_parts.append(true)

    # stack the per-metric results of this player row-wise
    player_pred = pd.concat(player_pred_parts, axis=0) if player_pred_parts else pd.DataFrame()
    player_true = pd.concat(player_true_parts, axis=0) if player_true_parts else pd.DataFrame()
    pred_frames.append(player_pred)
    true_frames.append(player_true)

all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
print(all_pred.shape)
# Entries whose true value is 0 become NaN in the denominator and are
# therefore skipped by mean().
# NOTE(review): the denominator is the signed true value, so a negative true
# value yields a negative term -- confirm whether np.abs(all_true) was meant.
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once
(10, 228)
*** MAPE ***
PTS_G    0.330647
AST_G    0.372639
TOV_G    0.339674
TRB_G    0.313756
STL_G    0.320293
BLK_G    0.523257
3P_G     0.642541
FG%      0.083542
FT%      0.086549
PER_w    0.241234
dtype: float64
MAPE for all:  0.32538716712421906

*** RMSE ***
PTS_G    3.212774
AST_G    0.948374
TOV_G    0.429059
TRB_G    1.526153
STL_G    0.261448
BLK_G    0.202434
3P_G     0.495202
FG%      0.055732
FT%      0.111807
PER_w    3.655378
dtype: float64
RMSE for all:  1.0898361617314343


In [11]:
"""
experiment setup
"""
# overall setup
# donorSetup = [weighting, mat_form_method, skipNan (Boolean)]
donorSetup= ["variance","sliding", True]
# denoiseSetup = [denoise_method, denoise_mat_method]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

# one single-element group per metric -> one fit per (player, metric)
metrics_list = [[x] for x in allMetrics]

# ---- experiment ----
print("1 metric at once")

# Collect per-player frames and concatenate once after the loop instead of
# re-copying a growing DataFrame on every iteration.
pred_frames = []
true_frames = []
# Keep the loop variable name `playerName`: other code in this notebook reads
# it from the global namespace.
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred_parts = []
    player_true_parts = []
    for metrics in metrics_list:
        mrsc.fit_threshold(metrics, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        # label both frames with the player so they align when compared
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred_parts.append(pred)
        player_true_parts.append(true)

    # stack the per-metric results of this player row-wise
    player_pred = pd.concat(player_pred_parts, axis=0) if player_pred_parts else pd.DataFrame()
    player_true = pd.concat(player_true_parts, axis=0) if player_true_parts else pd.DataFrame()
    pred_frames.append(player_pred)
    true_frames.append(player_true)

all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
print(all_pred.shape)
# Entries whose true value is 0 become NaN in the denominator and are
# therefore skipped by mean().
# NOTE(review): the denominator is the signed true value, so a negative true
# value yields a negative term -- confirm whether np.abs(all_true) was meant.
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

In [14]:
# Debug inspection: target data as held by the fitted model.
# NOTE(review): `mrsc_test` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- define it or drop the cell.
mrsc_test.model.target_data

Unnamed: 0_level_0,0,1,2,3,4,5
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Trevor Booker,5.323077,8.38,5.333333,6.819444,7.177215,5.860759


In [15]:
# Debug inspection: target data stored on the mRSC object itself.
# NOTE(review): `mrsc_test` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- define it or drop the cell.
mrsc_test.target_data

Unnamed: 0_level_0,0,1,2,3,4,5
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Trevor Booker,5.323077,8.38,5.333333,6.819444,7.177215,5.860759


In [16]:
# Debug inspection: donor data as held by the fitted model.
# NOTE(review): `mrsc_test` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- define it or drop the cell.
mrsc_test.model.donor_data

Unnamed: 0_level_0,0,1,2,3,4,5
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A.C. Green,7.984443,10.336889,11.321642,11.746766,11.796647,11.286086
A.J. Price,5.556351,6.591934,6.311457,5.182206,4.429257,3.505133
Aaron Gray,3.199295,3.907187,3.924901,3.539133,3.251753,2.825204
Aaron McKie,6.301746,7.610706,7.508483,6.546298,5.868408,4.947463
Adam Keefe,4.515517,5.739922,6.126646,6.115932,6.005336,5.616351
...,...,...,...,...,...,...
Winston Garland,10.561710,12.459451,11.812737,9.498627,7.974766,6.151263
Xavier McDaniel,17.641811,21.840752,22.412789,20.985528,19.787711,17.714993
Zach Randolph,6.120044,9.889431,13.801354,18.785882,21.399006,22.867174
Zaza Pachulia,6.050453,7.691502,8.210388,8.197072,8.049455,7.528661


In [17]:
# Debug inspection: donor data stored on the mRSC object itself.
# NOTE(review): `mrsc_test` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel -- define it or drop the cell.
mrsc_test.donor_data

Unnamed: 0_level_0,0,1,2,3,4,5
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A.C. Green,6.353659,10.784810,11.426829,13.268293,12.939024,9.146341
A.J. Price,7.321429,6.460000,3.909091,7.736842,1.571429,5.115385
Aaron Gray,4.295082,3.500000,3.250000,3.146341,3.897959,2.833333
Aaron McKie,6.511111,10.666667,5.216867,4.098765,4.800000,7.963415
Adam Keefe,6.609756,4.333333,6.146667,6.085366,3.790323,7.750000
...,...,...,...,...,...,...
Winston Garland,12.402985,14.493671,7.265823,8.173913,10.846154,5.924242
Xavier McDaniel,17.121951,23.048780,21.397436,20.451220,21.318841,16.950617
Zach Randolph,2.780488,8.441558,20.074074,18.934783,18.013514,23.647059
Zaza Pachulia,3.288136,6.189189,11.679487,12.152778,5.193548,6.246753


In [16]:
# Scratch cell: repeats the prediction half of getBenchmark inline for a
# single player, over all metrics, with the hold-out length hard-coded to 1.
# NOTE(review): `playerName` is not assigned here; the cell uses whatever
# value an earlier loop left in the kernel, so the result depends on
# execution order.
pred_all = pd.DataFrame()
# NOTE(review): true_all is reset here but not filled in this cell.
true_all = pd.DataFrame()

metrics_to_use = allMetrics
target = Target(playerName, allPivotedTableDict)
target_data, nanIndex = target.concat(metrics_to_use)
num_k = len(metrics_to_use)
# number of time steps per metric, minus the single held-out step
interv_index = int(target_data.shape[1]/num_k -1)
total_index = int(interv_index + 1)

# predictions
history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
pred = []
for i in range(num_k):
    # mean over the i-th block of interv_index columns
    pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
pred_all = pd.concat([pred_all, pred], axis=1)

In [19]:
# Display the current contents of pred_all (rich DataFrame output).
pred_all

Unnamed: 0,James Johnson
PTS_G,6.768181
AST_G,1.602357
TOV_G,1.320692
TRB_G,3.327445
STL_G,0.779173
BLK_G,1.002024
3P_G,0.227501
FG%,0.470784
FT%,0.698869
PER_w,13.941228
