In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG","FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())
    return allPivotedTableDict, allMetrics

def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def getTopPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval):    
    target_data, nanIndex = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    total_index = int(interv_index + 1)
    
    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use
    
    # predictions
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = []
    for i in range(num_k):
        pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

    pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
    return true, pred

def getR2(true, pred, bench):
    true_mean = true.mean(axis=1).to_frame()
    ss_res = pd.DataFrame((true.values - pred.values)**2, index=true.index).sum(axis=1)
    ss_tot = pd.DataFrame((true.values - bench.values)**2, index=true.index).sum(axis=1)
    return (1-ss_res/ss_tot).to_frame()

# def get_r_squared(predicted, actual):
#     ss_res = np.sum(np.square(actual - predicted))
#     ss_tot = np.sum(np.square(actual - np.mean(actual)))
#     return 1 - (ss_res / ss_tot)

In [8]:
"""
import data
"""
pred_year = 2015 # the year that we are living in
pred_interval = 1 # we are making predictions for pred_year+1 and +2
# min_games = 40

print("*** importing data ***")
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
# players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]
# stats = stats[stats.G >= min_games]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

print("*** preparing data ***")

########### Donor ##########
# filter stats by the year
stats_donor = stats[stats.Year <= pred_year]
allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# filter stats by the year
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

# just to debug
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

*** importing data ***
*** preparing data ***


In [9]:
"""
targets
"""
activePlayers = getActivePlayers(stats, pred_year+1, buffer=4)
activePlayers.sort()
activePlayers.remove("Kevin Garnett")
activePlayers.remove("Kobe Bryant")

# Benchmark

In [11]:
# metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G","TRB_G","STL_G","BLK_G"]
metrics_to_use = allMetrics

print("Algo: outputs the mean of the player's history")
print("-----")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)
    
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())    

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.575711
AST_G    0.506711
TOV_G    0.596790
TRB_G    0.422301
STL_G    0.446710
BLK_G    0.685350
3P_G     0.682272
FG%      0.076904
FT%      0.081673
PER_w    0.287966
dtype: float64
MAPE for all:  0.4420163650526481

*** RMSE ***
PTS_G    4.691799
AST_G    1.136067
TOV_G    0.617640
TRB_G    1.709996
STL_G    0.299601
BLK_G    0.282306
3P_G     0.503330
FG%      0.052942
FT%      0.110670
PER_w    3.788750
dtype: float64
RMSE for all:  1.3193101146400346


# Grouping - Baseline

In [12]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in activePlayers:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))
##############################################################

1 metric at once
(10, 228)
*** MAPE ***
PTS_G    0.330647
AST_G    0.372639
TOV_G    0.339674
TRB_G    0.313756
STL_G    0.320293
BLK_G    0.523257
3P_G     0.642541
FG%      0.083542
FT%      0.086549
PER_w    0.241234
dtype: float64
MAPE for all:  0.32538716712421906

*** RMSE ***
PTS_G    3.212774
AST_G    0.948374
TOV_G    0.429059
TRB_G    1.526153
STL_G    0.261448
BLK_G    0.202434
3P_G     0.495202
FG%      0.055732
FT%      0.111807
PER_w    3.655378
dtype: float64
RMSE for all:  1.0898361617314343

*** R2 ***
PTS_G   -3.060526e+01
AST_G   -1.394150e+05
TOV_G   -1.368096e+02
TRB_G   -2.987561e+01
STL_G   -6.843799e+01
BLK_G   -2.027625e+01
3P_G    -1.286444e+02
FG%     -1.232705e+08
FT%     -8.834818e+01
PER_w   -2.295992e+02
dtype: float64
R2 for all:  -12341062.69028894


In [14]:
"""
experiment setup
"""
# overall setup
donorSetup= ["variance_batch","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [allMetrics]

"""
experiment
"""
print("10 metrics at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
all_bench = pd.DataFrame()
all_R2 = pd.DataFrame()
for playerName in activePlayers:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))
    
    # benchmark
    true, benchmark = getBenchmark(target, allMetrics, pred_interval)
    
    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)
    all_bench = pd.concat([all_bench, benchmark], axis=1)
    
    R2 = getR2(player_true, player_pred, benchmark)
    all_R2 = pd.concat([all_R2, R2], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

print()
print("*** R2 ***")
print(all_R2.mean(axis=1))
print("R2 for all: ", all_R2.mean(axis=1).mean(axis=0))
##############################################################

10 metrics at once
(10, 228)
*** MAPE ***
PTS_G    0.367120
AST_G    0.598520
TOV_G    0.392989
TRB_G    0.346319
STL_G    0.427427
BLK_G    0.733845
3P_G     1.204553
FG%      0.092947
FT%      0.100229
PER_w    0.254007
dtype: float64
MAPE for all:  0.4420708928402392

*** RMSE ***
PTS_G    4.051636
AST_G    1.236445
TOV_G    0.569640
TRB_G    1.684309
STL_G    0.359940
BLK_G    0.293392
3P_G     0.550777
FG%      0.062353
FT%      0.119759
PER_w    4.139142
dtype: float64
RMSE for all:  1.3067391912364323

*** R2 ***
PTS_G   -8.995938e+01
AST_G   -1.904628e+04
TOV_G   -1.782756e+03
TRB_G   -9.665420e+01
STL_G   -5.556138e+01
BLK_G   -1.173255e+02
3P_G             -inf
FG%     -9.264038e+07
FT%     -2.786185e+03
PER_w   -8.764996e+02
dtype: float64
R2 for all:  -inf
