In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

In [2]:
def getActivePlayers(stats, year, buffer):
    """Return the players active in ``year`` and in each of the ``buffer`` seasons before it.

    Parameters
    ----------
    stats : pandas.DataFrame
        Season rows with at least a ``Year`` and a ``Player`` column.
    year : int
        Most recent season in which a player must appear.
    buffer : int
        Number of seasons immediately before ``year`` in which the player
        must also appear. With ``buffer <= 0`` only ``year`` itself is checked.

    Returns
    -------
    list
        Player names, ordered by first appearance among the ``year`` rows.
    """
    candidates = stats.loc[stats.Year == year, "Player"].unique()
    active = set(candidates)
    for offset in range(1, buffer + 1):
        active &= set(stats.loc[stats.Year == (year - offset), "Player"].unique())
    # Filter the ordered candidates instead of returning list(set): set iteration
    # order for strings changes with the hash seed, which made the result order
    # differ from one kernel session to the next.
    return [player for player in candidates if player in active]

def topPlayers(stats, year, metric, n):
    """Return the ``n`` players with the highest mean ``metric`` in season ``year``.

    Parameters
    ----------
    stats : pandas.DataFrame
        Season rows with ``Player``, ``Year``, ``player_id`` and ``metric`` columns.
        A player may have several rows for the same season; they are averaged.
    year : int
        Season to rank.
    metric : str
        Name of the numeric column to rank by (descending).
    n : int
        Maximum number of players to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player`` and ``player_id`` (the per-player mean of the id),
        best player first, with a fresh 0..k-1 index.
    """
    season = stats[stats.Year == year]
    # numeric_only=True: since pandas 2.0 a plain .mean() raises TypeError as soon
    # as the frame carries a non-numeric column.
    per_player = season.groupby("Player").mean(numeric_only=True).reset_index()
    # Every remaining row already belongs to `year`, so no second Year filter is needed.
    ranked = per_player.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player", "player_id"]][:n]

def removeDuplicated(players, stats):
    """
    Drop the stats of players whose name is shared by several `players` rows,
    attach the career information of the remaining players and keep only the
    seasons that fall inside each player's career.

    players: "../data/nba-players-stats/player_data.csv"
             (needs the columns name, year_start, year_end)
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
           (needs the columns Player, Year)

    Returns a new DataFrame: the `stats` rows left-merged with `players` on the
    player name, restricted to year_start <= Year <= year_end, plus a
    `year_count` column (Year - year_start). Neither input is modified.
    """
    # names that occur on more than one row of `players`
    duplicated = players.name[players.name.duplicated(keep=False)].unique()

    start_year = players.rename(columns={"name": "Player"})

    # keep only the players whose name is unambiguous, then attach career info
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense: seasons inside the player's career
    # (rows without a match in `players` have NaN bounds and are dropped here)
    in_career = (
        (stats_not_duplicated.Year >= stats_not_duplicated.year_start)
        & (stats_not_duplicated.Year <= stats_not_duplicated.year_end)
    )
    # .assign builds a new frame instead of writing a column onto a filtered slice
    return stats_not_duplicated[in_career].assign(
        year_count=lambda df: df.Year - df.year_start
    )

# Clean Data

In [3]:
"""
import data
"""
# Earliest season kept: applies both to a player's first season and to the stat rows.
MIN_START_YEAR = 1980

players = pd.read_csv("../data/nba-players-stats/player_data.csv")
# only choose players who started in or after MIN_START_YEAR
players = players[players.year_start >= MIN_START_YEAR]
# assign id; .assign returns a new frame, so nothing is written onto the filtered slice
players = players.assign(player_id=range(len(players)))

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only seasons from MIN_START_YEAR on
stats = stats[stats.Year >= MIN_START_YEAR]

# without duplicated names --> to do: how to distinguish multiple players with the same name
stats = removeDuplicated(players, stats)
stats = stats.astype({"Year": int, "year_count": int})

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)

# this matrix will be used to mask the table:
# one row per Player, one column per year_count, cells hold the Year
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

In [4]:
# Candidate targets: players seen in 2016 and in each of the 4 seasons before it,
# in alphabetical order.
activePlayers = sorted(getActivePlayers(stats, 2016, 4))

# Players left out of the experiments (list.remove raises if a name is missing).
for _excluded_player in (
    "Kevin Garnett",   # too few donors
    "Kobe Bryant",     # too few donors
    "Jamal Crawford",  # weird beta behavior
    "Mike Miller",     # weird beta behavior
):
    activePlayers.remove(_excluded_player)

# Settings shared by every experiment below.
threshold = 0.97
expSetup = ["sliding", "SVD", "all", "pinv", False]

# Baseline

In [56]:
metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%","TRB_G","STL_G","BLK_G"]

print("Algo: outputs the mean of the player's history")
print("-----")
# One single-column frame per player; joined once after the loop instead of
# re-concatenating an ever-growing frame on every iteration.
pred_frames = []
true_frames = []
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    target_data = target.concat(metrics_to_use, 2016, pred_length=1)
    num_k = len(metrics_to_use)
    # columns per metric, minus the single predicted step
    interv_index = int(target_data.shape[1]/num_k -1)
    total_index = int(interv_index + 1)

    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use

    # predictions: `history` is sliced as num_k consecutive blocks of
    # interv_index columns, and each block is averaged into one value per metric
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = []
    for i in range(num_k):
        pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

    pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])

    pred_frames.append(pred)
    true_frames.append(true)

pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
# zero true values become NaN under the mask and are skipped by the means below
mask = (true_all !=0 )
# divide by |true| so a negative true value cannot yield a negative percentage error
mape = np.abs(pred_all - true_all) / true_all[mask].abs()
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.549117
AST_G    0.499593
TOV_G    0.586009
PER_w    0.285259
FG%      0.076325
FT%      0.081761
3P%      0.221556
TRB_G    0.411837
STL_G    0.443918
BLK_G    0.682211
dtype: float64
MAPE for all:  0.39739103180264496

*** RMSE ***
PTS_G    4.669743
AST_G    1.132015
TOV_G    0.614409
PER_w    3.775067
FG%      0.052802
FT%      0.099701
3P%      0.126288
TRB_G    1.703413
STL_G    0.299635
BLK_G    0.283370
dtype: float64
RMSE for all:  1.2756443185018567


### Offense vs. Defense

In [13]:
def getWeitghts(target, donor, metrics_list, expSetup, method = "mean", year = 2016, pred_length = 1):
    """Compute one weight per metric for every metric group in ``metrics_list``.

    Parameters
    ----------
    target : object
        Provides ``concat(metrics, year, pred_length=...)`` returning a frame
        whose column count is ``len(metrics)`` times the per-metric length.
    donor : object
        Provides ``concat(metrics, year, total_index, method=...)`` returning a
        frame laid out as ``len(metrics)`` consecutive blocks of ``total_index`` columns.
    metrics_list : list of list of str
        Metric groups; one list of weights is produced per group.
    expSetup : sequence
        Experiment setup; only ``expSetup[0]`` (the matrix-forming method passed
        to ``donor.concat``) is read here.
    method : {"mean", "var"}
        ``"mean"``: weight = 1 / (mean of the column means of the metric's donor block).
        ``"var"``:  weight = 1 / (1 + variance of all values in the metric's donor block).
    year : int
        Year forwarded to both ``concat`` calls (default 2016).
    pred_length : int
        Prediction length forwarded to ``target.concat`` (default 1).

    Returns
    -------
    list of list of float
        ``weights_list[g][k]`` is the weight of metric ``k`` in group ``g``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"mean"`` nor ``"var"``.
    """
    # Validate up front, so a bad method is reported even when metrics_list is empty
    # and before any data is assembled.
    if method not in ("mean", "var"):
        raise ValueError("invalid method")

    # get mat_form_method
    mat_form_method = expSetup[0] # "fixed"

    # get weights for metrics
    weights_list = []
    for metrics in metrics_list:
        target_data = target.concat(metrics, year, pred_length=pred_length)
        num_k = len(metrics)
        total_index = int(target_data.shape[1] / num_k)
        donor_data = donor.concat(metrics, year, total_index, method = mat_form_method)

        weights = []
        for i in range(num_k):
            # donor columns belonging to the i-th metric of this group
            block = donor_data.iloc[:,i*total_index:(i+1)*total_index]
            if method == "mean":
                weights.append(1/(block.mean().mean()))
            else:
                weights.append(1/(1+np.var(block.to_numpy().flatten())))
        weights_list.append(weights)
    return weights_list

In [None]:
offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
defMetrics = ["TRB_G","STL_G","BLK_G"]
metrics_list = [offMetrics, defMetrics]
# weighting scheme handed to getWeitghts; also named in the banner below so the
# log cannot disagree with what was actually run
weight_method = "var"

print(f"start experiment - off/def with {weight_method}-standardized weights")
# One single-column frame per player; joined once after the loop instead of
# re-concatenating an ever-growing frame on every iteration.
pred_frames = []
true_frames = []
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    weights_list = getWeitghts(target, donor, metrics_list, expSetup, method=weight_method)
    # weights of the defensive metrics for this player
    print(weights_list[1])

    mrsc = mRSC(donor, target, probObservation=1)

    # offensive metrics
    mrsc.fit_threshold(offMetrics, weights_list[0], 2016, pred_length = 1, threshold = threshold, setup = expSetup)
    predOff = mrsc.predict()
    trueOff = mrsc.getTrue()
    predOff.columns = [playerName]
    trueOff.columns = [playerName]

    # defensive metrics: the same mRSC object is refit
    mrsc.fit_threshold(defMetrics, weights_list[1], 2016, pred_length = 1, threshold = threshold, setup = expSetup)
    predDef = mrsc.predict()
    trueDef = mrsc.getTrue()
    predDef.columns = [playerName]
    trueDef.columns = [playerName]

    # stack both groups into one column for this player
    pred = pd.concat([predOff, predDef], axis=0)
    true = pd.concat([trueOff, trueDef], axis=0)

    pred_frames.append(pred)
    true_frames.append(true)

pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
# zero true values become NaN under the mask and are skipped by the means below
mask = (true_all !=0 )
# divide by |true| so a negative true value cannot yield a negative percentage error
mape = np.abs(pred_all - true_all) / true_all[mask].abs()
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

start experiment - off/def with mean-standardized weights
[0.11972513301257333, 0.8185841866995628, 0.7506972245869964]
[0.11100931073512388, 0.8188967286230577, 0.7354691363765063]
[0.12710986859243598, 0.8175529306188869, 0.7660445110191912]
[0.11420402319122931, 0.8190496756900922, 0.7413119417138359]
[0.12939941079555828, 0.8183155463995372, 0.771903567288724]
[0.12442341371275807, 0.8173619336248287, 0.7609333807670234]
[0.10286712987077665, 0.8232294396001423, 0.7030384268151966]
[0.11420402319122931, 0.8190496756900922, 0.7413119417138359]
[0.11100931073512388, 0.8188967286230577, 0.7354691363765063]
[0.11100931073512388, 0.8188967286230577, 0.7354691363765063]
[0.09297674205157047, 0.8105737560800742, 0.6966015868897473]
[0.11739256109065692, 0.8194441440968739, 0.7459070178014936]
[0.11420402319122931, 0.8190496756900922, 0.7413119417138359]
[0.1221898102812113, 0.818079720375203, 0.7559576169775366]
[0.1221898102812113, 0.818079720375203, 0.7559576169775366]
[0.11972513301257

# Grouping

In [95]:
metrics1 = ["PTS_G","PER_w"]
metrics2 = ["AST_G","TOV_G","FG%","FT%","3P%"]
metrics3 = ["TRB_G","STL_G","BLK_G"]

metrics_list = [metrics1, metrics2, metrics3]
# weighting scheme handed to getWeitghts; also named in the banner below so the
# log cannot disagree with what was actually run
weight_method = "var"

print(f"start experiment - grouping with {weight_method}-standardized weights")
# One single-column frame per player; joined once after the loop instead of
# re-concatenating an ever-growing frame on every iteration.
pred_frames = []
true_frames = []
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    weights_list = getWeitghts(target, donor, metrics_list, expSetup, method=weight_method)

    mrsc = mRSC(donor, target, probObservation=1)

    # fit each metric group in turn with the same mRSC object and collect its rows
    group_preds = []
    group_trues = []
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], weights_list[i], 2016, pred_length = 1, threshold = threshold, setup = expSetup)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        group_preds.append(pred)
        group_trues.append(true)

    # stack all groups into one column for this player
    player_pred = pd.concat(group_preds, axis=0) if group_preds else pd.DataFrame()
    player_true = pd.concat(group_trues, axis=0) if group_trues else pd.DataFrame()

    pred_frames.append(player_pred)
    true_frames.append(player_true)

pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

###################
# zero true values become NaN under the mask and are skipped by the means below
mask = (true_all !=0 )
# divide by |true| so a negative true value cannot yield a negative percentage error
mape = np.abs(pred_all - true_all) / true_all[mask].abs()
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())

start experiment - off/def with mean-standardized weights


TypeError: getWeitghts() got an unexpected keyword argument 'method'

In [None]:
34
24