In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

In [2]:
def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def topPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player","player_id"]][:n]

def removeDuplicated(players, stats):
    """
    players: "../data/nba-players-stats/player_data.csv"
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
    """
    # players with the same name
    names = players.name.unique()
    duplicated = np.array([])

    for name in names:
        numrows = len(players[players.name == name])
        if numrows != 1:
            duplicated = np.append(duplicated, name)

    duplicated = np.sort(duplicated)

    start_year = players.copy()
    start_year = start_year.rename(columns={"name":"Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense
    stats_not_duplicated = stats_not_duplicated[(stats_not_duplicated.Year >= stats_not_duplicated.year_start) & (stats_not_duplicated.Year <= stats_not_duplicated.year_end )]
    stats_not_duplicated["year_count"] = stats_not_duplicated.Year - stats_not_duplicated.year_start

    return stats_not_duplicated

# Clean Data

In [3]:
"""
import data
"""
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)

# this matrix will be used to mask the table
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

In [4]:
# targets to test
activePlayers = getActivePlayers(stats, 2016, 4)
activePlayers.sort()
activePlayers.remove("Kevin Garnett")
activePlayers.remove("Kobe Bryant")

# overall setup
expSetup = ["sliding", "SVD", "all", "pinv", False]
threshold = 0.97

### Baseline Performance (we want at least better than this)

In [14]:
metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%","TRB_G","STL_G","BLK_G"]
weights = [1.] * 10

means = pd.DataFrame([7.9220039916884835,1.7957411223657396, 1.2177917024718974, 12.461764871776813, 
                      0.43785559339244096, 0.69908195642175319, 0.21029194254679157, 3.4789347250141578, 
                      0.65261301463080668, 0.40023620475586968], index = metrics_to_use)

print("compute the error for mean prediction")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    pred = means
    true = mrsc.getTrue()
    pred.columns = [playerName]
    true.columns = [playerName]

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

compute the error for mean prediction
PTS_G    0.636141
AST_G    1.267579
TOV_G    0.794315
PER_w    0.307144
FG%      0.108014
FT%      0.147521
3P%      0.390554
TRB_G    0.523951
STL_G    0.670364
BLK_G    1.458091
dtype: float64
MAPE for all:  0.6471657222310905
PTS_G    6.400906
AST_G    2.130397
TOV_G    0.853258
PER_w    5.255235
FG%      0.074854
FT%      0.144391
3P%      0.175301
TRB_G    2.522428
STL_G    0.476071
BLK_G    0.426444
dtype: float64
RMSE for all:  1.8459285994834036


### Off vs. Def

In [16]:
offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
defMetrics = ["TRB_G","STL_G","BLK_G"]
weightsOff = [0.030226243506617984, 0.23767435579974203, 0.62302081521153241, 0.028496590283710845, 0.99135485530619705, 0.96678243679381637, 0.96723382349958986]
weightsDef = [0.14231010741961231, 0.82630141067410789, 0.8168122805751753]
    
print("start experiment - off/def with var-standardized weights")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(offMetrics, weightsOff, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    predOff = mrsc.predict()
    trueOff = mrsc.getTrue()
    predOff.columns = [playerName]
    trueOff.columns = [playerName]

    mrsc.fit_threshold(defMetrics, weightsDef, 2016, pred_length = 1, threshold = threshold, setup = expSetup)
    predDef = mrsc.predict()
    trueDef = mrsc.getTrue()
    predDef.columns = [playerName]
    trueDef.columns = [playerName]

    pred = pd.concat([predOff, predDef], axis=0)
    true = pd.concat([trueOff, trueDef], axis=0)

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

start experiment - off/def with var-standardized weights
PTS_G    1.098576e+10
AST_G    6.349704e+09
TOV_G    1.183434e+10
PER_w    5.213857e+09
FG%      3.705333e+09
FT%      3.632898e+07
3P%      2.140308e+09
TRB_G    3.170361e-01
STL_G    3.822576e-01
BLK_G    5.877068e-01
dtype: float64
MAPE for all:  4384566416.280988
PTS_G    2.207733e+11
AST_G    7.745200e+10
TOV_G    6.266807e+10
PER_w    4.831932e+11
FG%      1.761269e+10
FT%      3.226915e+09
3P%      8.182474e+09
TRB_G    1.457368e+00
STL_G    2.771510e-01
BLK_G    2.258253e-01
dtype: float64
RMSE for all:  87310871012.85197


In [39]:
mape.T[mape.T.PTS_G > 100].T

Unnamed: 0,Jamal Crawford,Mike Miller
PTS_G,35820100000.0,2457950000000.0
AST_G,76909400000.0,1358120000000.0
TOV_G,100778000000.0,2585620000000.0
PER_w,79389600000.0,1109370000000.0
FG%,100374000000.0,740736000000.0
FT%,8210350000.0,
3P%,55338600000.0,334198000000.0
TRB_G,0.0867643,1.03874
STL_G,0.101015,0.267144
BLK_G,0.299372,0.80583


In [11]:
playerName = "Jamal Crawford"
target = Target(playerName, allPivotedTableDict, df_year)
donor = Donor(allPivotedTableDict, df_year)

offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
defMetrics = ["TRB_G","STL_G","BLK_G"]
weightsOff = [0.030226243506617984, 0.23767435579974203, 0.62302081521153241, 0.028496590283710845, 0.99135485530619705, 0.96678243679381637, 0.96723382349958986]
weightsDef = [0.14231010741961231, 0.82630141067410789, 0.8168122805751753]
   
mrsc = mRSC(donor, target, probObservation=1)
mrsc.fit_threshold(offMetrics, weightsOff, 2016, pred_length = 1, threshold = threshold, setup = ["sliding", "SVD", "all", "pinv", False])

predOff = mrsc.predict()
trueOff = mrsc.getTrue()
predOff.columns = [playerName]
trueOff.columns = [playerName]

# mrsc.fit_threshold(defMetrics, weightsDef, 2016, pred_length = 1, threshold = threshold, setup = expSetup)
# predDef = mrsc.predict()
# trueDef = mrsc.getTrue()
# predDef.columns = [playerName]
# trueDef.columns = [playerName]

# pred = pd.concat([predOff, predDef], axis=0)
# true = pd.concat([trueOff, trueDef], axis=0)

In [12]:
mrsc.model.beta

array([[  1.07402644e+11],
       [ -2.98845609e+10],
       [  1.07645252e+09],
       [ -1.97823601e+11],
       [  1.60982142e+10],
       [ -7.41875883e+09],
       [ -9.62699046e+08],
       [  6.62738078e+09],
       [  8.86756552e+09],
       [  4.83092203e+09],
       [ -1.75336117e+09],
       [  6.21659984e+09],
       [ -6.26839739e+09],
       [  1.26999419e+09],
       [  1.69852004e+08],
       [  2.36087000e+09],
       [  1.37859761e+09],
       [ -6.25790575e+09],
       [  7.69486136e+09],
       [  2.86952327e+09],
       [  3.81969579e+09],
       [ -5.93612879e+09],
       [  7.76480857e+09],
       [  8.12154995e+09],
       [ -2.78208430e+09],
       [  4.87401248e+08],
       [ -3.21551465e+09],
       [ -9.46515972e+09],
       [ -9.62144389e+09],
       [  9.71178846e+08],
       [  2.56347040e+09],
       [  6.51564063e+09],
       [ -6.99155016e+09],
       [ -7.64430519e+08],
       [ -1.11009633e+08],
       [  1.74624415e+09],
       [  1.13386684e+10],
 

In [13]:
predOff

Unnamed: 0,Jamal Crawford
PTS_G,507829000000.0
AST_G,178157000000.0
TOV_G,144151000000.0
PER_w,1111450000000.0
FG%,40513200000.0
FT%,7422640000.0
3P%,-18821600000.0


In [14]:
trueOff

Unnamed: 0,Jamal Crawford
PTS_G,14.1772
AST_G,2.31646
TOV_G,1.43038
PER_w,14.0
FG%,0.403621
FT%,0.904059
3P%,0.340116


In [8]:
beta = np.linalg.pinv(mrsc.model.donor_pre.T).dot(mrsc.model.target_pre.T)
beta

array([[  1.07402644e+11],
       [ -2.98845609e+10],
       [  1.07645252e+09],
       [ -1.97823601e+11],
       [  1.60982142e+10],
       [ -7.41875883e+09],
       [ -9.62699046e+08],
       [  6.62738078e+09],
       [  8.86756552e+09],
       [  4.83092203e+09],
       [ -1.75336117e+09],
       [  6.21659984e+09],
       [ -6.26839739e+09],
       [  1.26999419e+09],
       [  1.69852004e+08],
       [  2.36087000e+09],
       [  1.37859761e+09],
       [ -6.25790575e+09],
       [  7.69486136e+09],
       [  2.86952327e+09],
       [  3.81969579e+09],
       [ -5.93612879e+09],
       [  7.76480857e+09],
       [  8.12154995e+09],
       [ -2.78208430e+09],
       [  4.87401248e+08],
       [ -3.21551465e+09],
       [ -9.46515972e+09],
       [ -9.62144389e+09],
       [  9.71178846e+08],
       [  2.56347040e+09],
       [  6.51564063e+09],
       [ -6.99155016e+09],
       [ -7.64430519e+08],
       [ -1.11009633e+08],
       [  1.74624415e+09],
       [  1.13386684e+10],
 

In [56]:
mrsc.model.beta

array([[  1.07402644e+11],
       [ -2.98845609e+10],
       [  1.07645252e+09],
       [ -1.97823601e+11],
       [  1.60982142e+10],
       [ -7.41875883e+09],
       [ -9.62699046e+08],
       [  6.62738078e+09],
       [  8.86756552e+09],
       [  4.83092203e+09],
       [ -1.75336117e+09],
       [  6.21659984e+09],
       [ -6.26839739e+09],
       [  1.26999419e+09],
       [  1.69852004e+08],
       [  2.36087000e+09],
       [  1.37859761e+09],
       [ -6.25790575e+09],
       [  7.69486136e+09],
       [  2.86952327e+09],
       [  3.81969579e+09],
       [ -5.93612879e+09],
       [  7.76480857e+09],
       [  8.12154995e+09],
       [ -2.78208430e+09],
       [  4.87401248e+08],
       [ -3.21551465e+09],
       [ -9.46515972e+09],
       [ -9.62144389e+09],
       [  9.71178846e+08],
       [  2.56347040e+09],
       [  6.51564063e+09],
       [ -6.99155016e+09],
       [ -7.64430519e+08],
       [ -1.11009633e+08],
       [  1.74624415e+09],
       [  1.13386684e+10],
 

In [57]:
np.__version__

'1.13.3'

In [6]:
metrics_to_use= ["PTS_G","PER_w"]

weights1 = [1.,1.]
weights2 = [0.031168345507630011, 0.029332465536104278]
weights_list = [weights1, weights2]

print("start experiment")
for weights in weigts_list:
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        mrsc = mRSC(donor, target, probObservation=1)
        mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]

        pred_all = pd.concat([pred_all, pred], axis=1)
        true_all = pd.concat([true_all, true], axis=1)

    ###################
    mask = (true_all !=0 )
    mape = np.abs(pred_all - true_all) / true_all[mask]
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
    rmse = utils.rmse_2d(true_all, pred_all)
    print(rmse)
    print("RMSE for all: ", rmse.mean())    

start experiment
PTS_G    0.330567
PER_w    0.227677
dtype: float64
MAPE for all:  0.28638929217150383
PTS_G    3.242921
PER_w    3.291527
dtype: float64
RMSE for all:  3.267223740754302


In [7]:
metrics_to_use= ["AST_G","TOV_G", "FG%","FT%","3P%"]
weights = [1.,1.,1.,1.,1.]

print("start experiment")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    pred = mrsc.predict()
    true = mrsc.getTrue()
    pred.columns = [playerName]
    true.columns = [playerName]

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

start experiment
AST_G    0.561490
TOV_G    0.446343
FG%      0.249764
FT%      0.257253
3P%      0.405253
dtype: float64
MAPE for all:  0.383793410476059
AST_G    1.218134
TOV_G    0.544744
FG%      0.150243
FT%      0.244952
3P%      0.177820
dtype: float64
RMSE for all:  0.4671785601786508
