In [16]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

In [2]:
def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def topPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player","player_id"]][:n]

def removeDuplicated(players, stats):
    """
    players: "../data/nba-players-stats/player_data.csv"
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
    """
    # players with the same name
    names = players.name.unique()
    duplicated = np.array([])

    for name in names:
        numrows = len(players[players.name == name])
        if numrows != 1:
            duplicated = np.append(duplicated, name)

    duplicated = np.sort(duplicated)

    start_year = players.copy()
    start_year = start_year.rename(columns={"name":"Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense
    stats_not_duplicated = stats_not_duplicated[(stats_not_duplicated.Year >= stats_not_duplicated.year_start) & (stats_not_duplicated.Year <= stats_not_duplicated.year_end )]
    stats_not_duplicated["year_count"] = stats_not_duplicated.Year - stats_not_duplicated.year_start

    return stats_not_duplicated

# Clean Data

In [19]:
"""
import data
"""
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)

# this matrix will be used to mask the table
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

In [20]:
# targets to test
activePlayers = getActivePlayers(stats, 2016, 4)
activePlayers.sort()
# to few donors
activePlayers.remove("Kevin Garnett")
activePlayers.remove("Kobe Bryant")
# weird beta behavior
activePlayers.remove("Jamal Crawford")
activePlayers.remove("Mike Miller")

# overall setup
expSetup = ["sliding", "SVD", "all", "pinv", False]
threshold = 0.97

### Baseline Performance (we want at least better than this)

In [21]:
metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%","TRB_G","STL_G","BLK_G"]
weights = [1.] * 10

means = pd.DataFrame([7.9220039916884835,1.7957411223657396, 1.2177917024718974, 12.461764871776813, 
                      0.43785559339244096, 0.69908195642175319, 0.21029194254679157, 3.4789347250141578, 
                      0.65261301463080668, 0.40023620475586968], index = metrics_to_use)

print("compute the error for mean prediction")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    pred = means
    true = mrsc.getTrue()
    pred.columns = [playerName]
    true.columns = [playerName]

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

compute the error for mean prediction
PTS_G    0.618013
AST_G    1.272938
TOV_G    0.790195
PER_w    0.305318
FG%      0.107557
FT%      0.147169
3P%      0.390415
TRB_G    0.515323
STL_G    0.670241
BLK_G    1.446932
dtype: float64
MAPE for all:  0.64264173902046
PTS_G    6.400733
AST_G    2.138599
TOV_G    0.855015
PER_w    5.262524
FG%      0.074946
FT%      0.136692
3P%      0.175561
TRB_G    2.526368
STL_G    0.477519
BLK_G    0.427495
dtype: float64
RMSE for all:  1.8475451612551297


### Off vs. Def

In [22]:
offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
defMetrics = ["TRB_G","STL_G","BLK_G"]
weightsOff = [0.030226243506617984, 0.23767435579974203, 0.62302081521153241, 0.028496590283710845, 0.99135485530619705, 0.96678243679381637, 0.96723382349958986]
weightsDef = [0.14231010741961231, 0.82630141067410789, 0.8168122805751753]
    
print("start experiment - off/def with var-standardized weights")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(offMetrics, weightsOff, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    predOff = mrsc.predict()
    trueOff = mrsc.getTrue()
    predOff.columns = [playerName]
    trueOff.columns = [playerName]

    mrsc.fit_threshold(defMetrics, weightsDef, 2016, pred_length = 1, threshold = threshold, setup = expSetup)
    predDef = mrsc.predict()
    trueDef = mrsc.getTrue()
    predDef.columns = [playerName]
    trueDef.columns = [playerName]

    pred = pd.concat([predOff, predDef], axis=0)
    true = pd.concat([trueOff, trueDef], axis=0)

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

start experiment - off/def with var-standardized weights
PTS_G    0.441639
AST_G    0.488677
TOV_G    0.375900
PER_w    0.263511
FG%      0.131360
FT%      0.100613
3P%      0.270641
TRB_G    0.314842
STL_G    0.384051
BLK_G    0.588030
dtype: float64
MAPE for all:  0.3453567203167574
PTS_G    4.417351
AST_G    1.013456
TOV_G    0.480871
PER_w    4.224764
FG%      0.089354
FT%      0.112717
3P%      0.147596
TRB_G    1.461689
STL_G    0.278296
BLK_G    0.226756
dtype: float64
RMSE for all:  1.2452850431044076


In [23]:
mape.T[mape.T.PTS_G > 100].T

PTS_G
AST_G
TOV_G
PER_w
FG%
FT%
3P%
TRB_G
STL_G
BLK_G


In [25]:
metrics_to_use= ["PTS_G","PER_w"]

weights1 = [1.,1.]
weights2 = [0.031168345507630011, 0.029332465536104278]
weights_list = [weights1, weights2]

print("start experiment")
for weights in weights_list:
    print()
    print(weights)
    pred_all = pd.DataFrame()
    true_all = pd.DataFrame()
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        mrsc = mRSC(donor, target, probObservation=1)
        mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]

        pred_all = pd.concat([pred_all, pred], axis=1)
        true_all = pd.concat([true_all, true], axis=1)

    ###################
    mask = (true_all !=0 )
    mape = np.abs(pred_all - true_all) / true_all[mask]
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
    rmse = utils.rmse_2d(true_all, pred_all)
    print(rmse)
    print("RMSE for all: ", rmse.mean())    

start experiment

[1.0, 1.0]
PTS_G    0.326406
PER_w    0.227708
dtype: float64
MAPE for all:  0.28439772542183495
PTS_G    3.251259
PER_w    3.301353
dtype: float64
RMSE for all:  3.2763061215056464

[0.03116834550763001, 0.029332465536104278]
PTS_G    0.323816
PER_w    0.227367
dtype: float64
MAPE for all:  0.282924819277604
PTS_G    3.242725
PER_w    3.297933
dtype: float64
RMSE for all:  3.270329080954553


In [7]:
metrics_to_use= ["AST_G","TOV_G", "FG%","FT%","3P%"]
weights1 = [1.,1.,1.,1.,1.]
weights2 = [0.23767435579974203, 0.62302081521153241, 0.99135485530619705, 0.96678243679381637, 0.96723382349958986]


print("start experiment")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length = 1, threshold = threshold, setup = expSetup)

    pred = mrsc.predict()
    true = mrsc.getTrue()
    pred.columns = [playerName]
    true.columns = [playerName]

    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())
rmse = utils.rmse_2d(true_all, pred_all)
print(rmse)
print("RMSE for all: ", rmse.mean())    

start experiment
AST_G    0.561490
TOV_G    0.446343
FG%      0.249764
FT%      0.257253
3P%      0.405253
dtype: float64
MAPE for all:  0.383793410476059
AST_G    1.218134
TOV_G    0.544744
FG%      0.150243
FT%      0.244952
3P%      0.177820
dtype: float64
RMSE for all:  0.4671785601786508


In [30]:
import pkg_resources
import types
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system/pip names. Unfortunately,
        # there is no systematic way to get pip names from
        # a package's imported name. You'll have to had
        # exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]

        yield name
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

pandas==0.24.2
numpy==1.13.3
matplotlib==2.1.0


In [38]:
donor_pre_ok = pd.read_pickle("donor_pre_ok")
donor_pre_not_ok = pd.read_pickle("donor_pre_not_ok")
utils.rmse_2d(donor_pre_ok, donor_pre_not_ok).mean()

4.811166011565746e-16

In [45]:
target_pre_ok = pd.read_pickle("target_pre_ok")
target_pre_not_ok = pd.read_pickle("target_pre_not_ok")
utils.rmse_2d(target_pre_ok, target_pre_not_ok)

Player
Brandon Knight    0.0
dtype: float64

In [44]:
(target_pre_ok == target_pre_not_ok)

Unnamed: 0_level_0,0,1,2,3,5,6,7,8,10,11,...,22,23,25,26,27,28,30,31,32,33
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Brandon Knight,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [46]:
beta_ok = np.linalg.pinv(donor_pre_ok.T).dot(target_pre_ok.T)

In [47]:
beta_ok

array([[  1.21083607e-04],
       [ -7.99215763e-05],
       [ -6.41249914e-05],
       ..., 
       [ -9.38451703e-05],
       [ -1.35922110e-04],
       [ -2.06877420e-04]])

In [48]:
beta_not_ok = np.linalg.pinv(donor_pre_not_ok.T).dot(target_pre_ok.T)

In [49]:
beta_not_ok

array([[  1.21083607e-04],
       [ -7.99215763e-05],
       [ -6.41249914e-05],
       ..., 
       [ -9.38451703e-05],
       [ -1.35922110e-04],
       [ -2.06877420e-04]])

In [54]:
utils.rmse_2d(beta_ok, beta_not_ok).mean()

6.8720890755282711e-19