In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK","3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG","FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())
    return allPivotedTableDict, allMetrics

def getActivePlayers(stats, year, buffer):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    players = list(thisYear.Player.unique())
    for i in range(1, buffer+1):
        previousYear = stats[stats.Year == (year-i)].copy()
        players = list(set(players) & set(previousYear.Player.unique()))
    return players

def topPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player","player_id"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval):    
    target_data, nanIndex = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    total_index = int(interv_index + 1)
    
    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use
    
    # predictions
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = []
    for i in range(num_k):
        pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

    pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
    return true, pred

In [2]:
"""
import data
"""
pred_year = 2015 # the year that we are living in
pred_interval = 1 # we are making predictions for pred_year+1 and +2

print("*** importing data ***")
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
# players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

print("*** preparing data ***")

########### Donor ##########
# filter stats by the year
stats_donor = stats[stats.Year <= pred_year]
allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# filter stats by the year
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

# just to debug
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

"""
targets
"""
# targets
playerNames = getActivePlayers(stats, pred_year+1, buffer=4)
playerNames.sort()
playerNames.remove("Kevin Garnett")
playerNames.remove("Kobe Bryant")

*** importing data ***
*** preparing data ***


In [3]:
metrics_to_use= ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G","TRB_G","STL_G","BLK_G"]

print("Algo: outputs the mean of the player's history")
print("-----")
pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)
    true, pred = getBenchmark(target, metrics_to_use, pred_interval)
    
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

###################
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(true_all, pred_all)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())    

Algo: outputs the mean of the player's history
-----
*** MAPE ***
PTS_G    0.575711
AST_G    0.506711
TOV_G    0.596790
PER_w    0.287966
FG%      0.076904
FT%      0.081673
3P_G     0.682272
TRB_G    0.422301
STL_G    0.446710
BLK_G    0.685350
dtype: float64
MAPE for all:  0.4420163650526481

*** RMSE ***
PTS_G    4.691799
AST_G    1.136067
TOV_G    0.617640
PER_w    3.788750
FG%      0.052942
FT%      0.110670
3P_G     0.503330
TRB_G    1.709996
STL_G    0.299601
BLK_G    0.282306
dtype: float64
RMSE for all:  1.3193101146400346


In [19]:
"""
experiment setup
"""
# overall setup
donorSetup= ["variance","sliding", True]
# weighting = donorSetup[0] # None / "normalize"
# mat_form_method = donorSetup[1] # "fixed"
# skipNan = donorSetup[2] # (Boolean)
denoiseSetup = ["SVD", "all"]
# denoise_method = denoiseSetup[0] # "SVD"
# denoise_mat_method = denoiseSetup[1] # "all"
regression_method = "pinv"

threshold = 0.97
verbose = False

# offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G"]
# defMetrics = ["TRB_G","STL_G","BLK_G"]

metrics_list = [allMetrics]

"""
experiment
"""
print("10 metrics at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
for playerName in playerNames:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))

    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

10 metrics at once
(10, 228)
*** MAPE ***
PTS_G    0.399555
AST_G    0.564215
TOV_G    0.395530
TRB_G    0.370901
STL_G    0.415048
BLK_G    0.751559
3P_G     1.149345
FG%      0.092531
FT%      0.103185
PER_w    0.260219
dtype: float64
MAPE for all:  0.44184728421625746

*** RMSE ***
PTS_G    4.482597
AST_G    1.206892
TOV_G    0.574428
TRB_G    1.714422
STL_G    0.352951
BLK_G    0.294476
3P_G     0.604472
FG%      0.062385
FT%      0.120527
PER_w    4.329581
dtype: float64
RMSE for all:  1.3742730966944614


In [4]:
"""
experiment setup
"""
# overall setup
donorSetup= ["variance_batch","sliding", True]
# weighting = donorSetup[0] # None / "normalize"
# mat_form_method = donorSetup[1] # "fixed"
# skipNan = donorSetup[2] # (Boolean)
denoiseSetup = ["SVD", "all"]
# denoise_method = denoiseSetup[0] # "SVD"
# denoise_mat_method = denoiseSetup[1] # "all"
regression_method = "pinv"

threshold = 0.97
verbose = False

# offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P_G"]
# defMetrics = ["TRB_G","STL_G","BLK_G"]

metrics_list = [allMetrics]

"""
experiment
"""
print("10 metrics at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
for playerName in playerNames:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))

    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

10 metrics at once
(10, 228)
*** MAPE ***
PTS_G    0.367120
AST_G    0.598520
TOV_G    0.392989
TRB_G    0.346319
STL_G    0.427427
BLK_G    0.733845
3P_G     1.204553
FG%      0.092947
FT%      0.100229
PER_w    0.254007
dtype: float64
MAPE for all:  0.4420708928402392

*** RMSE ***
PTS_G    4.051636
AST_G    1.236445
TOV_G    0.569640
TRB_G    1.684309
STL_G    0.359940
BLK_G    0.293392
3P_G     0.550777
FG%      0.062353
FT%      0.119759
PER_w    4.139142
dtype: float64
RMSE for all:  1.3067391912364323


In [8]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"fixed", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
for playerName in playerNames:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))

    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once
(10, 228)
*** MAPE ***
PTS_G    0.310222
AST_G    0.428037
TOV_G    0.351920
TRB_G    0.294842
STL_G    0.351011
BLK_G    0.565994
3P_G     3.207398
FG%      0.078055
FT%      0.082143
PER_w    0.223067
dtype: float64
MAPE for all:  0.545640005218527

*** RMSE ***
PTS_G    3.047117
AST_G    1.101291
TOV_G    0.477015
TRB_G    1.454538
STL_G    0.288449
BLK_G    0.231871
3P_G     3.453088
FG%      0.053541
FT%      0.109810
PER_w    3.483153
dtype: float64
RMSE for all:  1.369987314054584


In [5]:
"""
experiment setup
"""
# overall setup
donorSetup= [None,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
for playerName in playerNames:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))

    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once
(10, 228)
*** MAPE ***
PTS_G    0.330647
AST_G    0.372639
TOV_G    0.339674
TRB_G    0.313756
STL_G    0.320293
BLK_G    0.523257
3P_G     0.642541
FG%      0.083542
FT%      0.086549
PER_w    0.241234
dtype: float64
MAPE for all:  0.32538716712421906

*** RMSE ***
PTS_G    3.212774
AST_G    0.948374
TOV_G    0.429059
TRB_G    1.526153
STL_G    0.261448
BLK_G    0.202434
3P_G     0.495202
FG%      0.055732
FT%      0.111807
PER_w    3.655378
dtype: float64
RMSE for all:  1.0898361617314343


In [6]:
"""
experiment setup
"""
# overall setup
donorSetup= ["variance","sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"

threshold = 0.97
verbose = False

metrics_list = [[x] for x in allMetrics]

"""
experiment
"""
print("1 metric at once")

all_pred = pd.DataFrame()
all_true = pd.DataFrame()
for playerName in playerNames:
    # print(playerName)
    # print("*** year - year_count matching for this player")
    # a = df_year[df_year.index == playerName]
    # print(a.dropna(axis=1))

    target = Target(playerName, allPivotedTableDict)
    # print("*** target - total index: ", target.total_index)
    # print(target.concat(metrics_list[1]))

    mrsc = mRSC(donor, target, pred_interval, probObservation=1)

    player_pred = pd.DataFrame()
    player_true = pd.DataFrame()
    for i in range(len(metrics_list)):
        mrsc.fit_threshold(metrics_list[i], threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        player_pred = pd.concat([player_pred, pred], axis=0)
        player_true = pd.concat([player_true, true], axis=0)
    all_pred = pd.concat([all_pred, player_pred], axis=1)
    all_true = pd.concat([all_true, player_true], axis=1)

###################
# print(all_pred)
print(all_pred.shape)
mask = (all_true !=0 )
mape = np.abs(all_pred - all_true) / all_true[mask]
print("*** MAPE ***")
print(mape.mean(axis=1))
print("MAPE for all: ", mape.mean().mean())

rmse = utils.rmse_2d(all_true, all_pred)
print()
print("*** RMSE ***")
print(rmse)
print("RMSE for all: ", rmse.mean())
##############################################################

1 metric at once
(10, 228)
*** MAPE ***
PTS_G    0.303243
AST_G    0.387360
TOV_G    0.342906
TRB_G    0.310335
STL_G    0.326637
BLK_G    0.778316
3P_G     0.648774
FG%      0.084888
FT%      0.089048
PER_w    0.247779
dtype: float64
MAPE for all:  0.35135294299721703

*** RMSE ***
PTS_G    3.027092
AST_G    0.965848
TOV_G    0.435086
TRB_G    1.498031
STL_G    0.272305
BLK_G    0.313579
3P_G     0.503187
FG%      0.056533
FT%      0.112442
PER_w    3.797523
dtype: float64
RMSE for all:  1.098162519058182


# Game by Game

In [57]:
def getDictionaryGameByGame(data, metrics):
    my_dict = {}
    for i in range(len(metrics)):
        data_pivot = pd.pivot_table(data, values=metrics[i], index="playDispNm", columns = "gmDate")
        shifted_df = data_pivot.apply(lambda x: pd.Series(x.dropna().values), axis=1).fillna(np.nan)
        my_dict.update({metrics[i]: shifted_df})
    return my_dict

In [58]:
"""
import data
"""
print("*** importing data ***")
data = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")

# metrics = ['playPTS', 'playAST', 'playTO', 'playSTL', 'playBLK',
#        'playPF', 'playFGA', 'playFGM', 'playFG%', 'play2PA', 'play2PM',
#        'play2P%', 'play3PA', 'play3PM', 'play3P%', 'playFTA', 'playFTM',
#        'playFT%', 'playORB', 'playDRB', 'playTRB']

metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']

date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
data = pd.concat([date_col,data], axis=1)

# data_dict = {}
# for i in range(len(metrics)):
#     data_pivot = pd.pivot_table(data, values=metrics[i], index="playDispNm", columns = "gmDate")
#     shifted_df = data_pivot.apply(lambda x: pd.Series(x.dropna().values), axis=1).fillna(np.nan)
#     data_dict.update({metrics[i]: shifted_df})

*** importing data ***


In [78]:
print("*** preparing data ***")

pred_date = pd.to_datetime('2012-12-01') # the year that we are living in
pred_interval = pd.Timedelta("1 day") # we are making predictions for pred_year+1 and +2


########### Donor ##########
# filter stats by the year
donor_data = data[data.date <= pred_date]
donor_dict = getDictionaryGameByGame(donor_data, metrics)
donor = Donor(donor_dict)

########### Target ##########
# filter stats by the year
target_data = data[data.date <= pred_date + pred_interval]
target_dict = getDictionaryGameByGame(target_data, metrics)


*** preparing data ***


In [85]:
data.groupby('playDispNm').mean()

Unnamed: 0_level_0,teamDayOff,playMin,playHeight,playWeight,playPTS,playAST,playTO,playSTL,playBLK,playPF,...,play3PA,play3PM,play3P%,playFTA,playFTM,playFT%,playORB,playDRB,playTRB,opptDayOff
playDispNm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.J. Hammons,1.545455,7.272727,84.0,250.0,2.181818,0.181818,0.454545,0.045455,0.590909,0.954545,...,0.454545,0.227273,0.174241,0.909091,0.409091,0.170455,0.363636,1.318182,1.681818,1.863636
A.J. Price,2.099099,15.333333,74.0,195.0,5.567568,2.378378,0.765766,0.369369,0.027027,0.837838,...,2.513514,0.819820,0.221242,0.792793,0.585586,0.195474,0.243243,1.162162,1.405405,2.018018
Aaron Brooks,1.884718,17.790885,72.0,161.0,7.675603,2.501340,1.337802,0.528150,0.152815,1.825737,...,2.785523,1.050938,0.295815,1.168901,0.957105,0.349343,0.359249,1.176944,1.536193,1.916890
Aaron Gordon,1.923954,26.133080,81.0,225.0,11.418251,1.688213,1.144487,0.771863,0.619772,1.973384,...,3.030418,0.939163,0.253851,2.646388,1.847909,0.483481,1.532319,4.311787,5.844106,1.798479
Aaron Gray,2.000000,10.974684,84.0,270.0,2.329114,0.696203,0.848101,0.215190,0.164557,1.886076,...,0.012658,0.000000,0.000000,0.810127,0.430380,0.178270,1.139241,1.949367,3.088608,2.063291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zach Randolph,1.955504,30.276347,81.0,253.0,15.477752,1.995316,1.899297,0.718970,0.238876,2.262295,...,0.772834,0.208431,0.101156,3.398126,2.576112,0.620127,2.948478,6.231850,9.180328,1.906323
Zaza Pachulia,1.949109,21.442748,83.0,240.0,7.045802,1.938931,1.468193,0.814249,0.302799,2.267176,...,0.030534,0.000000,0.000000,2.259542,1.778626,0.524839,2.412214,4.236641,6.648855,1.979644
Zhou Qi,1.611111,6.833333,86.0,219.0,1.222222,0.111111,0.555556,0.111111,0.777778,0.777778,...,1.055556,0.111111,0.027778,0.666667,0.444444,0.222222,0.333333,0.888889,1.222222,1.888889
Zoran Dragić,1.937500,4.687500,77.0,200.0,1.750000,0.312500,0.312500,0.125000,0.000000,0.375000,...,0.875000,0.187500,0.026787,0.312500,0.187500,0.125000,0.312500,0.187500,0.500000,2.312500


In [80]:
# just to debug
df_date = pd.pivot_table(data, values="gmDate", index="Player", columns = "year_count")

KeyError: 'Player'

In [28]:
pred_date = '2012-12-01'
data[data.gmDate <= pred_date]


Unnamed: 0,gmDate,gmTime,seasTyp,playLNm,playFNm,teamAbbr,teamConf,teamDiv,teamLoc,teamRslt,...,playFT%,playORB,playDRB,playTRB,opptAbbr,opptConf,opptDiv,opptLoc,opptRslt,opptDayOff
0,2012-10-30,19:00,Regular,Price,A.J.,WAS,East,Southeast,Away,Loss,...,1.0,1,1,2,CLE,East,Central,Home,Win,0
1,2012-10-30,19:00,Regular,Ariza,Trevor,WAS,East,Southeast,Away,Loss,...,0.5,1,2,3,CLE,East,Central,Home,Win,0
2,2012-10-30,19:00,Regular,Okafor,Emeka,WAS,East,Southeast,Away,Loss,...,0.5,5,2,7,CLE,East,Central,Home,Win,0
3,2012-10-30,19:00,Regular,Beal,Bradley,WAS,East,Southeast,Away,Loss,...,1.0,0,3,3,CLE,East,Central,Home,Win,0
4,2012-10-30,19:00,Regular,Booker,Trevor,WAS,East,Southeast,Away,Loss,...,0.0,1,0,1,CLE,East,Central,Home,Win,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,2012-12-01,22:30,Regular,Landry,Carl,GS,West,Pacific,Home,Win,...,1.0,2,2,4,IND,East,Central,Away,Loss,1
5160,2012-12-01,22:30,Regular,Jack,Jarrett,GS,West,Pacific,Home,Win,...,0.0,0,5,5,IND,East,Central,Away,Loss,1
5161,2012-12-01,22:30,Regular,Green,Draymond,GS,West,Pacific,Home,Win,...,0.0,1,0,1,IND,East,Central,Away,Loss,1
5162,2012-12-01,22:30,Regular,Biedrins,Andris,GS,West,Pacific,Home,Win,...,0.0,0,1,1,IND,East,Central,Away,Loss,1


In [50]:
date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
pd.concat([date_col,data], axis=1)

Unnamed: 0,date,gmDate,gmTime,seasTyp,playLNm,playFNm,teamAbbr,teamConf,teamDiv,teamLoc,...,playFT%,playORB,playDRB,playTRB,opptAbbr,opptConf,opptDiv,opptLoc,opptRslt,opptDayOff
0,2012-10-30 19:00:00,2012-10-30,19:00,Regular,Price,A.J.,WAS,East,Southeast,Away,...,1.0,1,1,2,CLE,East,Central,Home,Win,0
1,2012-10-30 19:00:00,2012-10-30,19:00,Regular,Ariza,Trevor,WAS,East,Southeast,Away,...,0.5,1,2,3,CLE,East,Central,Home,Win,0
2,2012-10-30 19:00:00,2012-10-30,19:00,Regular,Okafor,Emeka,WAS,East,Southeast,Away,...,0.5,5,2,7,CLE,East,Central,Home,Win,0
3,2012-10-30 19:00:00,2012-10-30,19:00,Regular,Beal,Bradley,WAS,East,Southeast,Away,...,1.0,0,3,3,CLE,East,Central,Home,Win,0
4,2012-10-30 19:00:00,2012-10-30,19:00,Regular,Booker,Trevor,WAS,East,Southeast,Away,...,0.0,1,0,1,CLE,East,Central,Home,Win,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155708,2018-04-11 10:30:00,2018-04-11,10:30,Regular,Sampson,JaKarr,SAC,West,Pacific,Home,...,1.0,0,4,4,HOU,West,Southwest,Away,Loss,1
155709,2018-04-11 10:30:00,2018-04-11,10:30,Regular,Hield,Chavano,SAC,West,Pacific,Home,...,0.0,2,3,5,HOU,West,Southwest,Away,Loss,1
155710,2018-04-11 10:30:00,2018-04-11,10:30,Regular,Hayes,Nigel,SAC,West,Pacific,Home,...,0.0,1,3,4,HOU,West,Southwest,Away,Loss,1
155711,2018-04-11 10:30:00,2018-04-11,10:30,Regular,Carter,Vincent,SAC,West,Pacific,Home,...,0.0,0,3,3,HOU,West,Southwest,Away,Loss,1


In [75]:
date_col+ pd.Timedelta("1 day")

0        2012-10-31 19:00:00
1        2012-10-31 19:00:00
2        2012-10-31 19:00:00
3        2012-10-31 19:00:00
4        2012-10-31 19:00:00
                 ...        
155708   2018-04-12 10:30:00
155709   2018-04-12 10:30:00
155710   2018-04-12 10:30:00
155711   2018-04-12 10:30:00
155712   2018-04-12 10:30:00
Name: date, Length: 155713, dtype: datetime64[ns]

In [16]:
pred_all = pd.DataFrame()
true_all = pd.DataFrame()

metrics_to_use = allMetrics
target = Target(playerName, allPivotedTableDict)
target_data, nanIndex = target.concat(metrics_to_use)
num_k = len(metrics_to_use)
interv_index = int(target_data.shape[1]/num_k -1)
total_index = int(interv_index + 1)

# predictions
history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
pred = []
for i in range(num_k):
    pred.append(history.iloc[:,i*interv_index:(i+1)*interv_index].mean(axis=1).to_list())

pred = pd.DataFrame(pred, index=metrics_to_use, columns = [playerName])
pred_all = pd.concat([pred_all, pred], axis=1)