In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

from itertools import combinations, product

def prepareData(stats):
    """Turn the raw stats frame into one pivoted table per metric.

    Three metric families are built with the ``importData`` helpers
    (per-game, percentage and weighted metrics), merged into a single
    dictionary and then pivoted with ``getPivotedTableDict``.

    Parameters
    ----------
    stats : DataFrame
        Season statistics; passed unchanged to every helper.

    Returns
    -------
    (allPivotedTableDict, allMetrics)
        The pivoted tables and the list of metric keys, in merge order.
    """
    perGameDict = getMetricsPerGameDict(
        stats, ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P", "MP"]
    )
    perCentDict = getMetricsPerCentDict(stats, ["FG", "FT"])
    weightedDict = getMetricsWeightedDict(stats, ["PER"])

    # Merge in family order; on a key collision the later family wins.
    mergedDict = {}
    for familyDict in (perGameDict, perCentDict, weightedDict):
        mergedDict.update(familyDict)

    return getPivotedTableDict(mergedDict), list(mergedDict)

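# FIXME: a bug is suspected in getActivePlayers but has not been pinned down.
# One candidate to verify: `min_games` is only enforced for `year`; the buffer
# years merely require the player to appear in `stats` at all.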
def getActivePlayers(stats, year, buffer, min_games):
    """Names of players active in `year` and in each of the `buffer` prior years.

    A player qualifies when
      * at least one of their rows for `year` has ``G >= min_games``, and
      * they have at least one row (any number of games) in every year
        ``year-1 ... year-buffer``.

    Parameters
    ----------
    stats : DataFrame with at least the columns ``Year``, ``G`` and ``Player``.
    year : season that must satisfy the `min_games` requirement.
    buffer : number of preceding seasons the player must also appear in
        (0 disables the check).
    min_games : minimum games played in `year`.

    Returns
    -------
    list of player names, in order of first appearance among the rows of
    `year`. The order is deterministic; intersecting sets, as done
    previously, made it depend on string hashing.
    """
    qualified = stats[(stats.Year == year) & (stats.G >= min_games)]
    players = list(qualified.Player.unique())
    for offset in range(1, buffer + 1):
        # NOTE(review): min_games is intentionally NOT applied to earlier
        # seasons here, matching the original behaviour - confirm intent.
        seen = set(stats.loc[stats.Year == (year - offset), "Player"])
        players = [name for name in players if name in seen]
    return players

def topPlayers(stats, year, metric, n):
    """Return the `n` players with the highest mean `metric` in `year`.

    Rows of `year` are averaged per player (a player can have several rows
    in one season), then ranked by `metric` in descending order.

    Parameters
    ----------
    stats : DataFrame with columns ``Year``, ``Player``, ``player_id`` and `metric`.
    year : season to rank.
    metric : numeric column to rank by.
    n : number of top players to return.

    Returns
    -------
    DataFrame with the columns ``Player`` and ``player_id`` (at most `n` rows).
    ``player_id`` goes through the per-player mean, so it must be numeric.
    """
    season = stats[stats.Year == year]
    # numeric_only=True: without it pandas >= 2.0 raises TypeError as soon as
    # the frame holds a non-numeric column; older versions dropped such
    # columns silently, which is the behaviour kept here.
    per_player = season.groupby('Player').mean(numeric_only=True).reset_index()
    ranked = per_player.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player", "player_id"]][:n]

def getBenchmark(target, metrics_to_use, pred_interval, player_label=None):
    """Historical-mean baseline for a target player.

    For every metric the prediction is the mean of that metric over the
    pre-intervention columns; the matching post-intervention values are
    returned alongside it.

    Parameters
    ----------
    target : object exposing ``concat(metrics)`` that returns
        ``(data, nanIndex)``; `data` holds the metrics side by side
        column-wise (``data.shape[1] / len(metrics)`` columns per metric).
    metrics_to_use : list of metric names; used as the index of both outputs.
    pred_interval : number of trailing columns per metric held out.
    player_label : column label for the prediction frame. When omitted the
        notebook-level ``playerName`` is used, which is what this function
        always did implicitly.

    Returns
    -------
    (true, pred) : two DataFrames indexed by `metrics_to_use`.
    """
    if player_label is None:
        # Backward compatibility: earlier callers relied on the global.
        player_label = playerName

    target_data, _ = target.concat(metrics_to_use)
    num_k = len(metrics_to_use)
    interv_index = int(target_data.shape[1]/num_k - pred_interval)
    # NOTE(review): total_index is interv_index + 1 whatever pred_interval
    # is - confirm this is intended when pred_interval > 1.
    total_index = int(interv_index + 1)

    # true
    true = utils.get_postint_data(target_data, interv_index, total_index, num_k).T
    true.index = metrics_to_use

    # predictions: per-metric mean over that metric's block of history columns
    history = utils.get_preint_data(target_data, interv_index, total_index, num_k)
    pred = [
        history.iloc[:, k*interv_index:(k+1)*interv_index].mean(axis=1).to_list()
        for k in range(num_k)
    ]

    pred = pd.DataFrame(pred, index=metrics_to_use, columns=[player_label])
    return true, pred

In [None]:
# Evaluate every target player: mRSC predictions, ground truth, the
# historical-mean benchmark and R2.
# NOTE: this cell needs names created by other cells (playerNames,
# allPivotedTableDict, donor, metrics_list, threshold, donorSetup,
# denoiseSetup, regression_method, verbose, getR2) - run those first.
#
# Frames are collected in lists and concatenated once; growing a DataFrame
# with pd.concat inside the loop is quadratic and starts from an empty frame.
pred_frames, true_frames, bench_frames, r2_frames = [], [], [], []
for playerName in playerNames:
    target = Target(playerName, allPivotedTableDict)

    # benchmark (`true` is overwritten by mrsc.getTrue() in the loop below)
    true, benchmark = getBenchmark(target, ["PTS_G"], pred_interval)

    # prediction
    mrsc = mRSC(donor, target, pred_interval, probObservation=1)
    interval_labels = [playerName+" "+ str(a) for a in range(pred_interval)]
    metric_preds, metric_trues = [], []
    for metric_group in metrics_list:
        mrsc.fit_threshold(metric_group, threshold, donorSetup, denoiseSetup,regression_method, verbose)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = list(interval_labels)
        true.columns = list(interval_labels)
        metric_preds.append(pred)
        metric_trues.append(true)
    player_pred = pd.concat(metric_preds, axis=0) if metric_preds else pd.DataFrame()
    player_true = pd.concat(metric_trues, axis=0) if metric_trues else pd.DataFrame()

    pred_frames.append(player_pred)
    true_frames.append(player_true)
    bench_frames.append(benchmark)

    R2 = getR2(player_true, player_pred, benchmark)
    r2_frames.append(R2)

all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()
all_bench = pd.concat(bench_frames, axis=1) if bench_frames else pd.DataFrame()
all_R2 = pd.concat(r2_frames, axis=1) if r2_frames else pd.DataFrame()

In [2]:
def plot_pred(pred_traj, true_traj, markers_on, metric, playerName, dir_name_metric):
    """Save a predicted-vs-true trajectory plot for one player and metric.

    The figure is written to ``<dir_name_metric><metric>/<playerName>.png``.

    Parameters
    ----------
    pred_traj : sequence of predicted values (blue line).
    true_traj : sequence of observed values (red line); its length sets the
        x ticks, labelled 1..len(true_traj).
    markers_on : forwarded to matplotlib's ``markevery`` for the prediction line.
    metric : metric name; used for the y label, the title and the sub-folder.
    playerName : player name; used for the title and the file name.
    dir_name_metric : base output directory, expected to end with '/'.
        Despite its name the metric sub-folder is appended here. The value
        was previously ignored in favour of the global ``dir_name``.
    """
    out_dir = dir_name_metric + metric + '/'
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots()
    try:
        ax.plot(pred_traj, marker='o', markevery=markers_on, color='blue', label='Prediction')
        ax.plot(true_traj, marker='o', color='red', label='True')
        ax.set_xticks(range(len(true_traj)))
        ax.set_xticklabels(range(1, len(true_traj)+1))
        ax.legend(loc='best')
        ax.set_xlabel('Years in NBA')
        ax.set_ylabel(metric)
        ax.set_title(playerName + ': ' + metric)
        fig.savefig(out_dir + playerName + '.png', bbox_inches='tight')
    finally:
        # Always release the figure; this runs once per player and metric.
        plt.close(fig)

def _concat_frames(frames, axis):
    """Concatenate `frames` along `axis`; an empty DataFrame when there are none."""
    return pd.concat(frames, axis=axis) if frames else pd.DataFrame()


def annual_predictions(playerNames, allPivotedTableDict, donor, pred_interval, metrics_list,
                      threshold, donorSetup, denoiseSetup, regression_method, verbose, dir_name):
    """Fit mRSC for every player and metric group, plot and print error summaries.

    For each player a ``Target`` and an ``mRSC`` model are built. For each
    entry of `metrics_list` the model is fitted with ``fit_threshold``, the
    prediction and the true values are collected, and one trajectory plot
    per fitted metric is written through ``plot_pred``. Finally MAPE and
    RMSE over all players are printed. Nothing is returned.

    Parameters
    ----------
    playerNames : iterable of target player names.
    allPivotedTableDict : pivoted tables used to build each ``Target``.
    donor : donor pool passed to ``mRSC``.
    pred_interval : prediction horizon passed to ``mRSC``. Column labelling
        below assumes one prediction column per fit.
    metrics_list : list of metric groups; each group is passed to
        ``fit_threshold`` as is.
    threshold, donorSetup, denoiseSetup, regression_method, verbose :
        forwarded unchanged to ``mrsc.fit_threshold``.
    dir_name : base directory for the plots (see ``plot_pred``).
    """
    pred_frames = []
    true_frames = []
    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)
        mrsc = mRSC(donor, target, pred_interval, probObservation=1)

        metric_preds = []
        metric_trues = []

        for metric_group in metrics_list:
            mrsc.fit_threshold(metric_group, threshold, donorSetup, denoiseSetup,regression_method, verbose)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            metric_preds.append(pred)
            metric_trues.append(true)

            # plot: reuse the prediction that is evaluated instead of
            # calling predict() a second time.
            pred_list = pred.values.flatten()
            interv_index = mrsc.model.interv_index

            # Loop-invariant: the same for every metric of this fit.
            if mrsc.weighting is not None:
                # weights[0] / weights[1] are used as per-column mean and
                # variance to undo the normalisation of the fitted values.
                mean_all = utils.get_preint_data(combinedDF=mrsc.weights[0].to_frame().T,
                                                 intervIndex=mrsc.interv_index, totalIndex=mrsc.total_index,
                                                 nbrMetrics=mrsc.num_k, reindex=True).values
                var_all = utils.get_preint_data(combinedDF=mrsc.weights[1].to_frame().T,
                                                intervIndex=mrsc.interv_index, totalIndex=mrsc.total_index,
                                                nbrMetrics=mrsc.num_k, reindex=True).values

            for i in range(mrsc.num_k):
                # Separate name: the original re-used `metric` and shadowed
                # the outer loop variable.
                metric_name = mrsc.metrics[i]
                metric_cols = slice(i*interv_index, (i+1)*interv_index)
                true_traj = mrsc.target.data[metric_name].dropna(axis='columns').values.flatten()
                pred_traj = np.dot(mrsc.model.donor_pre.iloc[:, metric_cols].T,
                                   mrsc.model.beta).flatten()

                if mrsc.weighting is not None:
                    mean_pre = mean_all[:, metric_cols].flatten()
                    var_pre = var_all[:, metric_cols].flatten()
                    pred_traj = (pred_traj * np.sqrt(var_pre.T)) + mean_pre.T

                # Append the out-of-sample prediction and mark it on the plot.
                pred_traj = np.append(pred_traj, pred_list[i])
                markers_on = [true_traj.shape[0]-mrsc.pred_interval]
                plot_pred(pred_traj, true_traj, markers_on, metric_name, playerName, dir_name)

        pred_frames.append(_concat_frames(metric_preds, axis=0))
        true_frames.append(_concat_frames(metric_trues, axis=0))

    all_pred = _concat_frames(pred_frames, axis=1)
    all_true = _concat_frames(true_frames, axis=1)

    ###################
    print(all_pred.shape)
    # Zero true values become NaN in the denominator and are skipped by mean().
    mask = (all_true != 0)
    # abs() on the denominator keeps the percentage error non-negative even
    # for metrics that can be negative.
    mape = (all_pred - all_true).abs() / all_true[mask].abs()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
    ##############################################################

In [3]:
""" USER PARAMETERS """
starting_year = 1970 
min_games_donor = 30
min_games_target = 40
pred_year = 2015 # the year that we are living in 
pred_interval = 1 # number of seasons after pred_year to predict (1 => pred_year+1 only)
buffer = 4 # targets must also appear in each of the `buffer` seasons before the predicted one
total_stats = ["PTS","AST","TOV","TRB","STL","BLK","3P","MP"]
normalized_stats = ["FG%", "3P%", "FT%", "PER"]

"""
import data
"""
print("*** importing data ***")
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[(players.year_start >= starting_year) & (players.year_start < 2018)]

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
# regex=False: '*' is a regex metacharacter and the default of `regex`
# differs between pandas versions; the asterisk must be removed literally.
stats['Player'] = stats['Player'].str.replace('*', '', regex=False)
stats = stats[stats.Player.isin(players.name)]

""" Filter valid players """
# only look at years in which player played more than 'min_games' games
#stats = stats[stats.G >= min_games_donor]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
# assign() builds a new frame instead of setting columns through attribute
# access on a frame that may be a slice of another one.
stats = stats.assign(Year=stats.Year.astype(int),
                     year_count=stats.year_count.astype(int))

print("*** preparing data ***")

########### Donor ##########
# filter stats by the year
stats_donor = stats[stats.Year <= pred_year]
stats_donor = stats_donor[stats_donor.G >= min_games_donor] # keep only seasons with at least min_games_donor games
allPivotedTableDict_d, allMetrics = prepareData(stats_donor) # prepareData converts to PTS/G
donor = Donor(allPivotedTableDict_d)

########### Target ##########
# filter stats by the year (targets may also use the seasons being predicted)
stats_target = stats[stats.Year <= pred_year+pred_interval]
allPivotedTableDict, allMetrics = prepareData(stats_target)

"""
targets
"""
# targets
playerNames = getActivePlayers(stats, pred_year+pred_interval, buffer=buffer, min_games=min_games_target)
playerNames.sort()
# these two players are explicitly excluded from the target set
if 'Kevin Garnett' in playerNames: 
    playerNames.remove("Kevin Garnett")
if 'Kobe Bryant' in playerNames:
    playerNames.remove("Kobe Bryant")

print("*** DONE! ***")

*** importing data ***
*** preparing data ***
*** DONE! ***


# *** EXPERIMENTAL SECTION ***

In [4]:
# Metrics for which predictions are produced.
predMetrics = [
    "PTS_G", "AST_G", "TOV_G",
    "FG%", "FT%", "3P_G",
    "TRB_G", "STL_G", "BLK_G",
]
# Offence-oriented grouping.
offMetrics = [
    "PTS_G", "AST_G", "TOV_G", "PER_w",
    "FG%", "FT%", "3P_G",
]
# Defence-oriented grouping.
defMetrics = [
    "TRB_G", "STL_G", "BLK_G",
]

# *** EXPERIMENT  ***

In [28]:
"""
experiment setup
"""
# user input -- edit these four values to configure an experiment run
threshold = 0.98                        # forwarded to mrsc.fit_threshold
normalize_metric = None                 # None => no normalisation
donor_window = 'sliding/'               # path component of the plot directory
helper_metrics = [
    'MP_G',
    'PER_w',
]                                       # extra metrics fitted with each predicted metric

In [29]:
# dir_name: plots/<method>/<donor window>/<normalisation>/<helper metrics>/
# Uses helper_metrics, normalize_metric, donor_window and threshold from the
# user-input cell.
pred_method = 'mrsc/' if len(helper_metrics) else 'rsc/'

if normalize_metric is None:
    normalize_metric_label = 'no_normalization/'
else:
    # Needs a trailing '/' like every other component, otherwise it fuses
    # with the helper-metric label in dir_name.
    normalize_metric_label = str(normalize_metric)
    if not normalize_metric_label.endswith('/'):
        normalize_metric_label += '/'

# Whole percent; slicing the float's string broke for 1.0 ('10') and for
# values below 0.1 ('5.').
threshold_label = str(int(round(threshold * 100))) + '/'
helper_metrics_label = ''.join(name + '_' for name in helper_metrics) + '/'

dir_name = 'plots/' + pred_method + donor_window + normalize_metric_label + helper_metrics_label
print(dir_name)

# setup 
donorSetup= [normalize_metric,"sliding", True]
denoiseSetup = ["SVD", "all"]
regression_method = "pinv"
# `threshold` is taken from the user-input cell; it used to be reset to 0.98
# here, silently discarding any other value chosen there.
verbose = False
# one fit per predicted metric, each accompanied by the helper metrics
metrics_list = [[metric] + helper_metrics for metric in predMetrics]

plots/mrsc/sliding/no_normalization/MP_G_PER_w_/


In [24]:
"""
experiment
"""
# Run the full experiment: plots are written under dir_name and the
# MAPE / RMSE summaries are printed by annual_predictions itself.
print("Computing...")
annual_predictions(
    playerNames=playerNames,
    allPivotedTableDict=allPivotedTableDict,
    donor=donor,
    pred_interval=pred_interval,
    metrics_list=metrics_list,
    threshold=threshold,
    donorSetup=donorSetup,
    denoiseSetup=denoiseSetup,
    regression_method=regression_method,
    verbose=verbose,
    dir_name=dir_name,
)

plots/mrsc/sliding/no_normalization/MP_G_PER_w_/


In [None]:
# NOTE: same three lists as in the earlier "EXPERIMENTAL SECTION" cell;
# re-declared here so the cells below can run on their own.
predMetrics = [
    "PTS_G", "AST_G", "TOV_G",
    "FG%", "FT%", "3P_G",
    "TRB_G", "STL_G", "BLK_G",
]
offMetrics = [
    "PTS_G", "AST_G", "TOV_G", "PER_w",
    "FG%", "FT%", "3P_G",
]
defMetrics = [
    "TRB_G", "STL_G", "BLK_G",
]

In [None]:
# Fit all predicted metrics jointly for the current `mrsc` / `playerName`.
# NOTE: `mrsc`, `playerName` and the setup variables must already exist
# (e.g. from the debug cell or a previous loop) - this cell does not create them.
metrics = [predMetrics]
# Alternative groupings (one fit per metric, optionally with PER_w as helper):
#metrics = [[pred_metric] + [] for pred_metric in predMetrics]
#metrics = [[pred_metric] + ['PER_w'] for pred_metric in predMetrics]

print(metrics)
print()

player_pred = pd.DataFrame()
player_true = pd.DataFrame()

for metric in metrics:
    mrsc.fit_threshold(metric, threshold, donorSetup, denoiseSetup,regression_method, verbose)

    # keep only the rows of metrics we actually want to predict
    pred = mrsc.predict()
    wanted_rows = pred.index.isin(predMetrics)
    pred = pred[wanted_rows]
    true = mrsc.getTrue()

    # one column per predicted season: "<player> 0", "<player> 1", ...
    pred.columns = [playerName+" "+ str(a) for a in range(pred_interval)]
    true.columns = [playerName+" "+ str(a) for a in range(pred_interval)]

    player_pred = pd.concat([player_pred, pred], axis=0)
    player_true = pd.concat([player_true, true], axis=0)

    # running view of the accumulated predictions
    print(player_pred)
    print()

player_pred

In [34]:
# debug: fit one player on one metric and look at the raw prediction.
# WARNING: this overwrites the notebook-level `playerName`, `metrics_list`,
# `metric`, `target` and `mrsc`.
playerName = playerNames[0]
playerName = 'James Harden'  # manual override of the default above
metrics_list = [[single_metric] for single_metric in predMetrics]  # no helper metrics
metric = metrics_list[0]

target = Target(playerName, allPivotedTableDict)
mrsc = mRSC(donor, target, pred_interval, probObservation=1)
mrsc.fit_threshold(
    metric, threshold, donorSetup, denoiseSetup, regression_method, verbose
)

# raw prediction frame (displayed as the cell output)
pred_list = mrsc.predict()
pred_list

Unnamed: 0,0,1
PTS_G,27.8184,25.1594


In [35]:
# Points per game for every stats row of the current player (sanity check
# against the prediction above).
player_rows = stats[stats.Player == playerName]
player_rows['PTS'] / player_rows['G']

15787     9.907895
16412    12.170732
16975    16.838710
17511    25.935897
18116    25.356164
18710    27.370370
19303    28.975610
19874    29.086420
dtype: float64

In [113]:
# debug: rebuild the fitted trajectory and the normalisation weights for the
# i-th metric of the current `mrsc` fit.
i = 0
metric = mrsc.metrics[i]

# columns of the donor matrix that belong to metric i
interv = mrsc.model.interv_index
metric_cols = slice(i*interv, (i+1)*interv)

true_traj = mrsc.target.data[metric].dropna(axis='columns').values.flatten()
donor_block = mrsc.model.donor_pre.iloc[:, metric_cols]
pred_traj = np.dot(donor_block.T, mrsc.model.beta).flatten()

# same slicing arguments for both weight rows
preint_args = dict(intervIndex=mrsc.interv_index, totalIndex=mrsc.total_index,
                   nbrMetrics=mrsc.num_k, reindex=True)

mean_pre = utils.get_preint_data(combinedDF=mrsc.weights[0].to_frame().T, **preint_args)

var_pre = utils.get_preint_data(combinedDF=mrsc.weights[1].to_frame().T, **preint_args)

['PTS_G', 'MP_G']

In [None]:
# for individual player prediction 
def getNormalizedMetric(stats, metric, newColName):
    """
    Per-game value of one metric for every (Player, year_count) pair.

    stats: (df) stats dataframe with the columns "year_count", "Player", "G"
           and `metric`
    metric: (string) column of stats to convert
    newColName: (string) name of the resulting column

    output: (df) indexed by (Player, year_count); holds the right-most column
            after `newColName` has been set to sum(metric) / sum(G).
    """
    groupKeys = ["Player", "year_count"]
    selected = stats.loc[:, ["year_count", "Player", "G", metric]]

    # several rows of the same player and year are summed before dividing
    totals = selected.groupby(groupKeys).sum()
    perGame = totals[metric] / totals["G"]
    totals[newColName] = perGame

    # only the last column is handed back, as a one-column frame
    return totals.iloc[:, -1:]

def getNormalizedMetricDict(stats, metrics):
    """
    Build one per-game table per metric.

    stats: (df) stats dataframe
    metrics: (list) column names (strings) of stats df

    output: (dict) maps "<metric>_G" to the df returned by
            getNormalizedMetric for that metric.
    """
    return {
        metric + "_G": getNormalizedMetric(stats, metric, metric + "_G")
        for metric in metrics
    }


def prepareData_(stats, total_stats, normalized_stats):
    """Variant of prepareData that takes the metric lists as arguments.

    Parameters
    ----------
    stats : DataFrame of season statistics, passed to both helpers.
    total_stats : list of 'total' stat columns to convert to per-game values.
    normalized_stats : list of stat columns that are already normalised.

    Returns
    -------
    allPivotedTableDict : the merged metric dictionary after
        ``getPivotedTableDict``. Unlike ``prepareData`` the list of metric
        names is not returned.

    NOTE(review): ``getConvertedMetricsPerGameDict`` and
    ``getNormalizedMetricsDict`` are not defined in this notebook; presumably
    they come from ``from mrsc.src.importData import *``. The notebook defines
    a similarly named ``getNormalizedMetricDict`` (singular) - confirm which
    one is intended.
    """
    # convert 'total' statistics to 'per game' statistics -> dictionary
    metricsConvertedDict = getConvertedMetricsPerGameDict(stats, total_stats)

    # already-normalised statistics -> dictionary
    metricsNormalizedDict = getNormalizedMetricsDict(stats, normalized_stats)

    # on a key collision the normalised entry wins
    allMetricsDict = {**metricsConvertedDict, **metricsNormalizedDict}
    return getPivotedTableDict(allMetricsDict)