In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

In [2]:
def getActivePlayers(stats, year):
    # list of name of the players who were active in this and last year
    thisYear = stats[stats.Year == year].copy()
    lastYear = stats[stats.Year == (year-1)].copy()
    return list(set(thisYear.Player.unique()) & set(lastYear.Player.unique()))

def topPlayers(stats, year, metric, n):
    stats = stats[stats.Year == year]
    stats = stats.groupby('Player').mean().reset_index()
    stats_sorted = stats[stats.Year == year].sort_values(metric, ascending = False).reset_index(drop=True)
    return stats_sorted[["Player","player_id"]][:n]

def removeDuplicated(players, stats):
    """
    players: "../data/nba-players-stats/player_data.csv"
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
    """
    # players with the same name
    names = players.name.unique()
    duplicated = np.array([])

    for name in names:
        numrows = len(players[players.name == name])
        if numrows != 1:
            duplicated = np.append(duplicated, name)

    duplicated = np.sort(duplicated)

    start_year = players.copy()
    start_year = start_year.rename(columns={"name":"Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense
    stats_not_duplicated = stats_not_duplicated[(stats_not_duplicated.Year >= stats_not_duplicated.year_start) & (stats_not_duplicated.Year <= stats_not_duplicated.year_end )]
    stats_not_duplicated["year_count"] = stats_not_duplicated.Year - stats_not_duplicated.year_start

    return stats_not_duplicated

# Clean Data

In [3]:
"""
import data
"""
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
players = players[players.year_start >= 1980] # only choose players who started after 1980
players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only after 1980
stats = stats[stats.Year >= 1980]

# without duplicated names --> to do: how to distinguish multiple player with the same name
stats = removeDuplicated(players, stats)
stats.Year = stats.Year.astype(int)
stats.year_count = stats.year_count.astype(int)

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)

# this matrix will be used to mask the table
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

In [43]:
activePlayers = getActivePlayers(stats, 2017)
activePlayers.sort()
offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
expSetup = ["sliding", "SVD", "pre", "pinv", False]

pred_all = pd.DataFrame()
true_all = pd.DataFrame()
for playerName in activePlayers:
    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    mrsc = mRSC(donor, target, probObservation=1)
    mrsc.fit(offMetrics, 2017, pred_length =1, singvals=8, setup = expSetup)
    
    pred = mrsc.predict()
    true = mrsc.getTrue()
    pred.columns = [playerName]
    true.columns = [playerName]
    
    pred_all = pd.concat([pred_all, pred], axis=1)
    true_all = pd.concat([true_all, true], axis=1)

mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
mape.mean(axis=1)

In [44]:
true_all

Unnamed: 0,Aaron Brooks,Aaron Gordon,Aaron Harrison,Adreian Payne,Al Horford,Al Jefferson,Al-Farouq Aminu,Alan Anderson,Alan Williams,Alec Burks,...,Vince Carter,Wayne Ellington,Wesley Johnson,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willie Reed,Zach LaVine,Zach Randolph,Zaza Pachulia
PTS_G,4.95385,12.7375,0.2,3.5,14.0,8.10606,8.72131,2.86667,7.3617,6.7381,...,8.0274,10.4516,2.73529,13.5068,13.6667,8.14667,5.26761,18.9149,14.0822,6.08571
AST_G,1.92308,1.875,0.6,0.388889,4.95588,0.863636,1.62295,0.366667,0.489362,0.714286,...,1.82192,1.12903,0.338235,2.87671,3.43333,1.06667,0.366197,2.95745,1.67123,1.88571
TOV_G,1.01538,1.1125,0.0,0.444444,1.70588,0.5,1.54098,0.233333,0.787234,0.833333,...,0.684932,0.483871,0.264706,1.39726,1.61667,0.92,0.43662,1.80851,1.35616,1.25714
PER_w,9.5,14.4,-2.2,14.4,17.7,18.9,11.3,5.0,19.5,11.6,...,11.7,12.6,8.4,11.9,15.5,16.4,17.1,14.6,18.5,16.1
FG%,0.403333,0.454335,0.0,0.425926,0.473159,0.498938,0.392704,0.375,0.516854,0.399194,...,0.393878,0.416216,0.365,0.393152,0.442943,0.530146,0.568421,0.459155,0.44917,0.534202
FT%,0.8,0.718894,0.5,0.736842,0.8,0.764706,0.705882,0.75,0.625,0.769231,...,0.765217,0.860465,0.647059,0.815642,0.752632,0.668874,0.556818,0.835714,0.73057,0.777778
3P%,0.375,0.28839,0.0,0.2,0.355372,0.0,0.330189,0.318182,0.0,0.328947,...,0.378378,0.378173,0.245763,0.363257,0.370213,0.0,0.25,0.387097,0.223404,0.0


In [45]:
pred_all

Unnamed: 0,Aaron Brooks,Aaron Gordon,Aaron Harrison,Adreian Payne,Al Horford,Al Jefferson,Al-Farouq Aminu,Alan Anderson,Alan Williams,Alec Burks,...,Vince Carter,Wayne Ellington,Wesley Johnson,Wesley Matthews,Will Barton,Willie Cauley-Stein,Willie Reed,Zach LaVine,Zach Randolph,Zaza Pachulia
PTS_G,8.80332,8.61646,1.29845,2.95119,13.3484,12.519,7.88967,2.79557,4.29973,12.3335,...,6.12448,6.76789,7.30779,11.3713,11.7503,7.02413,5.26899,12.4123,11.5391,6.28947
AST_G,2.70244,1.58476,0.220534,0.524649,2.73049,1.56287,1.54553,0.561274,0.845174,2.48071,...,1.4424,1.178,1.22876,1.78528,2.33242,0.689271,0.495917,3.00002,1.94073,1.48559
TOV_G,1.28259,1.0694,0.226826,0.708262,1.73739,1.43614,1.10103,0.527201,0.891259,1.61636,...,0.685474,0.902499,0.941436,1.36375,1.55472,0.876306,0.735865,1.96662,1.51276,0.961559
PER_w,11.6985,13.883,4.89038,7.0793,17.6722,17.1499,12.0342,5.11515,13.1621,13.4981,...,12.4994,9.86415,9.36364,12.5927,14.1921,13.568,13.7472,14.1879,15.0011,11.8629
FG%,0.36741,0.45323,0.250392,0.364883,0.542023,0.542374,0.396657,0.251263,0.387799,0.386378,...,0.522249,0.322705,0.296551,0.372955,0.409476,0.519889,0.498381,0.47066,0.452391,0.391585
FT%,0.63151,0.725495,0.426404,0.620816,0.880982,0.867957,0.614521,0.447366,0.55928,0.63976,...,0.895393,0.504048,0.476313,0.625778,0.642635,0.680412,0.596637,0.84192,0.695439,0.580706
3P%,0.24329,0.227018,0.181653,0.190327,0.282385,0.245892,0.172785,0.100218,0.066596,0.218626,...,0.0652979,0.142557,0.153567,0.222875,0.194937,0.0913868,0.0452409,0.298838,0.269236,0.127424


In [49]:
mask = (true_all !=0 )
mape = np.abs(pred_all - true_all) / true_all[mask]
mape.mean(axis=1)

PTS_G    0.382870
AST_G    0.465873
TOV_G    0.439564
PER_w    0.178842
FG%      0.176911
FT%      0.204882
3P%      0.374395
dtype: float64