In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

def getActivePlayers(stats, year, buffer):
    """
    Names of the players who appear in `year` and in every one of the
    `buffer` seasons immediately before it.

    stats:  DataFrame with (at least) the columns "Player" and "Year"
    year:   the season of interest
    buffer: how many earlier seasons (year-1 ... year-buffer) a player must
            also appear in; 0 means only `year` itself is checked

    Returns a list of player names, ordered by first appearance in `year`.
    The order is therefore the same on every run (a set intersection would
    make it depend on string hashing).
    """
    players = list(stats.loc[stats.Year == year, "Player"].unique())
    for offset in range(1, buffer + 1):
        if not players:
            # nothing left to filter
            break
        earlier = set(stats.loc[stats.Year == (year - offset), "Player"].unique())
        players = [name for name in players if name in earlier]
    return players

def topPlayers(stats, year, metric, n):
    """
    The `n` players with the highest average `metric` in season `year`.

    stats:  DataFrame with the columns "Player", "Year", "player_id" and `metric`
    year:   season to rank
    metric: name of the numeric column to rank by
    n:      number of rows to return

    A player with several rows in the same season is reduced to the mean of
    those rows before ranking. Returns a DataFrame with the columns
    ["Player", "player_id"], best player first. player_id is averaged like
    every other numeric column, so it comes back as a float.
    """
    season = stats[stats.Year == year]
    # numeric_only=True: text columns cannot be averaged, and recent pandas
    # raises instead of silently dropping them.
    season_avg = season.groupby("Player").mean(numeric_only=True).reset_index()
    # `season` holds a single year already, so no second Year filter is needed.
    ranked = season_avg.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player", "player_id"]][:n]

def removeDuplicated(players, stats):
    """
    Drop the season rows of every player whose name is shared by more than
    one entry in `players`, then attach career information to the rest.

    players: "../data/nba-players-stats/player_data.csv"
             (needs the columns name, year_start, year_end)
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
             (needs the columns Player, Year)

    Returns a new DataFrame: `stats` left-merged with `players` on the player
    name, restricted to rows whose Year lies within [year_start, year_end],
    plus a "year_count" column (Year - year_start). Neither input is modified.
    """
    # names that occur on more than one row of `players`
    shared_name = players.name.duplicated(keep=False)
    duplicated = players.name[shared_name].unique()

    start_year = players.rename(columns={"name": "Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense: seasons inside the player's career
    in_career = (
        (stats_not_duplicated.Year >= stats_not_duplicated.year_start)
        & (stats_not_duplicated.Year <= stats_not_duplicated.year_end)
    )
    # .copy() so the new column is written to an independent frame, not to a
    # slice of the merged one (avoids SettingWithCopyWarning)
    stats_not_duplicated = stats_not_duplicated[in_career].copy()
    stats_not_duplicated["year_count"] = stats_not_duplicated.Year - stats_not_duplicated.year_start

    return stats_not_duplicated

"""
import data
"""
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
# only keep players whose first season is 1980 or later; .copy() so the
# player_id column below is added to an independent frame, not to a slice
players = players[players.year_start >= 1980].copy()
players["player_id"] = range(0,len(players.name)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only seasons from 1980 onwards
stats = stats[stats.Year >= 1980]

# without duplicated names --> TODO: how to distinguish multiple players with the same name
stats = removeDuplicated(players, stats)
# cast on a fresh frame rather than assigning columns back onto a filtered one
stats = stats.astype({"Year": int, "year_count": int})

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year
# (the get*Dict helpers are not defined in this notebook; presumably they come
#  from the `mrsc.src.importData` star import -- confirm)

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)
allMetrics = list(allMetricsDict.keys())

# this matrix will be used to mask the table:
# one row per player, one column per year_count, holding the calendar Year
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

donor = Donor(allPivotedTableDict, df_year)

In [2]:
pred_year = 2016
window = 5


def printEnergySpectrum(matrix, maxRank=9):
    """
    Print, for k = 1..maxRank, the fraction of the total squared singular
    values of `matrix` that is carried by its k largest singular values.

    matrix:  2-D array-like accepted by np.linalg.svd
    maxRank: largest k to report; once k exceeds the number of singular
             values the printed fraction stays at 1.0
    """
    # Only the singular values are needed, so skip computing U and V.
    singular = np.linalg.svd(matrix, compute_uv=False)
    energy = singular ** 2
    totalEnergy = energy.sum()
    for k in range(1, maxRank + 1):
        print(k, " : ", energy[:k].sum() / totalEnergy)


print("all appended")
printEnergySpectrum(donor.concat(allMetrics, pred_year, window, method = "fixed"))

for metric in allMetrics:
    print(metric)
    # one metric at a time, passed as a one-element list like the call above
    printEnergySpectrum(donor.concat([metric], pred_year, window, method = "fixed"))

all appended
1  :  0.9203951046990634
2  :  0.9433739675505767
3  :  0.9563490223449069
4  :  0.9677644790474129
5  :  0.9747006571579189
6  :  0.9803023977674028
7  :  0.9845665152878789
8  :  0.9883089353846004
9  :  0.9916062285654382
PTS_G
1  :  0.9476598340842909
2  :  0.973717413752437
3  :  0.9854896127336391
4  :  0.994123034497006
5  :  1.0
6  :  1.0
7  :  1.0
8  :  1.0
9  :  1.0
AST_G
1  :  0.9369129449182593
2  :  0.9676052554495106
3  :  0.9834412207143985
4  :  0.9933376123307397
5  :  1.0
6  :  1.0
7  :  1.0
8  :  1.0
9  :  1.0
TOV_G
1  :  0.9468469209102359
2  :  0.9713674391348321
3  :  0.9839556331273784
4  :  0.9932346577979597
5  :  1.0
6  :  1.0
7  :  1.0
8  :  1.0
9  :  1.0
TRB_G
1  :  0.9505154535227764
2  :  0.9730967705012866
3  :  0.9844040512211585
4  :  0.993131368484196
5  :  1.0
6  :  1.0
7  :  1.0
8  :  1.0
9  :  1.0
STL_G
1  :  0.9386025675629686
2  :  0.9646332596355865
3  :  0.9786465177319699
4  :  0.9913623080824963
5  :  1.0
6  :  1.0
7  :  1.0
8  : 

In [3]:
pred_year = 2016

# Same spectrum check on all metrics together, repeated for window lengths 5..10:
# for each k, the share of the total squared singular values held by the top k.
for window in range(5, 11):
    print(window)
    df = donor.concat(allMetrics, pred_year, window, method = "fixed")
    u, s, v = np.linalg.svd(df, full_matrices=False)
    energy = s ** 2
    totalEnergy = np.sum(energy)
    for k in range(1, 10):
        print(k, " : ", np.sum(energy[:k]) / totalEnergy)

5
1  :  0.9203951046990634
2  :  0.9433739675505767
3  :  0.9563490223449069
4  :  0.9677644790474129
5  :  0.9747006571579189
6  :  0.9803023977674028
7  :  0.9845665152878789
8  :  0.9883089353846004
9  :  0.9916062285654382
6
1  :  0.9213891890304468
2  :  0.9418055039746284
3  :  0.9548592226018466
4  :  0.965976297590812
5  :  0.9732187679341158
6  :  0.9777052047056335
7  :  0.981736330276799
8  :  0.9855181992326862
9  :  0.9886586803307197
7
1  :  0.9228612935932926
2  :  0.9417588025446199
3  :  0.9545676838551568
4  :  0.9655921765715119
5  :  0.9723976435109617
6  :  0.9765208812462156
7  :  0.980121120481511
8  :  0.9835917395113659
9  :  0.9866885603464957
8
1  :  0.9230618247818819
2  :  0.9409143751672893
3  :  0.9535714995887844
4  :  0.9645425348046555
5  :  0.9711709601703379
6  :  0.9753941505800546
7  :  0.9789101342745803
8  :  0.9820459752549308
9  :  0.9849549441223056
9
1  :  0.9233944552390896
2  :  0.9410453552468935
3  :  0.9538485297053614
4  :  0.9645473279

In [4]:
# For every metric table: how many of the non-missing cells are exactly zero?
for metric in allMetrics:
    print(metric)
    df = allPivotedTableDict[metric]
    total = df.notna().values.sum()   # cells that hold a value
    zeros = df.eq(0).values.sum()     # cells whose value is 0
    print("total: ", total)
    print("zeros: ", zeros)
    print("%    : ", np.round(zeros / total * 100, 3), "%")

PTS_G
total:  13579
zeros:  128
%    :  0.943 %
AST_G
total:  13579
zeros:  395
%    :  2.909 %
TOV_G
total:  13579
zeros:  262
%    :  1.929 %
TRB_G
total:  13579
zeros:  119
%    :  0.876 %
STL_G
total:  13579
zeros:  591
%    :  4.352 %
BLK_G
total:  13579
zeros:  1210
%    :  8.911 %
FG%
total:  13579
zeros:  176
%    :  1.296 %
FT%
total:  13579
zeros:  491
%    :  3.616 %
3P%
total:  13579
zeros:  4826
%    :  35.54 %
PER_w
total:  13579
zeros:  16
%    :  0.118 %


In [5]:
# Mean and variance of every metric after dividing it by its own mean,
# so the printed variances are comparable across metrics.
allMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%","TRB_G","STL_G","BLK_G"]
# offMetrics = ["PTS_G","AST_G","TOV_G","PER_w", "FG%","FT%","3P%"]
# defMetrics = ["TRB_G","STL_G","BLK_G"]
means, variances = [], []
for metric in allMetrics:
    print(metric)
    cells = allPivotedTableDict[metric].values.flatten()
    observed = cells[np.logical_not(np.isnan(cells))]   # drop missing cells
    scaled = observed / observed.mean()
    scaledMean = scaled.mean()
    scaledVar = np.var(scaled)
    means.append(scaledMean)
    variances.append(scaledVar)
    print("mean: ", scaledMean)
    print("var : ", scaledVar)

PTS_G
mean:  1.0
var :  0.511229771121998
AST_G
mean:  0.9999999999999998
var :  0.9946510554585649
TOV_G
mean:  0.9999999999999999
var :  0.40800810744671595
PER_w
mean:  1.0000000000000002
var :  0.21952921550988166
FG%
mean:  1.0000000000000002
var :  0.04548637469474005
FT%
mean:  1.0
var :  0.07030445274278505
3P%
mean:  1.0
var :  0.766035728063237
TRB_G
mean:  1.0
var :  0.49796863056682394
STL_G
mean:  1.0
var :  0.49356725167626314
BLK_G
mean:  1.0000000000000002
var :  1.400042933728878


In [6]:
# Raw (un-normalised) mean and variance of a subset of the metrics,
# followed by 1/mean and 1/(variance + 1) for each of them.
metrics = ["AST_G","TOV_G", "FG%","FT%","3P%"]
means, variances = [], []
for metric in metrics:
    print(metric)
    df = allPivotedTableDict[metric]
    vals = df.values.flatten()
    vals = vals[np.logical_not(np.isnan(vals))]   # drop missing cells
    metricMean = vals.mean()
    metricVar = np.var(vals)
    means.append(metricMean)
    variances.append(metricVar)
    print("mean: ", metricMean)
    print("var : ", metricVar)

print()
print("mean")
print([1 / m for m in means])
print()
print("var")
print([1 / (v + 1) for v in variances])

AST_G
mean:  1.7957411223657396
var :  3.2074375110227415
TOV_G
mean:  1.2177917024718974
var :  0.6050828087669478
FG%
mean:  0.43785559339244096
var :  0.00872053498051688
FT%
mean:  0.6990819564217532
var :  0.03435888152493174
3P%
mean:  0.21029194254679157
var :  0.033876169034140376
mean
[0.556873141426189, 0.8211584936653621, 2.28385800042463, 1.4304474472757014, 4.755293939887841]

var
[0.23767435579974203, 0.6230208152115324, 0.991354855306197, 0.9667824367938164, 0.9672338234995899]
