In [1]:
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import copy

from mrsc.src.model.SVDmodel import SVDmodel
from mrsc.src.model.Target import Target
from mrsc.src.model.Donor import Donor
from mrsc.src.synthcontrol.mRSC import mRSC
from mrsc.src.importData import *
import mrsc.src.utils as utils

In [2]:
def topPlayers(stats, year, metric, n):
    """
    Return the top-n players of one season, ranked by a metric.

    stats: DataFrame of season stats; must contain the columns "Player", "Year",
           "player_id" and the column named by `metric`
    year: season to rank (compared against stats.Year)
    metric: name of the numeric column to rank by
    n: maximum number of players to return

    Returns a DataFrame with the columns ["Player", "player_id"], ordered by the
    per-player mean of `metric` in descending order. Because the values are
    averaged, "player_id" comes back as a float.
    """
    season = stats[stats.Year == year]
    # Collapse any repeated rows of a player within the season into one row.
    # numeric_only=True: text columns cannot be averaged and make mean() raise
    # a TypeError on pandas >= 2.0.
    per_player = season.groupby("Player").mean(numeric_only=True).reset_index()
    ranked = per_player.sort_values(metric, ascending=False).reset_index(drop=True)
    return ranked[["Player", "player_id"]][:n]

def removeDuplicated(players, stats):
    """
    Drop the stats of players whose name is shared by several players, then
    attach the career information of the remaining players.

    players: "../data/nba-players-stats/player_data.csv"
             (needs the columns "name", "year_start", "year_end")
    stats: "../data/nba-players-stats/Seasons_Stats.csv"
           (needs the columns "Player", "Year")

    Returns a new DataFrame: the rows of `stats` for uniquely-named players,
    left-merged with `players` on the player name, restricted to seasons inside
    [year_start, year_end], plus a "year_count" column (Year - year_start).
    Neither input frame is modified.
    """
    # names that appear on more than one row of `players`
    # (missing names are ignored rather than treated as duplicates)
    names = players.name.dropna()
    duplicated = names[names.duplicated(keep=False)].unique()

    # rename returns a new frame, so `players` itself is left untouched
    start_year = players.rename(columns={"name": "Player"})

    # for non-duplicated players
    stats_not_duplicated = stats[~stats.Player.isin(duplicated)]
    stats_not_duplicated = pd.merge(stats_not_duplicated, start_year, on="Player", how="left")

    # only take the values that make sense: seasons inside the player's career span
    in_career = (
        (stats_not_duplicated.Year >= stats_not_duplicated.year_start)
        & (stats_not_duplicated.Year <= stats_not_duplicated.year_end)
    )
    stats_not_duplicated = stats_not_duplicated[in_career]

    # assign() builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning
    return stats_not_duplicated.assign(
        year_count=stats_not_duplicated.Year - stats_not_duplicated.year_start
    )

# Clean Data

In [3]:
"""
import data
"""
players = pd.read_csv("../data/nba-players-stats/player_data.csv")
# only choose players who started in 1980 or later; .copy() makes this an
# independent frame, so adding a column below does not write into a slice
# of the raw table (SettingWithCopyWarning)
players = players[players.year_start >= 1980].copy()
players["player_id"] = range(len(players)) # assign id

stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
stats = stats[stats.Player.isin(players.name)]

# only 1980 and later
stats = stats[stats.Year >= 1980]

# without duplicated names --> TODO: how to distinguish multiple players with the same name
# astype() returns a new frame, so both columns are cast to int without
# assigning into the filtered frame returned by removeDuplicated
stats = removeDuplicated(players, stats).astype({"Year": int, "year_count": int})

# transform stats to a dictionary composed of df's for each stat
# the stats are re-calculated to get one stat for each year
# NOTE(review): the get*Dict helpers are not defined in this notebook; they
# presumably come from `from mrsc.src.importData import *` -- confirm

metricsPerGameColNames = ["PTS","AST","TOV","TRB","STL","BLK"]
metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

metricsPerCentColNames = ["FG","FT","3P"]
metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

metricsWeightedColNames = ["PER"]
metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

# merge the three metric dictionaries; on a key clash the later dictionary wins
allMetricsDict = {**metricsPerGameDict, **metricsPerCentDict, **metricsWeightedDict}
allPivotedTableDict = getPivotedTableDict(allMetricsDict)

# this matrix will be used to mask the table
# rows: Player, columns: year_count, values: Year
df_year = pd.pivot_table(stats, values="Year", index="Player", columns = "year_count")

In [4]:
# Build the target (constructed with the player name "A.C. Green") and the donor
# object from the same pivoted metric tables and the Player x year_count mask.
# NOTE(review): the exact roles of Target/Donor are inferred from their names --
# confirm in mrsc.src.model.
target = Target("A.C. Green", allPivotedTableDict, df_year)
donor = Donor(allPivotedTableDict, df_year)

# NOTE(review): probObservation=1 presumably means every entry is treated as
# observed -- confirm against the mRSC class.
mrsc = mRSC(donor, target, probObservation=1)
# Fit using the "PTS_G" and "AST_G" metrics; 1991, pred_length and singvals are
# passed straight to mRSC.fit (their meaning is not visible in this notebook).
# NOTE(review): the output recorded for this cell is
# "NameError: name 'getPlayerFromDict' is not defined"; that name is not defined
# in this notebook, so the failure presumably originates in the mrsc package --
# verify on a fresh kernel.
mrsc.fit(["PTS_G","AST_G"], 1991, pred_length =1, singvals=4)

NameError: name 'getPlayerFromDict' is not defined

In [6]:
# Display the `beta` attribute of the model held by `mrsc`; the recorded output
# is a column array with one value per row.
# NOTE(review): this relies on the earlier cell that creates and fits `mrsc`;
# the recorded execution counter jumps from 4 to 6, so re-check with
# Restart Kernel -> Run All.
mrsc.model.beta

array([[ 0.01021604],
       [ 0.00686351],
       [-0.00463282],
       [ 0.00954095],
       [ 0.00517331],
       [ 0.00646694],
       [ 0.00672477],
       [ 0.00044276],
       [ 0.01704045],
       [ 0.00901003],
       [ 0.01546417],
       [ 0.00713033],
       [ 0.01513654],
       [ 0.01628498],
       [ 0.00985618],
       [ 0.01610172],
       [ 0.0004886 ],
       [ 0.0029531 ],
       [ 0.00265517],
       [ 0.02886593],
       [ 0.00256436],
       [ 0.00145352],
       [ 0.00158446],
       [-0.0042913 ],
       [-0.00683516],
       [-0.00306054],
       [-0.00483921],
       [ 0.00179352],
       [ 0.00309316],
       [ 0.02509695],
       [-0.01032495],
       [ 0.0033395 ],
       [ 0.00731574],
       [ 0.00844815],
       [-0.0030833 ],
       [-0.01029547],
       [-0.00035235],
       [ 0.01152368],
       [-0.00687635],
       [-0.00192736],
       [ 0.00153107],
       [ 0.00215423],
       [-0.0044992 ],
       [ 0.00046183],
       [ 0.00156041],
       [ 0