In [213]:
import numpy as np
import pandas as pd
import datetime
import dtale
from statistics import mean
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split

In [182]:
raw_data = pd.read_csv('train.csv')
pd.set_option('display.max_rows', None)

In [183]:
def clean_transform(raw_data):
    #Drop duplicates
    raw_data = raw_data.drop_duplicates()
    
    #Change data types, column names, percentages to decimals
    rd = raw_data.astype({'season': str, 'starter': str, 'did_not_play': str, 'is_inactive': str})
    rd['game_date'] = pd.to_datetime(rd['game_date'], infer_datetime_format = True)
    rd = rd.rename(columns = {'Team_efg_pct': 'Team_efg_rate', 'Team_tov_pct' : 'Team_tov_rate', 
                              'Team_orb_pct': 'Team_orb_rate', 'Opponent_efg_pct': 'Opponent_efg_rate', 
                              'Opponent_tov_pct': 'Opponent_tov_rate', 'Opponent_orb_pct': 'Opponent_orb_rate',
                             'ts_pct': 'ts_rate', 'efg_pct': 'efg_rate', 'ts_pct': 'ts_rate', 'orb_pct': 'orb_rate', 
                             'drb_pct': 'drb_rate', 'trb_pct': 'trb_rate', 'ast_pct': 'ast_rate', 
                              'stl_pct': 'stl_rate','blk_pct': 'blk_rate', 'tov_pct': 'tov_rate', 
                              'usg_pct': 'usg_rate', 'ft_pct': 'ft_rate', 'fg_pct': 'fg_rate', 'fg3_pct': 'fg3_rate'})
    rd[['Team_tov_rate', 'Team_orb_rate', 'Opponent_orb_rate', 'Opponent_tov_rate', 'orb_rate', 
        'drb_rate', 'trb_rate', 'ast_rate', 'stl_rate', 'blk_rate', 'tov_rate', 'usg_rate']] = rd[['Team_tov_rate', 'Team_orb_rate', 'Opponent_orb_rate', 'Opponent_tov_rate', 'orb_rate', 
        'drb_rate', 'trb_rate', 'ast_rate', 'stl_rate', 'blk_rate', 'tov_rate', 'usg_rate']]/100
    raw_data = rd
    
    #Create inactives table
    global inactives
    inactives = raw_data[['game_id', 'Inactives']]
    
    #Drop rows for games that went to overtime
    raw_data = raw_data[raw_data['OT'] == 0]
    
    #Remove unnecessary columns
    global clean_data
    clean_data = raw_data.drop(['OT', 'player', 'mp', 'double_double', 'triple_double', 'active_position_minutes', 
                   'PG%', 'SG%', 'SF%', 'C%', 'Inactives', 'pf', 'is_inactive', 'PF%',
                    'last_60_minutes_per_game_bench', 'last_60_minutes_per_game_starting',
                    'DKP', 'FDP', 'SDP', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'ts'], axis = 1, 
                               inplace = False)
    
    #New col concatenate game_id and player_id
    clean_data.loc[:, 'game_id_player_id'] = clean_data['game_id'].astype('str') + clean_data['player_id'].astype('str')
    
    #Dedupe new column, move to first
    duplicates = clean_data.duplicated(['game_id_player_id'], keep='first')
    clean_data = clean_data[~duplicates]
    new_id = clean_data.pop('game_id_player_id')
    clean_data.insert(0, 'game_id_player_id', new_id)
    
    #Filter out did not play
    #clean_data = clean_data[clean_data['did_not_play'] == '0']
    
    #Sort by player, date
    clean_data = clean_data.sort_values(by = ['player_id', 'game_date'])
    
    #Reset index
    clean_data.reset_index(drop = True, inplace = True)

In [184]:
clean_transform(raw_data)

In [185]:
def add_feat(clean_data):
    #Create player numeric id dictionary, add player number as column
    unique_players = pd.DataFrame(clean_data['player_id'].unique())
    dict = unique_players.to_dict()[0]
    player_dict = {v:k  for k,v in dict.items()}
    clean_data['player_num'] = clean_data['player_id'].map(player_dict)

    #Create player game number column
    clean_data.insert(56, 'player_game_num', 0)
    clean_data['player_game_num'].iloc[0] = 1
    for i in range(1, len(clean_data)):
        if clean_data.iloc[i, 55] != clean_data.iloc[i-1, 55]:
            clean_data['player_game_num'].iloc[i] = 1
        else:
            clean_data['player_game_num'].iloc[i] = (clean_data['player_game_num'].iloc[i-1] + 1)

In [186]:
add_feat(clean_data)

In [187]:
def describe(dataframe):
    print('Shape: ', dataframe.shape)
    print(dataframe.columns)
    print(dataframe.dtypes)

In [188]:
def transpose(dataframe):
    transpose = dataframe.iloc[0:9].T
    return transpose.head(len(dataframe))

In [189]:
def open_dtale(dataframe):
    dtale.show(dataframe).open_browser()

In [190]:
open_dtale(clean_data)

In [254]:
def minute_data(clean_data):
    global min_data
    min_data = clean_data[['game_date', 'starter', 'did_not_play', 'game_id_player_id', 'player_num','player_id', 'did_not_play', 'minutes', 'player_game_num']]
    min_data['pred_min'] = min_data['minutes']

In [255]:
minute_data(clean_data)

In [261]:
def build(min_data):
    #Set rolling weights
    global datfrm
    df = min_data.copy()
    weights3 = np.array([.15, .3, .55])
    weights5 = np.array([.05, .1, .2, .3, .35])
    weights10 = np.array([.01, .03, .05, .07, .09, .11, .13, .15, .17, .19])
    datfrm = pd.DataFrame()
    for i in df['player_id'].unique():
        min_build = df[df['player_id'] == i].sort_values(by = 'game_date')
        min_build['prevgm'] = min_build['minutes'].shift(1)
        min_build['prev3'] = min_build['minutes'].rolling(3).mean().shift(1)
        min_build['prev3wtd'] = min_build['minutes'].rolling(3).apply(lambda x: np.sum(weights3*x)).shift(1)
        min_build['prev5'] = min_build['minutes'].rolling(5).mean().shift(1)
        min_build['prev5wtd'] = min_build['minutes'].rolling(5).apply(lambda x: np.sum(weights5*x)).shift(1)
        min_build['prev10'] = min_build['minutes'].rolling(10).mean().shift(1)
        min_build['prev10wtd'] = min_build['minutes'].rolling(10).apply(lambda x: np.sum(weights10*x)).shift(1)
        min_build['prev3med'] = min_build['minutes'].rolling(3).median().shift(1)
        min_build['prev5med'] = min_build['minutes'].rolling(5).median().shift(1)
        min_build['prev10med'] = min_build['minutes'].rolling(10).median().shift(1)
        min_build['prev3std'] = min_build['minutes'].rolling(3).std().shift(1)
        min_build['prev5std'] = min_build['minutes'].rolling(5).std().shift(1)
        min_build['prev10std'] = min_build['minutes'].rolling(10).std().shift(1)
        min_build['prev_starter'] = min_build['starter'].shift(1)
        #min_build['prev_dnp'] = min_build['did_not_play'].shift(1)
        datfrm = datfrm.append(min_build.dropna())

In [262]:
build(min_data)

In [263]:
def test_metrics(datfrm):
    prevgm = ((datfrm.prevgm - datfrm.minutes) ** 2).mean() ** .5
    prev3 = ((datfrm.prev3 - datfrm.minutes) ** 2).mean() ** .5
    prev3wtd = ((datfrm.prev3wtd - datfrm.minutes) ** 2).mean() ** .5
    prev5 = ((datfrm.prev5 - datfrm.minutes) ** 2).mean() ** .5
    prev5wtd = ((datfrm.prev5wtd - datfrm.minutes) ** 2).mean() ** .5
    prev10 = ((datfrm.prev10 - datfrm.minutes) ** 2).mean() ** .5
    prev10wtd = ((datfrm.prev10wtd - datfrm.minutes) ** 2).mean() ** .5
    
    print('RMSE Previous Game = ', prevgm)
    print('RMSE Avg. Previous 3 Games = ', prev3)
    print('RMSE Weighted Avg. Prev 3 Games = ', prev3wtd)
    print('RMSE Avg. Previous 5 Games = ', prev5)
    print('RMSE Weighted Avg. Prev 5 Games = ', prev5wtd)
    print('RMSE Avg. Previous 10 Games = ', prev10)
    print('RMSE Weighted Avg. Prev 10 Games = ', prev10wtd)

In [266]:
test_metrics(datfrm)

RMSE Previous Game =  8.41845680820872
RMSE Avg. Previous 3 Games =  7.743383673098178
RMSE Weighted Avg. Prev 3 Games =  7.613273196931532
RMSE Avg. Previous 5 Games =  7.759838651287435
RMSE Weighted Avg. Prev 5 Games =  7.540822749766764
RMSE Avg. Previous 10 Games =  7.978526754681877
RMSE Weighted Avg. Prev 10 Games =  7.618897477978181


In [193]:
def train_test(min_data):
    x = datfrm[['starter', 'did_not_play', ]]

In [258]:
min_data.head()

Unnamed: 0,game_date,starter,did_not_play,game_id_player_id,player_num,player_id,did_not_play.1,minutes,player_game_num,pred_min
0,2020-12-23,0,0,202012230ORLachiupr01,0,achiupr01,0,13.65,1,13.65
1,2020-12-25,0,0,202012250MIAachiupr01,0,achiupr01,0,19.466667,2,19.466667
2,2020-12-29,0,0,202012290MIAachiupr01,0,achiupr01,0,17.633333,3,17.633333
3,2020-12-30,0,0,202012300MIAachiupr01,0,achiupr01,0,12.016667,4,12.016667
4,2021-01-01,0,0,202101010DALachiupr01,0,achiupr01,0,11.983333,5,11.983333


In [237]:
for i in min_data['player_id'].unique():
    print (i)

achiupr01
adamsja01
adamsst01
adebaba01
aldrila01
alexaky01
alexani01
alexaty01
allengr01
allenja01
allenka01
aminual01
anderju01
anderky01
anderry01
antetgi01
antetko01
antetth01
anthoca01
anthoco01
anunoog01
arcidry01
arizatr01
augusdj01
avdijde01
aytonde01
azubuud01
bacondw01
baglema01
ballla01
balllo01
bambamo01
banede01
bareajo01
barneha02
barrerj01
bartowi01
bateske01
batumni01
baynear01
bazemke01
bazleda01
bealbr01
beaslma01
belinma01
belljo01
bembrde01
bendedr01
bertada01
beverpa01
beysa01
beyty01
birchkh01
bitadgo01
biyombi01
bjeline01
bledser01
blevike01
bogdabo01
bogdabo02
bolbo01
boldejo01
boldema01
bonejo01
bongais01
bookede01
bouchch01
bowenbr02
bowmaky01
bradlav01
bradlto01
brantja01
brazdig01
breweco01
bridgmi01
bridgmi02
brissos01
broekry01
brogdma01
brookdi01
brownbr01
brownch02
brownja02
brownmo01
brownst02
browntr01
brunsja01
bryanth01
bullore01
burketr01
burksal01
burtode02
butleji01
cabocbr01
cacokde01
caldwke01
campafa01
cancavl01
capelca01
careyve01
carrode01
ca