In [60]:
import numpy as np
import pandas as pd
import datetime
import dtale
from statistics import mean

In [141]:
# Never truncate rows when a frame is displayed.
pd.options.display.max_rows = None

# Load the training set from the notebook's working directory.
raw_data = pd.read_csv('train.csv')

In [142]:
def clean_transform(raw_data):
    """Clean the raw game-log frame and publish the results as module-level globals.

    Steps, in order: drop exact duplicate rows, cast the flag columns to ``str``,
    parse ``game_date``, rename ``*_pct`` columns to ``*_rate``, divide the listed
    rate columns by 100, snapshot the inactives table, drop overtime rows, drop
    unused columns, add the ``game_id_player_id`` key as the first column,
    de-duplicate per (game, player), sort by player and date, reset the index.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame containing every column referenced below. It is not modified.

    Returns
    -------
    None
        Results are written to the globals ``inactives`` (``game_id`` and
        ``Inactives``, taken before overtime rows are removed) and
        ``clean_data`` (the cleaned frame).
    """
    global inactives, clean_data

    rate_names = {
        'Team_efg_pct': 'Team_efg_rate', 'Team_tov_pct': 'Team_tov_rate',
        'Team_orb_pct': 'Team_orb_rate', 'Opponent_efg_pct': 'Opponent_efg_rate',
        'Opponent_tov_pct': 'Opponent_tov_rate', 'Opponent_orb_pct': 'Opponent_orb_rate',
        'ts_pct': 'ts_rate', 'efg_pct': 'efg_rate', 'orb_pct': 'orb_rate',
        'drb_pct': 'drb_rate', 'trb_pct': 'trb_rate', 'ast_pct': 'ast_rate',
        'stl_pct': 'stl_rate', 'blk_pct': 'blk_rate', 'tov_pct': 'tov_rate',
        'usg_pct': 'usg_rate', 'ft_pct': 'ft_rate', 'fg_pct': 'fg_rate', 'fg3_pct': 'fg3_rate',
    }
    # Only these columns are divided by 100; the remaining *_rate columns are
    # left untouched (presumably already stored as fractions -- TODO confirm).
    scaled_cols = ['Team_tov_rate', 'Team_orb_rate', 'Opponent_orb_rate', 'Opponent_tov_rate',
                   'orb_rate', 'drb_rate', 'trb_rate', 'ast_rate', 'stl_rate', 'blk_rate',
                   'tov_rate', 'usg_rate']
    unused_cols = ['OT', 'player', 'mp', 'double_double', 'triple_double', 'active_position_minutes',
                   'PG%', 'SG%', 'SF%', 'C%', 'Inactives', 'pf', 'is_inactive', 'PF%',
                   'last_60_minutes_per_game_bench', 'last_60_minutes_per_game_starting',
                   'DKP', 'FDP', 'SDP', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'ts']

    # Drop exact duplicate rows, then change data types.
    typed = raw_data.drop_duplicates().astype(
        {'season': str, 'starter': str, 'did_not_play': str, 'is_inactive': str})
    # `infer_datetime_format` is deprecated (pandas >= 2.0 warns and ignores it).
    typed['game_date'] = pd.to_datetime(typed['game_date'])

    # Rename percentage columns and convert the scaled ones to decimals.
    renamed = typed.rename(columns=rate_names)
    renamed[scaled_cols] = renamed[scaled_cols] / 100

    # Create inactives table (before overtime rows are filtered out).
    inactives = renamed[['game_id', 'Inactives']]

    # Drop rows for games that went to overtime, then the unnecessary columns.
    regulation = renamed[renamed['OT'] == 0]
    cleaned = regulation.drop(columns=unused_cols)

    # New first column: game_id and player_id concatenated as strings.
    key = cleaned['game_id'].astype('str') + cleaned['player_id'].astype('str')
    cleaned.insert(0, 'game_id_player_id', key)

    # De-duplicate on the actual (game_id, player_id) pair rather than on the
    # concatenated string: the string has no separator, so distinct pairs such
    # as (1, 23) and (12, 3) both read '123' and one real row would be dropped.
    cleaned = cleaned.drop_duplicates(subset=['game_id', 'player_id'], keep='first')

    # Filter out did not play
    #cleaned = cleaned[cleaned['did_not_play'] == '0']

    # Sort by player, date and rebuild a clean 0..n-1 index.
    clean_data = cleaned.sort_values(by=['player_id', 'game_date']).reset_index(drop=True)

In [143]:
# Run the cleaning step. It returns nothing; it sets the globals `inactives`
# and `clean_data`, which the cells below read.
clean_transform(raw_data)

In [144]:
def add_feat(clean_data):
    """Add ``player_num`` and ``player_game_num`` columns to ``clean_data`` in place.

    ``player_num`` numbers players 0, 1, 2, ... in order of first appearance of
    their ``player_id``. ``player_game_num`` is a 1-based counter that restarts
    whenever ``player_num`` differs from the previous row, so on a frame sorted
    by player and date it is the player's n-th game.

    Both columns are appended at the end of the frame (overwritten if they
    already exist), so the function does not depend on the frame having a
    particular number of columns, works on an empty frame, and can be re-run.

    Parameters
    ----------
    clean_data : pandas.DataFrame
        Frame with a ``player_id`` column; modified in place.

    Returns
    -------
    None
    """
    # Create player numeric id dictionary, add player number as column.
    player_dict = {player: num for num, player in enumerate(clean_data['player_id'].unique())}
    player_num = clean_data['player_id'].map(player_dict)
    clean_data['player_num'] = player_num

    # A new run starts wherever the player number differs from the row above;
    # the first row always starts one because it is compared against NaN.
    run_id = player_num.ne(player_num.shift()).cumsum().to_numpy()

    # Create player game number column: position within the run, 1-based.
    # Assigned as a plain array so the write is positional and done in one step
    # (no chained `df[col].iloc[i] = ...` assignment).
    clean_data['player_game_num'] = player_num.groupby(run_id).cumcount().to_numpy() + 1

In [145]:
# Adds the `player_num` and `player_game_num` columns to `clean_data` in place.
add_feat(clean_data)

In [None]:
def describe(dataframe):
    """Print the frame's shape, its column index and its per-column dtypes."""
    shape_line = ' '.join(('Shape: ', str(dataframe.shape)))
    report = (shape_line, dataframe.columns, dataframe.dtypes)
    # One item per line, same order as listed in the docstring.
    print(*report, sep='\n')

In [None]:
def transpose(dataframe):
    """Return the first nine rows of ``dataframe`` turned on their side.

    Each original column becomes a row and each of the (up to nine) original
    rows becomes a column. The result is capped at ``len(dataframe)`` rows, so
    a frame with fewer rows than columns shows only that many of its columns.
    """
    row_cap = len(dataframe)
    flipped = dataframe.iloc[:9].transpose()
    return flipped.iloc[:row_cap]

In [None]:
def open_dtale(dataframe):
    """Hand ``dataframe`` to ``dtale.show`` and open the result in a browser."""
    viewer = dtale.show(dataframe)
    viewer.open_browser()

In [129]:
# Inspect the cleaned frame interactively: passes it to dtale and opens a browser.
open_dtale(clean_data)

In [195]:
def minute_data(clean_data):
    """Build the minutes-prediction table and publish it as the global ``min_data``.

    ``min_data`` holds the id, did-not-play, minutes and game-number columns of
    ``clean_data`` plus ``pred_min``, initialised to a copy of ``minutes``.

    Parameters
    ----------
    clean_data : pandas.DataFrame
        Frame with the six columns selected below. It is not modified.

    Returns
    -------
    None
        The result is written to the global ``min_data``.
    """
    global min_data
    selected = ['game_id_player_id', 'player_num', 'player_id', 'did_not_play',
                'minutes', 'player_game_num']
    # Explicit copy: adding a column to a bare column-subset of another frame
    # triggers SettingWithCopyWarning and leaves ownership of the data ambiguous.
    min_data = clean_data[selected].copy()
    min_data['pred_min'] = min_data['minutes']

In [196]:
# Builds the global `min_data` table (selected columns of clean_data plus `pred_min`).
minute_data(clean_data)

In [197]:
def rolling_weighted_avg(min_data):
    """Overwrite ``min_data['pred_min']`` with a weighted average of previous minutes.

    The prediction for a row is chosen by the first rule that matches:

    1. ``player_game_num == 1``: 0
    2. ``player_game_num == 2``: minutes of the previous row
    3. ``player_game_num == 3``: 0.65 * previous + 0.35 * two rows back
    4. ``player_game_num == 4``: 0.55 * previous + 0.30 * two back + 0.15 * three back
    5. previous row's minutes == 0: 0
    6. otherwise: 0.4 * previous + 0.3 * two back + 0.2 * three back + 0.1 * four back

    "Previous" means the preceding row of the frame, so the frame is expected to
    be ordered by player and then by date. Rows that have no preceding row to
    look at get NaN instead of wrapping around to the end of the frame.

    Parameters
    ----------
    min_data : pandas.DataFrame
        Frame with ``minutes`` and ``player_game_num`` columns; modified in place.

    Returns
    -------
    None
    """
    minutes = min_data['minutes']
    game_num = min_data['player_game_num'].to_numpy()

    # Window of 5 = four previous rows plus the current one; the current row's
    # weight is 0 so the actual minutes never leak into their own prediction.
    weights = np.array([.1, .2, .3, .4, 0])
    rolling_pred = minutes.rolling(5).apply(lambda window: np.sum(weights * window), raw=True)

    prev1 = minutes.shift(1)
    prev2 = minutes.shift(2)
    prev3 = minutes.shift(3)

    # Account for games early in season or injuries. np.select picks the first
    # matching condition per row, mirroring the original if/elif order.
    conditions = [
        game_num == 1,
        game_num == 2,
        game_num == 3,
        game_num == 4,
        (prev1 == 0).to_numpy(),
    ]
    choices = [
        0.0,
        prev1.to_numpy(),
        (prev1 * 0.65 + prev2 * 0.35).to_numpy(),
        (prev1 * 0.55 + prev2 * 0.3 + prev3 * 0.15).to_numpy(),
        0.0,
    ]
    # Single whole-column assignment: chained `df[col].iloc[i] = ...` writes are
    # not guaranteed to reach the frame and are ignored under Copy-on-Write.
    min_data['pred_min'] = np.select(conditions, choices, default=rolling_pred.to_numpy())

In [198]:
# Replaces min_data['pred_min'] in place with the weighted-average prediction.
rolling_weighted_avg(min_data)

In [201]:
# Root-mean-squared error between predicted and actual minutes in min_data.
# Series.mean() skips NaN by default, so rows without a prediction are left out.
print('RMSE Rolling Weight Avg. = ', ((min_data.pred_min - min_data.minutes) ** 2).mean() ** .5)

RMSE Rolling Weight Avg. =  8.106838480343951
