In [416]:
import numpy as np
import pandas as pd
import datetime
import dtale
from statistics import mean
import matplotlib.pyplot as plt
import random

In [141]:
raw_data = pd.read_csv('train.csv')
pd.set_option('display.max_rows', None)

In [142]:
def clean_transform(raw_data):
    #Drop duplicates
    raw_data = raw_data.drop_duplicates()
    
    #Change data types, column names, percentages to decimals
    rd = raw_data.astype({'season': str, 'starter': str, 'did_not_play': str, 'is_inactive': str})
    rd['game_date'] = pd.to_datetime(rd['game_date'], infer_datetime_format = True)
    rd = rd.rename(columns = {'Team_efg_pct': 'Team_efg_rate', 'Team_tov_pct' : 'Team_tov_rate', 
                              'Team_orb_pct': 'Team_orb_rate', 'Opponent_efg_pct': 'Opponent_efg_rate', 
                              'Opponent_tov_pct': 'Opponent_tov_rate', 'Opponent_orb_pct': 'Opponent_orb_rate',
                             'ts_pct': 'ts_rate', 'efg_pct': 'efg_rate', 'ts_pct': 'ts_rate', 'orb_pct': 'orb_rate', 
                             'drb_pct': 'drb_rate', 'trb_pct': 'trb_rate', 'ast_pct': 'ast_rate', 
                              'stl_pct': 'stl_rate','blk_pct': 'blk_rate', 'tov_pct': 'tov_rate', 
                              'usg_pct': 'usg_rate', 'ft_pct': 'ft_rate', 'fg_pct': 'fg_rate', 'fg3_pct': 'fg3_rate'})
    rd[['Team_tov_rate', 'Team_orb_rate', 'Opponent_orb_rate', 'Opponent_tov_rate', 'orb_rate', 
        'drb_rate', 'trb_rate', 'ast_rate', 'stl_rate', 'blk_rate', 'tov_rate', 'usg_rate']] = rd[['Team_tov_rate', 'Team_orb_rate', 'Opponent_orb_rate', 'Opponent_tov_rate', 'orb_rate', 
        'drb_rate', 'trb_rate', 'ast_rate', 'stl_rate', 'blk_rate', 'tov_rate', 'usg_rate']]/100
    raw_data = rd
    
    #Create inactives table
    global inactives
    inactives = raw_data[['game_id', 'Inactives']]
    
    #Drop rows for games that went to overtime
    raw_data = raw_data[raw_data['OT'] == 0]
    
    #Remove unnecessary columns
    global clean_data
    clean_data = raw_data.drop(['OT', 'player', 'mp', 'double_double', 'triple_double', 'active_position_minutes', 
                   'PG%', 'SG%', 'SF%', 'C%', 'Inactives', 'pf', 'is_inactive', 'PF%',
                    'last_60_minutes_per_game_bench', 'last_60_minutes_per_game_starting',
                    'DKP', 'FDP', 'SDP', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'ts'], axis = 1, 
                               inplace = False)
    
    #New col concatenate game_id and player_id
    clean_data.loc[:, 'game_id_player_id'] = clean_data['game_id'].astype('str') + clean_data['player_id'].astype('str')
    
    #Dedupe new column, move to first
    duplicates = clean_data.duplicated(['game_id_player_id'], keep='first')
    clean_data = clean_data[~duplicates]
    new_id = clean_data.pop('game_id_player_id')
    clean_data.insert(0, 'game_id_player_id', new_id)
    
    #Filter out did not play
    #clean_data = clean_data[clean_data['did_not_play'] == '0']
    
    #Sort by player, date
    clean_data = clean_data.sort_values(by = ['player_id', 'game_date'])
    
    #Reset index
    clean_data.reset_index(drop = True, inplace = True)

In [143]:
clean_transform(raw_data)

In [144]:
def add_feat(clean_data):
    #Create player numeric id dictionary, add player number as column
    unique_players = pd.DataFrame(clean_data['player_id'].unique())
    dict = unique_players.to_dict()[0]
    player_dict = {v:k  for k,v in dict.items()}
    clean_data['player_num'] = clean_data['player_id'].map(player_dict)

    #Create player game number column
    clean_data.insert(56, 'player_game_num', 0)
    clean_data['player_game_num'].iloc[0] = 1
    for i in range(1, len(clean_data)):
        if clean_data.iloc[i, 55] != clean_data.iloc[i-1, 55]:
            clean_data['player_game_num'].iloc[i] = 1
        else:
            clean_data['player_game_num'].iloc[i] = (clean_data['player_game_num'].iloc[i-1] + 1)

In [145]:
add_feat(clean_data)

In [None]:
def describe(dataframe):
    print('Shape: ', dataframe.shape)
    print(dataframe.columns)
    print(dataframe.dtypes)

In [None]:
def transpose(dataframe):
    transpose = dataframe.iloc[0:9].T
    return transpose.head(len(dataframe))

In [None]:
def open_dtale(dataframe):
    dtale.show(dataframe).open_browser()

In [129]:
open_dtale(clean_data)

In [347]:
def minute_data(clean_data):
    global min_data
    min_data = clean_data[['game_id_player_id', 'player_num','player_id', 'did_not_play', 'minutes', 'player_game_num']]
    min_data['pred_min'] = min_data['minutes']

In [348]:
minute_data(clean_data)

In [363]:
def rolling_weighted_avg4(min_data):
    #Apply rolling weighted averages to pred_min column
    weights = np.array([.1, .2, .3, .4])
    global rwa4
    rwa4 = min_data.copy()
    rwa4['pred_min'] = rwa4['minutes'].rolling(4).apply(lambda x: np.sum(weights*x)).shift(+1)
    
    #Account for games early in season or injuries
    for i in range(0, len(rwa4)):
        if rwa4['player_game_num'].iloc[i] == 1:
            rwa4['pred_min'].iloc[i] = None
        elif rwa4['player_game_num'].iloc[i] == 2:
            rwa4['pred_min'].iloc[i] = rwa4['minutes'].iloc[i-1]
        elif rwa4['player_game_num'].iloc[i] == 3:
            rwa4['pred_min'].iloc[i] = (((rwa4['minutes'].iloc[i-1])* 0.65) 
            + ((rwa4['minutes'].iloc[i-2])* 0.35))
        elif rwa4['player_game_num'].iloc[i] == 4:
            rwa4['pred_min'].iloc[i] = (((rwa4['minutes'].iloc[i-1])* 0.55) 
            + ((rwa4['minutes'].iloc[i-2])* 0.3) 
            + ((rwa4['minutes'].iloc[i-3])* 0.15))

In [366]:
def rolling_weighted_avg3(min_data):
    #Apply rolling weighted averages to pred_min column
    weights = np.array([.15, .3, .55])
    global rwa3
    rwa3 = min_data.copy()
    rwa3['pred_min'] = rwa3['minutes'].rolling(3).apply(lambda x: np.sum(weights*x)).shift(+1)
    
    #Account for games early in season or injuries
    for i in range(0, len(rwa3)):
        if rwa3['player_game_num'].iloc[i] == 1:
            rwa3['pred_min'].iloc[i] = None
        elif rwa3['player_game_num'].iloc[i] == 2:
            rwa3['pred_min'].iloc[i] = rwa3['minutes'].iloc[i-1]
        elif rwa3['player_game_num'].iloc[i] == 3:
            rwa3['pred_min'].iloc[i] = (((rwa3['minutes'].iloc[i-1])* 0.65) 
            + ((rwa3['minutes'].iloc[i-2])* 0.35))

In [367]:
def rolling_avg4(min_data):
    global ra4
    ra4 = min_data.copy()
    ra4['pred_min'] = ra4['minutes'].rolling(4, min_periods = 1).mean().shift(+1)
    for i in range(0, len(ra4)):
        if ra4['player_game_num'].iloc[i] == 1:
            ra4['pred_min'].iloc[i] = None
        elif ra4['player_game_num'].iloc[i] == 2:
            ra4['pred_min'].iloc[i] = ra4['minutes'].iloc[i-1]
        elif ra4['player_game_num'].iloc[i] == 3:
            ra4['pred_min'].iloc[i] = (((ra4['minutes'].iloc[i-1])
            + (ra4['minutes'].iloc[i-2]))/2)
        elif ra4['player_game_num'].iloc[i] == 4:
            ra4['pred_min'].iloc[i] = ((((ra4['minutes'].iloc[i-1])) 
            + ((ra4['minutes'].iloc[i-2])) 
            + ((ra4['minutes'].iloc[i-3])))/3)

In [354]:
def rolling_avg3(min_data):
    global ra3
    ra3 = min_data.copy()
    ra3['pred_min'] = ra3['minutes'].rolling(3, min_periods = 1).mean().shift(+1)
    for i in range(0, len(ra3)):
        if ra3['player_game_num'].iloc[i] == 1:
            ra3['pred_min'].iloc[i] = None

In [368]:
rolling_weighted_avg4(min_data)

In [369]:
rolling_weighted_avg3(min_data)

In [370]:
rolling_avg4(min_data)

In [371]:
rolling_avg3(min_data)

In [372]:
print('RMSE Rolling Weighted Avg4. = ', ((rwa4.pred_min - rwa4.minutes) ** 2).mean() ** .5)
print('RMSE Rolling Weighted Avg3. = ', ((rwa3.pred_min - rwa3.minutes) ** 2).mean() ** .5)
print('RMSE Rolling Avg4. = ', ((ra4.pred_min - ra4.minutes) ** 2).mean() ** .5)
print('RMSE Rolling Avg3. = ', ((ra3.pred_min - ra3.minutes) ** 2).mean() ** .5)

RMSE Rolling Weighted Avg4. =  7.490258563774775
RMSE Rolling Weighted Avg3. =  7.549058512909152
RMSE Rolling Avg4. =  7.666503959160692
RMSE Rolling Avg3. =  7.8065281099398405


In [380]:
rwa4['RSE'] = ((rwa4['pred_min'] - rwa4['minutes'])**2)**.5

In [398]:
rwa4_sample = rwa4[rwa4['player_num'] == random.randint(0, 620)]

In [423]:
plt.scatter(rwa4_sample['player_game_num'], rwa4_sample['RSE'])
plt.show()