In [1]:
import pandas as pd
# Switch off pandas' chained-assignment check for the whole session, so SettingWithCopyWarning is never raised.
# NOTE(review): this hides the warning, it does not make chained assignments safe.
pd.options.mode.chained_assignment = None
# Allow up to 500 rows to be rendered when a dataframe or series is displayed
pd.set_option('display.max_rows', 500)
import numpy as np

In [2]:
# Load the tournament-level export.
# NOTE(review): this absolute, user-specific path only resolves on the original author's machine; consider moving
# it to a configurable data directory.
df = pd.read_excel('C:\\Users\\rbush\\Documents\\Projects\\PGA Finish Projections\\PGA Raw Data - Tournament Level_dev.xlsx', 
                   sheet_name = 'PGA Raw Data')

# Normalise the headers BEFORE anything refers to a column by name, so the sort below does not depend on the
# workbook already using lower-case, underscore-separated headers.  regex=False makes every replacement a literal
# one regardless of the pandas version ('(' and ')' are regex metacharacters).
df.columns = (df.columns.str.strip().str.lower()
              .str.replace(' ', '_', regex = False)
              .str.replace('(', '', regex = False)
              .str.replace(')', '', regex = False))

# Order rows by player, then by days_from_today ascending; later cells rely on each player's rows being contiguous
df = df.sort_values(by = ['player', 'days_from_today'])

In [3]:
#df.head()
#df.dtypes

In [4]:
# 1 when the place column holds '-', otherwise 0 (presumably '-' marks a missed cut -- confirm against the source data)
df['mc'] = (df['place'] == '-').astype('int64')

# Difference in days_from_today between a row and the SAME player's next row in the current
# (player, days_from_today) ordering.  Shifting within each player group keeps a player's last row from being
# compared against the first row of the following player; that row is NaN instead.
df['days_since_last'] = df.groupby('player')['days_from_today'].shift(-1) - df['days_from_today']

# NOTE(review): neither 'mc' nor 'days_since_last' appears in the `fields` list used for the later column selection,
# so both columns are dropped there -- add them to `fields` if they are meant to be kept.

In [5]:
# Columns to keep: player / tournament context first, then the sg_* metrics
fields = [
    'player',
    'player_id',
    'tournament_id',
    'tournament_name',
    'course',
    'season',
    'days_from_today',
    'sg_putt',
    'sg_arg',
    'sg_app',
    'sg_ott',
    'sg_t2g',
    'sg_total',
]

In [6]:
# Keep only the columns named in `fields`, in that order
df = df.loc[:, fields]

In [7]:
# The sg_* columns that are scanned for missing values
features = [
    'sg_putt',
    'sg_arg',
    'sg_app',
    'sg_ott',
    'sg_t2g',
    'sg_total',
]

In [8]:
# Distinct player values, in order of first appearance in df
players = df['player'].drop_duplicates().tolist()

In [9]:
def scanning_mean(df, features, players, window):
    """
    Fill missing feature values with a per-player local mean.

    For every player in `players`, the player's rows are taken from df in their existing order.  Wherever a feature
    value is null it is replaced by the mean of the non-null values of that feature found in the player's rows
    [max(0, row - window) : min(row + window + 1, rows)], i.e. up to `window` rows on either side of the missing row:
        - max(0, row - window) keeps the window from reaching back before the player's first row
        - min(row + window + 1, rows) keeps the window from reaching past the player's last row

    Means are always computed from the original (unfilled) values, so one filled value never feeds into another.  If
    every value inside the window is null there is nothing to average and the value stays null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'player' column and every column named in `features`.  It is not modified.
    features : list of str
        Names of the columns to scan for null values.
    players : list
        Values of the 'player' column to process.  May be a subset of the players in df and may be in any order.
    window : int
        Number of rows on each side of a missing value that contribute to its local mean.

    Returns
    -------
    pandas.DataFrame
        A new dataframe holding the rows of df whose 'player' is in `players`, with the missing feature values filled.
    """

    # Work on a real copy so the caller's dataframe is left untouched
    df_clean = df.copy()

    for player in players:

        # Positions (not index labels) of this player's rows within df.  Writing back by position keeps the update on
        # the right rows even when `players` is a subset of df, is ordered differently from df, or df's index contains
        # repeated labels.
        player_positions = (df['player'] == player).to_numpy().nonzero()[0]
        df_slice = df.iloc[player_positions]
        df_slice_rows = len(player_positions)

        for feature in features:

            # The player's values for this feature, read from the untouched input
            feature_slice = df_slice[feature]
            feature_column = df_clean.columns.get_loc(feature)

            # Only rows whose value is missing need a local mean
            missing_rows = feature_slice.isna().to_numpy().nonzero()[0]

            for df_slice_row in missing_rows:

                # Bounds of the scanning window, clipped to the player's own rows
                window_min = max(0, df_slice_row - window)
                window_max = min(df_slice_row + window + 1, df_slice_rows)

                # Series.mean skips nulls, so this is the average of the non-null neighbours (NaN if there are none)
                feature_mean = feature_slice.iloc[window_min:window_max].mean()

                # Single positional write -- no chained assignment, so it also works under Copy-on-Write
                df_clean.iloc[player_positions[df_slice_row], feature_column] = feature_mean

    # Return only the rows associated with the players specified in the function input
    return df_clean.loc[df['player'].isin(players).to_numpy()].copy()

In [10]:
# Fill missing feature values using up to 2 neighbouring rows on each side
df_clean = scanning_mean(df, features, players, window=2)

In [11]:
#df_clean.iloc[205:225]

In [12]:
# Share of records whose sg_app is still null after cleaning (counts are of non-null player values)
sg_app_is_null = df_clean['sg_app'].isna()
null_records = df_clean.loc[sg_app_is_null, 'player'].count()
total_records = df_clean['player'].count()

null_ratio = null_records / total_records
print(null_ratio)

0.013304677105721011


In [13]:
# 1 where sg_app is still null, 0 otherwise
df_clean['nan'] = df_clean['sg_app'].isna().astype(int)

In [14]:
# Show the rows at positions 212-214 with all columns
df_clean.iloc[212:215]

Unnamed: 0,player,player_id,tournament_id,tournament_name,course,season,days_from_today,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,nan
131,Aaron Wise,10577,2501,U.S. Open,"Oakmont - Oakmont, PA",2016,1699,0.0,-0.31,-0.81,0.785,-0.335,-0.335,0
213,Abel Gallegos,4701571,401219478,Masters Tournament,"Augusta National - Augusta, GA",2021,89,,,,,,,1
310,Abraham Ancer,9261,401243401,The American Express,"La Quinta CC - La Quinta, CA",2021,19,-0.03,-0.3,1.18,0.54,1.43,1.39,0
