In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
sns.set()

Skip to [Observations and Discussion](#discussion)

- **'NGS-punt_return.csv'** is obtained from: https://www.kaggle.com/jdemeo/preprocessing-ngs
- **'play-punt_return-yardage.csv'** is obtained from: https://www.kaggle.com/jdemeo/preprocessing-punt-play

In [None]:
# Read the three inputs: NGS data, player role data, and play info
ngs_df = pd.read_csv('../input/ngsconcussion/NGS-punt_return.csv')
play_player_role_df = pd.read_csv('../input/NFL-Punt-Analytics-Competition/play_player_role_data.csv')
play_df = pd.read_csv('../input/ngsconcussion/play-punt_return-yardage.csv')

# Columns removed once everything has been joined
droppers = ['count']

# Inner-join roles per (GameKey, PlayID, GSISID), then play info per
# (GameKey, PlayID), and finally discard the columns listed in `droppers`
ngs_df = (
    ngs_df
    .merge(play_player_role_df, how="inner", on=['GameKey', 'PlayID', 'GSISID'])
    .merge(play_df, how="inner", on=['GameKey', 'PlayID'])
    .drop(columns=droppers)
)
ngs_df.head()

In [None]:
# One row per (GameKey, PlayID) pair; 'count' holds the number of NGS rows for that pair
play_sizes = ngs_df.groupby(['GameKey', 'PlayID']).size()
ngs_ids = play_sizes.reset_index(name='count')
ngs_ids.shape

### Goal: Understand the proximity of the nearest punt player to the PR when the punt is caught by the PR
This is done in hopes to understand what the impact of having a restricted zone for a PR would be on the 2016 and 2017 punt returns. I am primarily interested in seeing how much return yardage is being lost if certain plays were negated as a result of the rule to better assess the penalty yardage amount for infringing on a PR restricted zone. This analysis is also done to understand how many plays would be negated by such a rule just in general and have reasoning for a particular restricted zone distance.

In [None]:
'''ONLY RUN THE FOLLOWING TWO BLOCKS TO GET AN IDEA OF THE COURSE OF EVENTS FOR A PARTICULAR PLAY'''

# def isolate_play(df, game_key, play_id):
#     '''Create a dataframe of a particular play'''
#     where_condition = ((df['GameKey'] == game_key) &
#                        (df['PlayID'] == play_id))
#     new_df = df[where_condition].copy()
#     new_df.sort_values(by=['Time'], inplace=True)
#     new_df.reset_index(drop=True, inplace=True)
#     return new_df

# def course_of_events(df):
#     '''Get list of events in order of occurrence for a particular play'''
#     events = []
#     for i in range(len(df)):
#         event = df.loc[i, 'Event']
#         if event not in events:
#             events.append(event)
           
#     print('Play Description:', df.loc[0, 'PlayDescription'])
#     print('---')
#     print('Game Events:', events)
#     print('-----------------------------------------------')

In [None]:
# # Iterate through ids to get events for each play
# for i in range(len(ngs_ids)):
#     game_key = ngs_ids.loc[i, 'GameKey']
#     play_id = ngs_ids.loc[i, 'PlayID']
#     the_play = isolate_play(ngs_df, game_key, play_id)
#     course_of_events(the_play)

- Looking at the course of events during a play, 'punt_received' seems like a reasonable place to see proximity of opponent players

In [None]:
def event_df_creation(df, event):
    '''Slice out the rows for one event and list the plays that contain it.

    Returns a tuple ``(event_rows, play_ids)``:
      - event_rows: rows of ``df`` whose 'Event' equals ``event``, re-indexed from 0
      - play_ids: one row per (GameKey, PlayID) found in event_rows, with the
        number of matching rows in a 'count' column
    '''
    is_event = df['Event'] == event
    event_rows = df.loc[is_event].reset_index(drop=True)
    play_ids = (event_rows.groupby(['GameKey', 'PlayID'])
                          .size()
                          .reset_index(name='count'))
    return event_rows, play_ids

In [None]:
# Let's indicate what team the player is playing on based off player role
return_team_positions = ['PR', 'PDL1', 'PDL2', 'PDL3', 'PDL4', 'PDR1', 'PDR2', 'PDR3', 'PDR4', 'VL', 'VR', 
                         'PLL', 'PLR', 'VRo', 'VRi', 'VLi', 'VLo', 'PLM', 'PLR1', 'PLR2', 'PLL1', 'PLL2',
                         'PFB', 'PDL5', 'PDR5', 'PDL6', 'PLR3', 'PLL3', 'PDR6', 'PLM1', 'PDM']
punt_team_positions = ['P', 'PLS', 'PPR', 'PLG', 'PRG', 'PLT', 'PRT', 'PLW', 'PRW', 'GL', 'GR',
                       'GRo', 'GRi', 'GLi', 'GLo', 'PC', 'PPRo', 'PPRi', 'PPL', 'PPLi', 'PPLo']

def label_team(df):
    '''Label each player by the team they play on'''
    df['team'] = ''
    print('Determining player roles')

    for i, role in enumerate(df['Role']):
        if role in return_team_positions:
            df.loc[i, 'team'] = 'return team'
        elif role in punt_team_positions:
            df.loc[i, 'team'] = 'punt team'
        else:
            df.loc[i, 'team'] = 'unknown'

In [None]:
def calculate_player_proximity(role_x, role_y, player_x, player_y):
    '''Straight-line (Euclidean) distance between (role_x, role_y) and (player_x, player_y)'''
    delta_x = role_x - player_x
    delta_y = role_y - player_y
    return np.sqrt(delta_x ** 2 + delta_y ** 2)

In [None]:
def calculate_x_proximity(role_x, player_x):
    '''Absolute difference between a role's x coordinate and a player's x coordinate (yardline distance only)'''
    offset = role_x - player_x
    return np.absolute(offset)

In [None]:
def calculate_proximity_for_play(df, unique_ids, role):
    '''Calculate proximity of each player to the player of a particular role.

    Adds (or overwrites) two float columns on ``df`` in place:
      - 'proximity_to_<role>_circle': Euclidean distance to the role player
      - 'proximity_to_<role>_x':      absolute x (yardline) distance to the role player

    Within each (GameKey, PlayID) pair the reference point is the first row
    (in ``df`` order) whose 'Role' equals ``role``.

    Parameters
    ----------
    df : DataFrame with 'GameKey', 'PlayID', 'Role', 'x' and 'y' columns.
    unique_ids : unused; kept so existing calls keep working.
    role : role label to measure distances to (e.g. 'PR').

    Returns None.

    NOTE(review): rows belonging to plays where ``role`` is absent keep the
    historical sentinel value 0, which is indistinguishable from a genuine
    zero distance downstream - confirm whether NaN would be more appropriate.
    '''
    circle_col = 'proximity_to_' + role + '_circle'
    x_col = 'proximity_to_' + role + '_x'

    print('Calculating player proximities to', role)

    play_keys = ['GameKey', 'PlayID']

    # One anchor row per play: the first listed player holding `role`
    anchors = (df.loc[df['Role'] == role, play_keys + ['x', 'y']]
                 .drop_duplicates(subset=play_keys, keep='first')
                 .rename(columns={'x': 'role_x', 'y': 'role_y'}))

    # A left merge against unique keys yields exactly one output row per input
    # row, in input order, so results can be written back positionally
    # regardless of what index `df` carries.
    located = df[play_keys + ['x', 'y']].merge(anchors, how='left', on=play_keys,
                                               indicator=True, validate='many_to_one')
    has_role = np.asarray(located['_merge'] == 'both')

    proximity_hypo = calculate_player_proximity(located['role_x'], located['role_y'],
                                                located['x'], located['y'])
    proximity_x = calculate_x_proximity(located['role_x'], located['x'])

    # Plays without the role fall back to the 0 sentinel (see NOTE above)
    df[circle_col] = np.where(has_role, np.asarray(proximity_hypo, dtype=float), 0.0)
    df[x_col] = np.where(has_role, np.asarray(proximity_x, dtype=float), 0.0)

In [None]:
def calculate_closest_player(df, unique_ids, column):
    '''Find the closest punt-team player per play and build a new id set.

    For every (GameKey, PlayID) row of ``unique_ids`` the smallest value of
    ``column`` among rows of ``df`` labelled team == 'punt team' is attached.
    Plays with no punt-team rows in ``df`` are left out of the result.

    Parameters
    ----------
    df : DataFrame with 'GameKey', 'PlayID', 'team' and ``column`` columns.
    unique_ids : DataFrame with 'GameKey' and 'PlayID' columns (one row per play).
        It is not modified; a pre-existing ``column`` on it is replaced in the result.
    column : name of the proximity column to minimise.

    Returns
    -------
    DataFrame: the surviving rows of ``unique_ids`` in their original order,
    re-indexed from 0, with ``column`` holding the per-play minimum. Missing
    values are ignored when taking the minimum.
    '''
    role = 'PR'
    print('Determining closest player to', role)

    play_keys = ['GameKey', 'PlayID']

    # Per-play minimum over punt-team rows only, computed in a single pass
    punt_rows = df[df['team'] == 'punt team']
    closest = (punt_rows.groupby(play_keys, sort=False)[column]
                        .min()
                        .reset_index())

    # The inner merge keeps the order of `unique_ids` and drops plays for which
    # the NGS data had no punt team
    base_ids = unique_ids.drop(columns=[column], errors='ignore')
    new_ids = base_ids.merge(closest, how='inner', on=play_keys)
    new_ids = new_ids.reset_index(drop=True)

    return new_ids

### Goal: Want to look at the closest punt player's proximity when the ball is caught by the PR
- Workflow:
    - Get datapoints for a particular play's event
    - Label the players by what team they are
    - Calculate each player's proximity to a particular role
    - Find the minimum distance from punt player to PR
    - Return those proximities for each play
- **PR-proximity.csv** is used in notebook: https://www.kaggle.com/jdemeo/analysis-fair-catches

In [None]:
# Rows at the moment the punt is received, labelled by team and measured against the PR
event_df, event_ids = event_df_creation(ngs_df, 'punt_received')
label_team(event_df)
calculate_proximity_for_play(event_df, event_ids, 'PR')

# Attach the closest punt-team distance for each metric in turn; each pass
# feeds the previous result back in
new_ids = event_ids
for proximity_col in ('proximity_to_PR_circle', 'proximity_to_PR_x'):
    new_ids = calculate_closest_player(event_df, new_ids, proximity_col)

new_ids.to_csv('PR-proximity.csv', index=False)

In [None]:
'''Plot of distribution distance of closest punt team player to punt receiver'''
# Yardline (x-only) distance, one-yard-wide bins with edges 0..39
bins = list(range(40))
closest_by_yardline = new_ids['proximity_to_PR_x']

plt.hist(closest_by_yardline, bins=bins)
plt.title('Distribution of closest player on punt team to punt receiver')
plt.xlabel('Yards')
plt.ylabel('count')
plt.show()

closest_by_yardline.describe()

In [None]:
'''Plot of distribution distance of closest punt team player to punt receiver'''
# Euclidean distance, one-yard-wide bins with edges 0..39
bins = list(range(40))
closest_by_euclidean = new_ids['proximity_to_PR_circle']

plt.hist(closest_by_euclidean, bins=bins)
plt.title('Distribution of closest player on punt team to punt receiver')
plt.xlabel('Yards')
plt.ylabel('count')
plt.show()

closest_by_euclidean.describe()

## <a id="discussion">Observations and Discussion</a>
- Average of the closest punt team player to the PR by yardline distance: 9.17 yards
- Average of the closest punt team player to the PR by Euclidean distance: 11.04 yards
- This can be considered relative to fair catches (https://www.kaggle.com/jdemeo/analysis-fair-catches) in which:
    - Average of the closest punt team player to the PR by yardline distance: 2.72 yards
    - Average of the closest punt team player to the PR by Euclidean distance: 4.04 yards
- We'll now look at the differences in distribution in yards returned on a play as well as the reward of the return to get a better idea of at what distance from the PR are there diminishing returns with regards to having a larger restricted zone
- Part of this analysis is to show why I chose a restricted zone of 8 yards by yardline or 10 yards by euclidean distance.

In [None]:
# Combine yardage data and proximity data
yard_and_proximity = pd.merge(new_ids, play_df,
                          how='inner',
                          on=['GameKey', 'PlayID'])
print(yard_and_proximity.shape)
yard_and_proximity.head()

### Let's Look at Yardage/Reward and Proximity
- We'll look at the yardage gained on the return and the reward of the play given a certain distance threshold from the PR
- The intuition would likely be that the further the closest punt team player is from the PR the more yardage on average the PR will gain on the return.
- I'm interested in this because if a restricted zone type rule were implemented, I would like to know how many punt plays over the two seasons this rule would have affected and also gauge what is being lost in value for that particular play in both yardage or by a proportional reward metric (yards gained / yards needed to gain for a touchdown)

In [None]:
def build_that_histogram(x, title):
    '''Plot a histogram of return yardage with one-yard-wide bins.

    Bin edges run from floor(min) to ceil(max) + 1 so that every observed
    value, including the largest, falls inside a bin. Missing values are
    ignored. An empty input produces an empty plot instead of raising.

    Parameters
    ----------
    x : sequence or Series of yardage values.
    title : plot title.
    '''
    values = pd.Series(x).dropna()
    if values.empty:
        # Nothing to span; a single placeholder bin keeps the call valid
        bins = [0, 1]
    else:
        lowest = int(np.floor(values.min()))
        highest = int(np.ceil(values.max()))
        # +2: range() excludes its stop, and the top value needs its own bin
        bins = list(range(lowest, highest + 2))
    plt.hist(values, bins=bins)
    plt.title(title)
    plt.xlabel('Yards Gained on Return')
    plt.ylabel('Count')
    plt.show()
    
def build_that_histogram_rewards(x, title):
    '''Plot a histogram of reward values using 0.01-wide bins with edges from -0.15 up to 0.99'''
    bins = [step * 0.01 for step in range(-15, 100)]
    plt.hist(x, bins=bins)
    plt.title(title)
    plt.xlabel('Reward')
    plt.ylabel('Count')
    plt.show()
    
def distributions_of_interest(yard_and_proximity, proximity_col, dependent_col, thresholds=range(1, 16)):
    '''Look at distributions based on proximity with regards to return yardage or reward.

    For each cutoff, the plays are split into those with ``proximity_col``
    <= cutoff and those with ``proximity_col`` > cutoff. Each side gets a
    histogram of ``dependent_col`` and a printed ``describe()`` summary.

    Parameters
    ----------
    yard_and_proximity : DataFrame holding ``proximity_col`` and ``dependent_col``.
    proximity_col : column used to split the plays.
    dependent_col : column whose distribution is examined; 'reward' selects
        the reward histogram, anything else the yardage histogram.
    thresholds : iterable of cutoffs to evaluate (default 1 through 15).

    Returns
    -------
    list of float: for each cutoff, mean(dependent_col | > cutoff) minus
    mean(dependent_col | <= cutoff). An empty side yields NaN.
    '''
    if dependent_col == 'reward':
        plot_histogram = build_that_histogram_rewards
    else:
        plot_histogram = build_that_histogram

    def _summarize(mask, comparator, cutoff):
        # Plot and describe one side of the split, returning its mean
        values = yard_and_proximity.loc[mask, dependent_col]
        plot_histogram(values, proximity_col + comparator + str(cutoff) + ' yards')
        print(values.describe())
        return values.mean()

    mean_dif = []
    for i in thresholds:
        proximity = yard_and_proximity[proximity_col]

        # Cases less than or equal to the cutoff
        mean1 = _summarize(proximity <= i, ' <= ', i)

        # Cases greater than the cutoff (rows with missing proximity fall on neither side)
        mean2 = _summarize(proximity > i, ' > ', i)

        mean_dif.append(mean2 - mean1)
        print('-----------------------------------------------')

    return mean_dif

In [None]:
# Filter options: 'proximity_to_PR_circle' or 'proximity_to_PR_x'
# Variable options: 'reward' or 'yardage on play'
mean_dif = distributions_of_interest(
    yard_and_proximity,
    proximity_col='proximity_to_PR_circle',
    dependent_col='yardage on play',
)

In [None]:
'''Plot of mean differences between the two distributions'''
# Cutoffs 1..15, one per entry of mean_dif
cutoffs = list(range(1, 16))

plt.figure(figsize=(9, 6))
plt.scatter(cutoffs, mean_dif)
plt.title('Differences of means between Restricted Zone Punt Returns and Outside Restricted Zone Punt Returns')
plt.ylabel('Difference of average yards gained', fontsize=12)
plt.xlabel('Restricted Zone Yard Cutoff (Euclidean)', fontsize=12)

# Write the image to disk before showing the figure
plt.savefig('mean_difference.png', bbox_inches='tight')
plt.show()

In [None]:
'''Run to see proximity by yardline distance distributions'''
# # Try proximity_to_PR_circle' or 'proximity_to_PR_x' as the filter
# # Try reward' or 'yardage on play' to see counts of that variable
# mean_dif = distributions_of_interest(yard_and_proximity, 'proximity_to_PR_x', 'yardage on play')

- Summary
    - **2604** punt returns were analyzed
    - **Average proximity by yardline distance: 9.17 yards**
    - **Average proximity by Euclidean distance: 11.04 yards**
    - Hypothetical Restricted Zones:
        - Given that the closest punt team player is less than or equal to 8 yards (yardline distance) from PR at time of catch:
            - Count: **1216**
            - **Average yards gained by PR on play: 8.01 yards**
            - **Average reward by PR on play: 0.11**
        - Given all punt team players are greater than 8 yards (yardline distance) from PR at time of catch:
            - Count: **1388**
            - **Average yards gained by PR on play: 11.71 yards**
            - **Average reward by PR on play: 0.15**
        - So we are looking at an average difference of 3.70 yards between the two distributions.
        - Given that the closest punt team player is less than or equal to 10 yards (Euclidean distance) from PR at time of catch:
            - Count: **1205**
            - **Average yards gained by PR on play: 7.91 yards**
            - **Average reward by PR on play: 0.11**
        - Given all punt team players are greater than 10 yards (Euclidean distance) from PR at time of catch:
            - Count: **1399**
            - **Average yards gained by PR on play: 11.76 yards**
            - **Average reward by PR on play: 0.16**
        - So we are looking at an average difference of 3.85 yards between the two distributions.
        - I present two hypothetical restricted zone distances only to show that they are nearly equivalent. It's just much easier to know 8 yards by yardline distance than 10 yards by Euclidean distance because of the field's painted yardline markers.
    - **So intuitively the further the closest punt team player is from the PR at the time of the catch, the higher the mean return distance and 'reward' value for return.**

# Links to other notebooks:
- Concussion play analysis with proposed rule changes: https://www.kaggle.com/jdemeo/analysis-concussions
- Analysis of uncalled penalties: https://www.kaggle.com/jdemeo/analysis-uncalled-penalties
- Analysis of fair catches: https://www.kaggle.com/jdemeo/analysis-fair-catches
- Preprocessing of Play Information: https://www.kaggle.com/jdemeo/preprocessing-punt-play
- Preprocessing of NGS data for the above notebooks: https://www.kaggle.com/jdemeo/preprocessing-ngs