# Load data and combine tracking data:

In [None]:
import pandas as pd
import statistics as stats
import os
# Locations of the competition data and of the bonus data set
DATA_DIR = '../input/nfl-big-data-bowl-2021'
BONUS_DIR = '../input/nfl-big-data-bowl-2021-bonus'

# Game-, player- and play-level tables
games = pd.read_csv(DATA_DIR + '/games.csv')
players = pd.read_csv(DATA_DIR + '/players.csv')
plays = pd.read_csv(DATA_DIR + '/plays.csv')

# Weekly tracking files, read in order (week1.csv ... week17.csv)
frames = [pd.read_csv(DATA_DIR + '/week' + str(week_number) + '.csv') for week_number in range(1, 18)]
(week1, week2, week3, week4, week5, week6, week7, week8, week9,
 week10, week11, week12, week13, week14, week15, week16, week17) = frames

# Targeted receiver per play (bonus data set)
targeted_receiver = pd.read_csv(BONUS_DIR + '/targetedReceiver.csv')

# TODO: add coverage data?

# Stack the weekly frames into one tracking table with a fresh 0..n-1 index
tracking_original = pd.concat(frames, ignore_index=True)
tracking_original

# Reorient tracking data to all face the same way:

In [None]:
def _reverse_heading(angle):
    """
    Rotate a Series of angles (degrees) by 180.
    Angles below 180 gain 180 and all others lose 180, so an input of exactly 180 maps to 0
    instead of 360; missing values stay missing.
    :param angle: Pandas Series of angles in degrees
    :return: Pandas Series of the reversed angles, on the same index
    """
    return (angle + 180).where(angle < 180, angle - 180)


# Work on a copy so tracking_original keeps the raw coordinates
tracking_df = tracking_original.copy()

# Only plays moving left are mirrored; plays moving right are left untouched
moving_left = tracking_df['playDirection'] == 'left'

# Mirror the position through the centre of the 120 x 53.33 coordinate range
tracking_df.loc[moving_left, 'x'] = 120 - tracking_df.loc[moving_left, 'x']
tracking_df.loc[moving_left, 'y'] = 53.33 - tracking_df.loc[moving_left, 'y']

# Turn orientation ('o') and direction of motion ('dir') around to match the mirrored field
tracking_df.loc[moving_left, 'o'] = _reverse_heading(tracking_df.loc[moving_left, 'o'])
tracking_df.loc[moving_left, 'dir'] = _reverse_heading(tracking_df.loc[moving_left, 'dir'])

# Drop Fumbles Recovered by the Defense

In [None]:
def filter_tracking_frames_by_event(df, event_types):
    """
    Select the tracking rows whose 'event' value is one of the requested event types.
    :param df: Pandas dataframe containing tracking data by frame (must have an 'event' column)
    :param event_types: List of event types to keep
    :return: Pandas Dataframe with only the rows whose event is in event_types
    """
    return df.loc[df['event'].isin(event_types)]


def drop_fumble_plays(tracking_df):
    """
    Drop plays where the ball is fumbled and recovered by the defense.
    A play is matched on its (gameId, playId) pair. Testing the two ids independently would also
    drop every play whose playId happens to equal a fumble playId from a different fumble game.
    :param tracking_df: Dataframe containing tracking player-frame data (used to identify plays with 'fumble_defense_recovered' events)
    :return: Returns the rows of the global `plays` dataframe whose (gameId, playId) pair has no 'fumble_defense_recovered' frame.
    """
    fumble_df = filter_tracking_frames_by_event(tracking_df, ['fumble_defense_recovered'])

    # Sanity check
    display(fumble_df.sort_values(['gameId', 'playId']))

    # Show the ids involved (for inspection only; the filtering below works on id pairs)
    play_ids = pd.Series(fumble_df['playId']).unique()
    display(play_ids)
    game_ids = pd.Series(fumble_df['gameId']).unique()
    display(game_ids)

    # Compare (gameId, playId) pairs so only the plays that actually contain the fumble are removed
    key_columns = ['gameId', 'playId']
    fumble_keys = pd.MultiIndex.from_frame(fumble_df[key_columns].drop_duplicates())
    play_keys = pd.MultiIndex.from_frame(plays[key_columns])
    frame_indices = ~play_keys.isin(fumble_keys)

    return plays[frame_indices]

# Drop plays in which a fumble recovered by the defense occurs.
# drop_fumble_plays uses the tracking data only to find those plays; what it returns
# (and what is stored back in `plays`) is the filtered plays table.
plays = drop_fumble_plays(tracking_df)
display(plays)

# Add week 1 coverages and targeted receiver for each play:

In [None]:
# Week 1 coverages are not merged yet:
# plays = pd.merge(plays, coverages_week1, how='left')

# Attach the targeted receiver to every play. No key is given, so pandas joins
# on the columns the two tables have in common.
plays = plays.merge(targeted_receiver, how='left')

# Look that receiver up in the players table (targetNflId on the play side, nflId on the player side)
plays = plays.merge(players, how='left', left_on='targetNflId', right_on='nflId')
plays.iloc[0]

# Begin constructing plays_expanded, our pre-processed training data:

In [None]:
# Play context, penalty and outcome columns, plus the targeted receiver's id and name
plays_expanded_columns = [
    'gameId', 'playId', 'down', 'yardsToGo',
    'penaltyCodes', 'penaltyJerseyNumbers',
    'passResult', 'offensePlayResult', 'playResult', 'epa',
    'nflId', 'displayName',
]
plays_expanded = plays[plays_expanded_columns]
plays_expanded

# Filter tracking data for only the frames where ball is released:

In [None]:
# Keep only the tracking rows tagged with the 'pass_forward' event, i.e. the frame
# in which the ball is released on each play (one row per tracked entity in that frame)
release_df = filter_tracking_frames_by_event(tracking_df, ['pass_forward'])
release_df

# Merge receiver tracking data with plays_expanded:

In [None]:
# plays_expanded.nflId is the targeted receiver's id (it came from the players merge on
# targetNflId), so joining on (gameId, playId, nflId) picks that receiver's own tracking
# row in the release frame. Columns present on both sides get a '_y' suffix on the
# tracking side and are discarded, which keeps the play-level version.
plays_expanded = plays_expanded.merge(
    release_df,
    how='left',
    on=['gameId', 'playId', 'nflId'],
    suffixes=('', '_y'),
)
columns_to_discard = plays_expanded.filter(regex='_y$').columns.tolist() + ['time', 'dis', 'event']
plays_expanded = plays_expanded.drop(columns=columns_to_discard)

# Tag the receiver's columns with '_o'
plays_expanded = plays_expanded.rename(columns={
    'team': 'team_o',
    'nflId': 'nflId_o',
    'displayName': 'displayName_o',
    'x': 'x_o',
    'y': 'y_o',
    's': 'speed_o',
    'a': 'acceleration_o',
    'o': 'orientation_o',
    'dir': 'dir_o',
    'jerseyNumber': 'jerseyNumber_o',
    'position': 'position_o',
})
plays_expanded.columns

# Merge ball tracking data with plays_expanded:

In [None]:
def football_filter(df):
    """
    Keep only the rows that track the ball, i.e. rows whose displayName is 'Football'.
    :param df: Pandas dataframe containing tracking data by frame (e.g. already filtered for the ball release event)
    :return: Pandas Dataframe containing only the ball's tracking rows
    """
    is_ball = df['displayName'].isin(['Football'])
    return df.loc[is_ball]

# Ball rows of the release frame
football_df = football_filter(release_df)

# The ball is joined per play (gameId, playId). Columns that clash with plays_expanded
# get a '_y' suffix and are discarded together with the ball's identity/heading columns,
# leaving only its position, speed and acceleration.
plays_expanded = plays_expanded.merge(
    football_df,
    how='left',
    on=['gameId', 'playId'],
    suffixes=('', '_y'),
)
ball_columns_to_discard = plays_expanded.filter(regex='_y$').columns.tolist() + [
    'team', 'time', 'dis', 'event', 'o', 'dir', 'nflId', 'jerseyNumber', 'position', 'displayName',
]
plays_expanded = plays_expanded.drop(columns=ball_columns_to_discard)

# Tag the ball's columns with '_b'
plays_expanded = plays_expanded.rename(columns={
    'x': 'x_b',
    'y': 'y_b',
    's': 'speed_b',
    'a': 'acceleration_b',
})
plays_expanded.iloc[0]

# Calculate DTR (Distance to Targeted Receiver) at time of release for each defender:

In [None]:
# Placeholder only: temp_df is rebound to the merged defender frame returned by
# find_DTR_at_release() further down in this cell
temp_df = []
import math

def find_DTR_at_release(plays_df=None, release_frames=None):
    """
    Calculates DTR (Distance to Targeted Receiver) for each defender at the time of release
    :param plays_df: Per-play dataframe holding the targeted receiver's position in 'x_o' / 'y_o';
                     defaults to the global `plays_expanded`
    :param release_frames: Tracking rows of the ball release frame; defaults to the global `release_df`
    :return: Returns (temp_df, DTR_release_df): the merged defender rows with a
             'distance_from_receiver' column, and a trimmed dataframe of defenders and their
             distances to the targeted receiver at time of release
    """
    if plays_df is None:
        plays_df = plays_expanded
    if release_frames is None:
        release_frames = release_df

    # Filter for defensive positions only
    def_pos = ['DL', 'FS', 'DB', 'DE', 'OLB', 'SS', 'CB', 'ILB', 'S', 'MLB', 'NT', 'LB']

    # Right merge: one row per tracked entity in the release frame, carrying its play's receiver data.
    # Columns present on both sides keep the play-level value; the tracking-side '_y' duplicates
    # are dropped from the MERGED frame (the original filtered the wrong frame, so nothing was dropped).
    temp_df = pd.merge(plays_df, release_frames, how='right', on=['gameId', 'playId'], suffixes=('', '_y'))
    temp_df = temp_df.drop(columns=temp_df.filter(regex='_y$').columns.tolist())
    # .copy() so the distance column below is added to an independent frame, not to a slice
    temp_df = temp_df[temp_df['position'].isin(def_pos)].copy()

    # Calculate DTR for each defender (Euclidean distance between defender and receiver)
    temp_df['distance_from_receiver'] = (
        ((temp_df['x'] - temp_df['x_o']) ** 2) + ((temp_df['y'] - temp_df['y_o']) ** 2)
    ) ** (1/2)

    # Create new dataframe consisting only of relevant defender distance data.
    # frameId is the play-level column when plays_df has one (the tracking copy was suffixed and dropped).
    DTR_release_df = pd.DataFrame({
        'defender_id': temp_df['nflId'],
        'defender_name': temp_df['displayName'],
        'defender_dist_to_target_receiver': temp_df['distance_from_receiver'],
        'gameId': temp_df['gameId'],
        'playId': temp_df['playId'],
        'frameId': temp_df['frameId'],
    })

    return temp_df, DTR_release_df
    
# Run the release-time calculation: temp_df receives the merged defender rows,
# DTR_release_df the trimmed table of defender distances
temp_df, DTR_release_df = find_DTR_at_release()
display(DTR_release_df)

In [None]:
# Order defenders by play and frame, closest to the targeted receiver first
release_sort_columns = ['gameId', 'playId', 'frameId', 'defender_dist_to_target_receiver']
sorted_DTR_release_df = DTR_release_df.sort_values(release_sort_columns)

# DEBUG: show every defender's DTR per ball release frame in that order
display(sorted_DTR_release_df)

# The first row of each (gameId, playId, frameId) group is the closest defender at release
closest_DTR_release_df = (
    sorted_DTR_release_df
    .drop_duplicates(subset=['gameId', 'playId', 'frameId'], keep='first')
    .reset_index(drop=True)
)
display(closest_DTR_release_df)

# Merge closest defender DTR at release data into plays_expanded:

In [None]:
# Step 1: attach the closest defender at release (id, name, distance) to each play.
plays_expanded = plays_expanded.merge(
    closest_DTR_release_df,
    how='left',
    on=['gameId', 'playId', 'frameId'],
)

# Step 2: fetch that defender's own tracking row in the release frame by matching
# defender_id against the tracking nflId. Clashing columns get a '_y' suffix on the
# tracking side and are discarded.
plays_expanded = plays_expanded.merge(
    release_df,
    how='left',
    left_on=['gameId', 'playId', 'defender_id'],
    right_on=['gameId', 'playId', 'nflId'],
    suffixes=('', '_y'),
)
defender_columns_to_discard = plays_expanded.filter(regex='_y$').columns.tolist() + [
    'time', 'dis', 'event', 'nflId',
]
plays_expanded = plays_expanded.drop(columns=defender_columns_to_discard)

# Tag the defender's columns with '_d'
plays_expanded = plays_expanded.rename(columns={
    'team': 'team_d',
    'displayName': 'displayName_d',
    'x': 'x_d',
    'y': 'y_d',
    's': 'speed_d',
    'a': 'acceleration_d',
    'o': 'orientation_d',
    'dir': 'dir_d',
    'jerseyNumber': 'jerseyNumber_d',
    'position': 'position_d',
})
plays_expanded.iloc[0]

# Calculate DTR for defenders on ball arrival (as opposed to release):

In [None]:
# DEBUG FUNCTION TO HELP VIEW ALL INFO FOR A FRAME 
import numpy as np

def display_info(df, game_id, play_id, frame_id):
    """
    Debug helper: display every row of df that belongs to one frame of one play.
    :param df: Pandas dataframe with 'gameId', 'playId' and 'frameId' columns
    :param game_id: gameId to match
    :param play_id: playId to match
    :param frame_id: frameId to match
    :return: None (the matching rows are shown with display())
    """
    # Select with the boolean mask directly: positions from np.where only coincide with
    # index labels when df has a default 0..n-1 index, whereas the mask works for any index
    frame_mask = (df['gameId'] == game_id) & (df['playId'] == play_id) & (df['frameId'] == frame_id)
    display(df[frame_mask])

# Example: show all tracking rows of frame 1 of play 1710 in game 2018102110
display_info(tracking_df, 2018102110, 1710, 1)

# Pre-processing the data

Every play with a 'pass_outcome_complete' event should also have a 'pass_arrived' event, but on some 'pass_outcome_incomplete' plays a 'pass_arrived' event is never triggered because the ball was never thrown close enough to the receiver. For those plays, we treat the frame at which 'pass_outcome_incomplete' is triggered as the 'pass_arrived' frame. For plays where both 'pass_arrived' and 'pass_outcome_incomplete' are triggered (possibly at different frames), we only consider the 'pass_arrived' event.

TODO: Explain more on why we only consider 'pass_arrived' as the important event.
        List the three cases out (pass complete, pass incomplete and arrived, pass incomplete but not arrived)

In [None]:
def drop_duplicate_arrive_incomplete(df):
    """
    Keep one row per player per play when both 'pass_arrived' and 'pass_outcome_incomplete' occur on the same play.
    Rows are ordered by player, game, play and finally event name; 'pass_arrived' sorts before
    'pass_outcome_incomplete', so keeping the first row of each (gameId, playId, nflId) group keeps the
    'pass_arrived' row whenever both events exist.
    :param df: Dataframe containing tracking player-frame data filtered for only player-frames with 'pass_arrived' or 'pass_outcome_incomplete' events.
    :return: Returns dataframe with a single row per (gameId, playId, nflId), preferring the 'pass_arrived' row.
    """
    player_play_keys = ['gameId', 'playId', 'nflId']
    player_first_order = ['nflId', 'gameId', 'playId']

    # Sanity check: the full input, grouped by player
    display(df.sort_values(player_first_order))

    # Sanity check: every row that shares its player/play with another row
    repeated_rows = df[df.duplicated(player_play_keys, keep=False)]
    display(repeated_rows.sort_values(player_first_order))

    # Order so that the preferred event comes first, then keep that first row
    ordered_df = df.sort_values(player_first_order + ['event'])
    fixed_df = ordered_df.drop_duplicates(subset=player_play_keys, keep='first')

    # Sanity check: both event types should still appear (at the start and end of this view)
    display(fixed_df.sort_values('event'))

    return fixed_df


# Collect every tracking row tagged 'pass_arrived' or 'pass_outcome_incomplete'
pass_arrived_incomplete_df = filter_tracking_frames_by_event(tracking_df, ['pass_arrived', 'pass_outcome_incomplete'])

# Sanity check (don't delete)
display(pass_arrived_incomplete_df)
# print(set(pass_arrived_incomplete_df['event']))

# When a player has both a 'pass_arrived' row and a 'pass_outcome_incomplete' row on the same play,
# keep only the 'pass_arrived' row
pass_arrived_df = drop_duplicate_arrive_incomplete(pass_arrived_incomplete_df)

# The remaining 'pass_outcome_incomplete' rows belong to plays without a 'pass_arrived' event;
# relabel them so that every row in pass_arrived_df carries the 'pass_arrived' event
pass_arrived_df.loc[pass_arrived_df.event == 'pass_outcome_incomplete', 'event'] = 'pass_arrived'

# Sanity check (don't delete)
display(pass_arrived_df)
# print(set(pass_arrived_df['event']))

In [None]:
def find_DTR_at_arrival(plays_expanded, df):
    """
    Calculates DTR (Distance to Targeted Receiver) for each defender at the time of arrival
    :param plays_expanded: Per-play dataframe holding the targeted receiver's position in 'x_o' / 'y_o'
    :param df: Tracking rows of the ball arrival frame (needs 'gameId', 'playId', 'nflId',
               'displayName', 'position', 'x', 'y' and a 'frameId' on at least one of the two inputs)
    :return: Returns (temp_df, DTR_arrival_df): the merged defender rows with a
             'distance_from_receiver' column, and a trimmed dataframe of defenders and their
             distances to the targeted receiver at time of arrival

    NOTE(review): the receiver position is read from plays_expanded['x_o'] / ['y_o']. In this
    notebook those columns are filled from the 'pass_forward' (release) frame, so the distance is
    measured to the receiver's release-time position - confirm that this is intended.
    """
    def_pos = ['DL', 'FS', 'DB', 'DE', 'OLB', 'SS', 'CB', 'ILB', 'S', 'MLB', 'NT', 'LB']

    # Right merge: one row per tracked entity in df, carrying its play's receiver data.
    # Columns present on both sides keep the plays_expanded value; the '_y' duplicates from df
    # are dropped from the MERGED frame (the original filtered plays_expanded, so nothing was dropped).
    temp_df = pd.merge(plays_expanded, df, how='right', on=['gameId', 'playId'], suffixes=('', '_y'))
    temp_df = temp_df.drop(columns=temp_df.filter(regex='_y$').columns.tolist())
    # .copy() so the distance column below is added to an independent frame, not to a slice
    temp_df = temp_df[temp_df['position'].isin(def_pos)].copy()

    # Euclidean distance between each defender and the receiver position
    temp_df['distance_from_receiver'] = (
        ((temp_df['x'] - temp_df['x_o']) ** 2) + ((temp_df['y'] - temp_df['y_o']) ** 2)
    ) ** (1/2)

    # frameId is plays_expanded's column when it has one (df's copy was suffixed and dropped)
    DTR_arrival_df = pd.DataFrame({
        'defender_id': temp_df['nflId'],
        'defender_name': temp_df['displayName'],
        'defender_dist_to_target_receiver': temp_df['distance_from_receiver'],
        'gameId': temp_df['gameId'],
        'playId': temp_df['playId'],
        'frameId': temp_df['frameId'],
    })
    return temp_df, DTR_arrival_df

# Run the arrival-time calculation on the 'pass_arrived' rows: temp_df receives the merged
# defender rows, DTR_arrival_df the trimmed table of defender distances
temp_df, DTR_arrival_df = find_DTR_at_arrival(plays_expanded, pass_arrived_df)
display(temp_df)
display(DTR_arrival_df)

We want to get the closest three defenders to the targeted receiver at the moment of ball arrival who were NOT the closest defender to the targeted receiver at the moment of ball release.

TODO: Explain why this is the case some more.

In [None]:
dtr_sort_columns = ['gameId', 'playId', 'frameId', 'defender_dist_to_target_receiver']

# Get DTR (ascending) for all defenders at the moment of ball arrival for each play in chronological order
sorted_DTR_arrival_df = DTR_arrival_df.sort_values(dtr_sort_columns)
display(sorted_DTR_arrival_df)

# DEBUG: Display DTR (ascending) for all defenders at the moment of release for each play in chronological order
display(DTR_release_df.sort_values(dtr_sort_columns))

# Get the closest defender by DTR at the moment of release for each play in chronological order
sorted_closest_DTR_release_df = closest_DTR_release_df.sort_values(dtr_sort_columns)
display(sorted_closest_DTR_release_df)

# Stack the closest-at-release defenders underneath the at-arrival defenders, then remove every row whose
# (gameId, playId, frameId, defender_id) occurs more than once (keep=False drops BOTH copies). A defender
# who was closest at release and also appears at arrival is thereby excluded from the arrival ranking.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
sorted_closest_defenders_arrival_with_exclusion_df = pd.concat(
    [sorted_DTR_arrival_df, sorted_closest_DTR_release_df]
).drop_duplicates(subset=['gameId', 'playId', 'frameId', 'defender_id'], keep=False)
display(sorted_closest_defenders_arrival_with_exclusion_df)

def get_defender_rank(x):
    """
    Number the rows of x from 1 to len(x), in their current order, in a 'defender_rank' column.
    :param x: Pandas dataframe (modified in place)
    :return: The same dataframe x, now with the 'defender_rank' column
    """
    row_count = len(x)
    x['defender_rank'] = np.arange(row_count) + 1
    return x

# Rank the remaining defenders within each play (1 = first row of the play).
# Rows keep the order they already have in sorted_closest_defenders_arrival_with_exclusion_df, so the
# running row count per (gameId, playId, frameId) group is the rank. cumcount replaces the former
# groupby.apply + regroup, which depends on how apply treats the group keys across pandas versions.
# Rows with a missing key cannot be assigned to a play and are left out (groupby skips them as well).
group_keys = ['gameId', 'playId', 'frameId']
ranked_defenders_df = sorted_closest_defenders_arrival_with_exclusion_df.dropna(subset=group_keys).copy()

# Group defenders by each play
grouped_df = ranked_defenders_df.groupby(group_keys)
ranked_defenders_df['defender_rank'] = grouped_df.cumcount() + 1
grouped_df_apply = ranked_defenders_df.groupby(group_keys)

# TODO: CHECK CORRECTNESS
# Keep only the three closest defenders by DTR for each play at the moment of arrival
top_three_defenders_arrival_with_exclusion = grouped_df_apply.head(3)
display(top_three_defenders_arrival_with_exclusion)

In [None]:
# Spot check: ranked defenders for every play whose playId is 4104 (playId alone, so this can span several games)
top_three_defenders_arrival_with_exclusion[top_three_defenders_arrival_with_exclusion['playId'] == 4104]

In [None]:
# Number of defender rows kept after limiting each play to its top three
len(top_three_defenders_arrival_with_exclusion)

# Adding receiver and ball at arrival

In [None]:
# No separate event filter is needed here: every row of pass_arrived_df already carries
# the 'pass_arrived' event.

# Fetch the targeted receiver's own tracking row at arrival by matching nflId_o against the
# tracking nflId. Clashing columns get a '_y' suffix on the tracking side and are discarded,
# together with the identity columns that plays_expanded already holds for the receiver.
plays_expanded = plays_expanded.merge(
    pass_arrived_df,
    how='left',
    left_on=['gameId', 'playId', 'nflId_o'],
    right_on=['gameId', 'playId', 'nflId'],
    suffixes=('', '_y'),
)
receiver_arrival_columns_to_discard = plays_expanded.filter(regex='_y$').columns.tolist() + [
    'time', 'dis', 'event', 'team', 'displayName', 'jerseyNumber', 'position', 'nflId',
]
plays_expanded = plays_expanded.drop(columns=receiver_arrival_columns_to_discard)

# Tag the receiver-at-arrival columns with '_o_a'
plays_expanded = plays_expanded.rename(columns={
    'x': 'x_o_a',
    'y': 'y_o_a',
    's': 'speed_o_a',
    'a': 'acceleration_o_a',
    'o': 'orientation_o_a',
    'dir': 'dir_o_a',
})
plays_expanded.iloc[0]

In [None]:
def football_arrival_filter(df):
    """
    Keep only the rows that track the ball, i.e. rows whose displayName is 'Football'.
    :param df: Pandas dataframe containing tracking data by frame (must have a 'displayName' column)
    :return: Pandas Dataframe containing only the ball's tracking rows
    """
    is_ball = df['displayName'].isin(['Football'])
    return df.loc[is_ball]

# Ball rows of the arrival frame. football_filter is used here; it applies the same
# displayName == 'Football' test as football_arrival_filter.
football_arrival_df = football_filter(pass_arrived_df)

# The ball is joined per play (gameId, playId). Clashing columns get a '_y' suffix and are
# discarded together with the ball's identity/heading columns.
plays_expanded = plays_expanded.merge(
    football_arrival_df,
    how='left',
    on=['gameId', 'playId'],
    suffixes=('', '_y'),
)
ball_arrival_columns_to_discard = plays_expanded.filter(regex='_y$').columns.tolist() + [
    'team', 'time', 'dis', 'event', 'o', 'dir', 'nflId', 'jerseyNumber', 'position', 'displayName',
]
plays_expanded = plays_expanded.drop(columns=ball_arrival_columns_to_discard)

# Tag the ball-at-arrival columns with '_b_a'
plays_expanded = plays_expanded.rename(columns={
    'x': 'x_b_a',
    'y': 'y_b_a',
    's': 'speed_b_a',
    'a': 'acceleration_b_a',
})
plays_expanded.loc[0]

# Drop plays with no intended receiver and multiple penalties

In [None]:
# Keep only plays that have a targeted receiver. .copy() makes the result an independent frame,
# so the column added below is not written onto a slice of the previous dataframe.
plays_expanded = plays_expanded[plays_expanded['displayName_o'].notnull()].copy()

# drop plays with multiple penalties
# Splitting penaltyJerseyNumbers on spaces puts the jersey number of the first penalty in column 1;
# when several penalties are listed, that token still contains the ';' separator.
update = plays_expanded['penaltyJerseyNumbers'].str.split(' ', expand = True)
plays_expanded['penaltyJerseyNumbers_update'] = update[1]
plays_expanded = plays_expanded[~plays_expanded['penaltyJerseyNumbers_update'].str.contains(';', na = False)]
plays_expanded

# Drop all penalties except for DPI against the player being graded

In [None]:
# A play is kept when it has no penalty at all, or when its only penalty code is 'DPI'
# and the flagged jersey number equals the jersey number of the closest defender at release.
has_no_penalty = plays_expanded['penaltyCodes'].isnull()
is_dpi = plays_expanded['penaltyCodes'].isin(['DPI'])
flagged_jersey = plays_expanded['penaltyJerseyNumbers_update'].astype('float')
flag_is_on_graded_defender = flagged_jersey == plays_expanded['jerseyNumber_d']

plays_expanded = plays_expanded[has_no_penalty | (is_dpi & flag_is_on_graded_defender)]
plays_expanded

In [None]:
# Review the ranked (at most three per play) defenders before pivoting them into columns
display(top_three_defenders_arrival_with_exclusion)

In [None]:
# One row per (gameId, playId) and one column per defender_rank, holding that defender's
# distance to the targeted receiver (pivot_table's default aggregation is used)
temp_df = top_three_defenders_arrival_with_exclusion.pivot_table(
    values='defender_dist_to_target_receiver',
    index=['gameId', 'playId'],
    columns='defender_rank',
)

In [None]:
# Turn the (gameId, playId) index produced by the pivot back into ordinary columns so the
# table can be merged on them. The distance columns keep their rank labels 1, 2 and 3, which
# the later rename expects. The former reindex(...) call was removed: its result was discarded
# and it named columns ('defender_1', ...) that do not exist in this table.
temp_df.reset_index(drop = False, inplace = True)
temp_df

In [None]:
# Attach the per-play distances of the ranked defenders at arrival; clashing columns get a
# '_y' suffix on the pivot side and are discarded
plays_expanded = plays_expanded.merge(
    temp_df,
    how='left',
    on=['gameId', 'playId'],
    suffixes=('', '_y'),
)
plays_expanded = plays_expanded.drop(columns=plays_expanded.filter(regex='_y$').columns.tolist())

# The pivot labelled its columns by rank (1, 2, 3); give them descriptive names
plays_expanded = plays_expanded.rename(columns={
    1: 'dist_defender_1_a',
    2: 'dist_defender_2_a',
    3: 'dist_defender_3_a',
})
plays_expanded

# Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
import seaborn as sns

In [None]:
# Model inputs plus the target ('playResult'), in the column order used for training
model_columns = [
    'down', 'yardsToGo',
    # targeted receiver at release
    'x_o', 'y_o', 'speed_o', 'acceleration_o', 'orientation_o', 'dir_o',
    # ball at release
    'x_b', 'y_b', 'speed_b', 'acceleration_b',
    # closest defender at release
    'defender_dist_to_target_receiver',
    'x_d', 'y_d', 'speed_d', 'acceleration_d', 'orientation_d', 'dir_d',
    # targeted receiver at arrival
    'x_o_a', 'y_o_a', 'speed_o_a', 'acceleration_o_a', 'orientation_o_a', 'dir_o_a',
    # ball at arrival
    'x_b_a', 'y_b_a', 'speed_b_a', 'acceleration_b_a',
    # ranked defenders at arrival
    'dist_defender_1_a', 'dist_defender_2_a', 'dist_defender_3_a',
    'playResult',
]

# Only plays with a value in every one of these columns are used
x = plays_expanded[model_columns].dropna()
display(x)
print(x.describe())

# Target vector
y = x['playResult'].values.ravel()
display(y)

# Feature matrix: everything except the target
x = x.drop(columns=['playResult'])
display(x)

In [None]:
# Hold out 15% of the plays for testing; random_state fixes the split so it is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [None]:
# Fit a random forest with scikit-learn's default settings on the training split.
# NOTE(review): no random_state is passed here, so the fitted model may differ between runs - confirm this is acceptable.
model = RandomForestRegressor()
model.fit(x_train, y_train)

In [None]:
# Predicted play result for the held-out plays
y_pred = model.predict(x_test)
y_pred

In [None]:
# Root mean squared error of the random forest on the held-out plays
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
import matplotlib.pyplot as plt

# Actual play result (horizontal axis) against the random forest prediction (vertical axis) on the test split
plt.scatter(y_test, y_pred)
plt.show()

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the random forest on the held-out plays
score = r2_score(y_test, y_pred)
score

In [None]:
# Predict every play in x (training and test rows together); these predictions are used as
# the "expected yards" per play further down.
# NOTE: x contains the rows the model was fitted on, so this RMSE is not an out-of-sample estimate.
expected_yards = model.predict(x)
mse = mean_squared_error(y, expected_yards)
rmse = np.sqrt(mse)
rmse

In [None]:
# Pair every modelled play with its closest defender at release and its actual result.
# Rows are selected through x.index, i.e. exactly the plays that were passed to
# model.predict(x), so they line up one-to-one and in the same order with expected_yards.
# (Re-deriving the rows with a second dropna() only works while both selections happen to agree.)
# .copy() makes the frame independent so the prediction column is not written onto a slice.
defenders_df = plays_expanded.loc[x.index, ['defender_name', 'playResult']].copy()

yards_pred = list(expected_yards)
defenders_df['yards_pred'] = yards_pred

defenders_df = defenders_df.rename(columns={'playResult': 'yards_actual'})
defenders_df = defenders_df[['defender_name', 'yards_actual', 'yards_pred']]

# Only grade defenders that appear on more than 50 modelled plays
grouped_defenders = defenders_df.groupby('defender_name', sort=False).filter(lambda group: len(group) > 50)
grouped_defenders = grouped_defenders.groupby('defender_name', sort=False)

# Average actual and predicted yards per defender; yards_saved > 0 means fewer yards were
# allowed than the model predicted
best_defenders = grouped_defenders.mean()
best_defenders['yards_saved'] = best_defenders.yards_pred - best_defenders.yards_actual
best_defenders = best_defenders.sort_values(by=['yards_saved'], ascending=False)
display(best_defenders)

# Same table, lowest yards_saved first
worst_defenders = best_defenders.sort_values(['yards_saved'], ascending=True)
display(worst_defenders)

# XGBOOST

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt 
# Fit a gradient-boosted regressor with default settings on the same training split
xgbr = xgb.XGBRegressor(verbosity=0)
xgbr.fit(x_train, y_train)
# 10-fold cross-validation on the training split, using the estimator's default score.
# NOTE(review): the folds are shuffled without a random_state, so the CV score may vary between runs.
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbr, x_train, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
# Evaluate on the held-out plays (y_pred now holds the XGBoost predictions)
y_pred = xgbr.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("RMSE: %.2f" % (mse**(1/2.0))) 

In [None]:
# Coefficient of determination (R^2) of the XGBoost predictions on the held-out plays
score = r2_score(y_test, y_pred)
score

In [None]:
# Predict every play in x (training and test rows together) with the XGBoost model; this
# replaces the random forest's expected_yards.
# NOTE: x contains the rows the model was fitted on, so this RMSE is not an out-of-sample estimate.
expected_yards = xgbr.predict(x)
mse = mean_squared_error(y, expected_yards)
rmse = np.sqrt(mse)
rmse

In [None]:
# Actual play result (horizontal axis) against the XGBoost prediction (vertical axis) on the test split
plt.scatter(y_test, y_pred)
plt.show()

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot
# Plot the feature importances of the fitted XGBoost model
plot_importance(xgbr)
pyplot.show()

In [None]:
# Same defender grading as in the Random Forest section, now with the XGBoost predictions.
# Rows are selected through x.index, i.e. exactly the plays that were passed to
# xgbr.predict(x), so they line up one-to-one and in the same order with expected_yards.
# (Re-deriving the rows with a second dropna() only works while both selections happen to agree.)
# .copy() makes the frame independent so the prediction column is not written onto a slice.
defenders_df = plays_expanded.loc[x.index, ['defender_name', 'playResult']].copy()

yards_pred = list(expected_yards)
defenders_df['yards_pred'] = yards_pred

defenders_df = defenders_df.rename(columns={'playResult': 'yards_actual'})
defenders_df = defenders_df[['defender_name', 'yards_actual', 'yards_pred']]

# Only grade defenders that appear on more than 50 modelled plays
grouped_defenders = defenders_df.groupby('defender_name', sort=False).filter(lambda group: len(group) > 50)
grouped_defenders = grouped_defenders.groupby('defender_name', sort=False)

# Average actual and predicted yards per defender; yards_saved > 0 means fewer yards were
# allowed than the model predicted
best_defenders = grouped_defenders.mean()
best_defenders['yards_saved'] = best_defenders.yards_pred - best_defenders.yards_actual
best_defenders = best_defenders.sort_values(by=['yards_saved'], ascending=False)
display(best_defenders)

# Same table, lowest yards_saved first
worst_defenders = best_defenders.sort_values(['yards_saved'], ascending=True)
display(worst_defenders)