# NFL 2018 Defense Analyzer

# Importing Libraries

In [None]:
# acquire libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re


#explore libraries
from scipy import stats
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
plt.rc("figure", figsize=(12, 7))
plt.rc("font", size=14)

# model libraries
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler


pd.set_option('display.max_columns', None)

In [None]:
def get_plays_data(path='../input/nfl-big-data-bowl-2021/plays.csv'):
    '''
    Load the plays csv into a dataframe.

    Parameters
    ----------
    path : str, optional
        Location of the plays csv. The default is the path that used to be
        hard-coded here, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The csv contents exactly as parsed by ``pd.read_csv``.
    '''
    return pd.read_csv(path)

# Status message emitted when this cell runs. The "Acquire.py" name presumably
# refers to the module this cell was copied from -- TODO confirm.
print("Acquire.py Loaded Successfully")

In [None]:
def prep_plays_data():
    '''
    Acquire the plays csv (via get_plays_data) and prepare the pass plays
    for the mvp.

    Steps, in order:
      * keep a fixed subset of columns and only rows whose playType is
        'play_type_pass'
      * recode typeDropback, possessionTeam and passResult with lookup tables
      * keep only rows whose personnelO / personnelD consist solely of the
        "n RB, n TE, n WR" / "n DL, n LB, n DB" pattern and split those into
        RB, TE, WR, DL, LB and DB columns
      * add dummy columns for offenseFormation and 0/1 flags for the
        four_three, three_four, nickel and dime defensive groupings
      * rename typeDropback -> QB_under_pressure, passResult -> pass_stopped,
        possessionTeam -> team_by_comp_yds

    Returns
    -------
    pandas.DataFrame
        The prepared plays. The index is reset before rows with nulls are
        dropped, so the returned index can contain gaps.

    Notes
    -----
    The recodes are assigned back to the frame instead of using
    ``df[col].replace(..., inplace=True)``; the chained in-place form
    operates on a temporary object and is not guaranteed to update the frame.
    '''
    # (old, new) label pairs applied, in this order, to the stringified
    # dropback value
    dropback_labels = (
        ('DESIGNED_ROLLOUT_RIGHT', 'normal'),
        ('TRADITIONAL', 'normal'),
        ('DESIGNED_ROLLOUT_LEFT', 'normal'),
        ('SCRAMBLE_ROLLOUT_RIGHT', 'scramble'),
        ('SCRAMBLE', 'scramble'),
        ('SCRAMBLE_ROLLOUT_LEFT', 'scramble'),
        ('scramble_ROLLOUT_LEFT', 'scramble'),
    )

    def relabel_dropback(value):
        # one pass that is equivalent to the seven successive
        # str(value).replace(...) passes this function used to make
        text = str(value)
        for old, new in dropback_labels:
            text = text.replace(old, new)
        return text

    # acquire the plays csv and save it as a dataframe
    df = get_plays_data()
    # keep only the useful columns for mvp
    df = df[['playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
             'offenseFormation', 'personnelO', 'defendersInTheBox', 'numberOfPassRushers',
             'personnelD', 'typeDropback', 'gameClock', 'absoluteYardlineNumber', 'epa',
             'playType', 'passResult', 'playResult', 'gameId', 'playId']]
    # filter out any data that is not a pass play (copy so the column
    # assignments below act on an independent frame)
    df = df[df.playType == 'play_type_pass'].copy()
    # creates 0 or 1 for traditional/designed dropbacks and scrambles
    df['typeDropback'] = df['typeDropback'].replace({'TRADITIONAL':0,'SCRAMBLE_ROLLOUT_RIGHT':1,
                                                     'SCRAMBLE':1,'DESIGNED_ROLLOUT_RIGHT':0,
                                                     'SCRAMBLE_ROLLOUT_LEFT':1,'DESIGNED_ROLLOUT_LEFT':0,
                                                     'UNKNOWN':0})
    # encode each team with the numeric value from this lookup table
    # (described by the original author as a ranking by cumulative passing yards)
    # NOTE(review): SF/DET and NYG/NYJ carry identical values -- confirm
    # this is intended and not a copy-paste slip.
    df['possessionTeam'] = df['possessionTeam'].replace({'TB': .428528, 'PIT': .417436, 'KC': .392355, 'ATL': .382256, 'LA': .376048, 'GB': .349394, 'PHI': .362307,
                                                         'NE': .358086, 'NYG': .335902, 'CLE': .335819, 'IND': .371082, 'HOU': .316947, 'SF': .303620, 'OAK': .312559,
                                                         'CAR': .314215, 'MIN': .316367, 'NO': .334329, 'LAC': .303868, 'DAL': .295094, 'DET': .303620, 'CHI': .291038,
                                                         'CIN': .273655, 'DEN': .287230, 'BAL': .296998, 'JAX': .259252, 'NYJ': .335902, 'MIA': .243442, 'WAS': .253872,
                                                         'TEN': .246753, 'BUF': .232516, 'ARI': .209504, 'SEA': .234916})

    # recode the pass result: complete -> 0, incomplete / intercepted -> 1
    # (any other code is left untouched)
    df['passResult'] = df['passResult'].replace({'C': 0,'I' : 1, 'IN' : 1})
    # temporary column holding the "(number) RB, (number) TE, (number) WR" part
    df['tempO'] = df.personnelO.str.extract(r'(\d RB, \d TE, \d WR)')
    # temporary column holding the "(number) DL, (number) LB, (number) DB" part
    df['tempD'] = df.personnelD.str.extract(r'(\d DL, \d LB, \d DB)')
    # keep the rows whose personnelO is exactly the extracted string
    df = df[df.personnelO == df.tempO]
    # keep the rows whose personnelD is exactly the extracted string
    df = df[df.personnelD == df.tempD].copy()
    # split personnelO on comma + space and strip the position labels
    offense_counts = df.personnelO.str.split(', ', expand = True)
    df['RB'] = offense_counts[0].str.replace(' RB', '', regex=False)
    df['TE'] = offense_counts[1].str.replace(' TE', '', regex=False)
    df['WR'] = offense_counts[2].str.replace(' WR', '', regex=False)
    # split the defensive personnel the same way
    defense_counts = df.tempD.str.split(', ', expand = True)
    df['DL'] = defense_counts[0].str.replace(' DL', '', regex=False)
    df['LB'] = defense_counts[1].str.replace(' LB', '', regex=False)
    df['DB'] = defense_counts[2].str.replace(' DB', '', regex=False)
    # create dummies for offensive formation
    formation = pd.get_dummies(df.offenseFormation)
    # Stringify the dropback value and map any remaining raw labels to
    # 'normal' / 'scramble'.
    # NOTE(review): every label targeted here was already recoded to 0/1
    # above, so this step mostly just turns the values into strings; missing
    # values become text as well and therefore survive the final dropna.
    df.typeDropback = df.typeDropback.apply(relabel_dropback)
    df = df.rename(columns = {'typeDropback' : 'QB_under_pressure', 'passResult' : 'pass_stopped', 'possessionTeam': 'team_by_comp_yds'})
    # join the formation dummies onto the plays
    df = pd.concat([df, formation], axis = 1)
    # drop temporary columns and the columns that have been encoded
    df = df.drop(columns = ['personnelO', 'personnelD', 'tempO', 'tempD', 'playType', 'offenseFormation'])
    # Changing datatype from object to int
    df = df.astype({'DL':'int', 'LB':'int','DB':'int'})
    # creating formation columns
    df['four_three'] = np.where((df['DL'] == 4) & (df['LB'] == 3),1,0)
    df['three_four'] = np.where((df['DL'] == 3) & (df['LB'] == 4),1,0)
    df['nickel'] = np.where(df['DB'] == 5, 1, 0)
    df['dime'] = np.where(df['DB'] == 6, 1, 0)
    # renumber the index and drop the old one, then drop rows with nulls
    df = df.reset_index(drop=True)
    df = df.dropna()
    return df



### Function for returning Passing Team Rank

def passing_team_rank(path='plays.csv'):
    '''
    Rank the teams by the summed offensePlayResult of their pass plays.

    Parameters
    ----------
    path : str, optional
        Location of the plays csv. The default is the relative path that
        used to be hard-coded here.
        NOTE(review): this differs from the path get_plays_data reads
        ('../input/nfl-big-data-bowl-2021/plays.csv') -- confirm which one
        is intended.

    Returns
    -------
    pandas.DataFrame
        Columns 'possessionTeam' and 'offensePlayResult', sorted from the
        highest total to the lowest.
    '''
    # brings in the plays csv
    plays = pd.read_csv(path)
    # returns only pass plays
    pass_plays = plays[plays.playType == 'play_type_pass']
    # groups by team and sums the offense play result regardless of penalties
    totals = pass_plays.groupby('possessionTeam')['offensePlayResult'].sum().reset_index()
    # sorts the summed results from highest to lowest
    return totals.sort_values(by='offensePlayResult', ascending=False)


def explore_plays_data():
    '''
    Acquire the plays csv (via get_plays_data), prepare the pass plays for
    exploration and split them into train, validate and test.

    Unlike prep_plays_data this keeps possessionTeam (renamed to
    team_by_comp_yds) un-encoded, keeps offenseFormation, personnelO,
    personnelD and playType, and does not add formation dummies.

    Returns
    -------
    (train, validate, test) : tuple of pandas.DataFrame
        Splits of 42% / 28% / 30% of the prepared rows, each stratified on
        pass_stopped with random_state=123.

    Notes
    -----
    The recodes are assigned back to the frame instead of using
    ``df[col].replace(..., inplace=True)``; the chained in-place form
    operates on a temporary object and is not guaranteed to update the frame.
    '''
    # acquire the plays csv and save it as a dataframe
    df = get_plays_data()
    # keep only the useful columns for mvp (copy so the assignments below
    # act on an independent frame)
    df = df[['playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
             'offenseFormation', 'personnelO', 'defendersInTheBox', 'numberOfPassRushers',
             'personnelD', 'typeDropback', 'gameClock', 'absoluteYardlineNumber', 'epa',
             'playType', 'passResult', 'playResult']].copy()
    # creates 0 or 1 for traditional/designed dropbacks and scrambles
    df['typeDropback'] = df['typeDropback'].replace({'TRADITIONAL':0,'SCRAMBLE_ROLLOUT_RIGHT':1,
                                                     'SCRAMBLE':1,'DESIGNED_ROLLOUT_RIGHT':0,
                                                     'SCRAMBLE_ROLLOUT_LEFT':1,'DESIGNED_ROLLOUT_LEFT':0,
                                                     'UNKNOWN':0})
    df = df.rename(columns = {'typeDropback' : 'QB_under_pressure', 'passResult' : 'pass_stopped', 'possessionTeam': 'team_by_comp_yds'})
    # recode the pass result: complete -> 0, incomplete / intercepted -> 1
    # (any other code is left untouched)
    df['pass_stopped'] = df['pass_stopped'].replace({'C': 0,'I' : 1, 'IN' : 1})
    # filter out any data that is not a pass play
    df = df[df.playType == 'play_type_pass'].copy()
    # temporary column holding the "(number) RB, (number) TE, (number) WR" part
    df['tempO'] = df.personnelO.str.extract(r'(\d RB, \d TE, \d WR)')
    # temporary column holding the "(number) DL, (number) LB, (number) DB" part
    df['tempD'] = df.personnelD.str.extract(r'(\d DL, \d LB, \d DB)')
    # keep the rows whose personnelO is exactly the extracted string
    df = df[df.personnelO == df.tempO]
    # keep the rows whose personnelD is exactly the extracted string
    df = df[df.personnelD == df.tempD].copy()
    # split personnelO on comma + space and strip the position labels
    offense_counts = df.personnelO.str.split(', ', expand = True)
    df['RB'] = offense_counts[0].str.replace(' RB', '', regex=False)
    df['TE'] = offense_counts[1].str.replace(' TE', '', regex=False)
    df['WR'] = offense_counts[2].str.replace(' WR', '', regex=False)
    # split the defensive personnel the same way
    defense_counts = df.tempD.str.split(', ', expand = True)
    df['DL'] = defense_counts[0].str.replace(' DL', '', regex=False)
    df['LB'] = defense_counts[1].str.replace(' LB', '', regex=False)
    df['DB'] = defense_counts[2].str.replace(' DB', '', regex=False)
    # Changing datatype from object to int
    df = df.astype({'DL':'int', 'LB':'int','DB':'int'})
    # creating formation columns
    df['four_three'] = np.where((df['DL'] == 4) & (df['LB'] == 3),1,0)
    df['three_four'] = np.where((df['DL'] == 3) & (df['LB'] == 4),1,0)
    df['nickel'] = np.where(df['DB'] == 5, 1, 0)
    df['dime'] = np.where(df['DB'] == 6, 1, 0)

    # drop temporary columns
    df = df.drop(columns = ['tempO', 'tempD'])
    # renumber the index, then drop rows with nulls
    df = df.reset_index(drop=True)
    df = df.dropna()
    # split df into test (30%) and train_validate (70%)
    train_validate, test = train_test_split(df, test_size=.3, random_state=123, stratify = df.pass_stopped)

    # split train_validate off into train (60% of 70% = 42%) and validate (40% of 70% = 28%)
    train, validate = train_test_split(train_validate, test_size=.4, random_state=123, stratify = train_validate.pass_stopped)
    return train, validate, test



###################################################################################
############################## PHASE 2 ############################################
###################################################################################
        
        
############################### prep plays csv to combine with weeks ##############


def prep_plays_for_weeks():
    '''
    Acquire the plays csv (via get_plays_data) and prepare the pass plays so
    they can be combined with the weekly tracking csv's.

    This mirrors prep_plays_data, except that possessionTeam is encoded as
    an integer rank (1-32) and gameId / playId lead the column selection.
    No train, validate, test split is made.

    Returns
    -------
    pandas.DataFrame
        The prepared plays, still carrying gameId and playId. The index is
        reset before rows with nulls are dropped, so the returned index can
        contain gaps.

    Notes
    -----
    The recodes are assigned back to the frame instead of using
    ``df[col].replace(..., inplace=True)``; the chained in-place form
    operates on a temporary object and is not guaranteed to update the frame.
    '''
    # (old, new) label pairs applied, in this order, to the stringified
    # dropback value
    dropback_labels = (
        ('DESIGNED_ROLLOUT_RIGHT', 'normal'),
        ('TRADITIONAL', 'normal'),
        ('DESIGNED_ROLLOUT_LEFT', 'normal'),
        ('SCRAMBLE_ROLLOUT_RIGHT', 'scramble'),
        ('SCRAMBLE', 'scramble'),
        ('SCRAMBLE_ROLLOUT_LEFT', 'scramble'),
        ('scramble_ROLLOUT_LEFT', 'scramble'),
    )

    def relabel_dropback(value):
        # one pass that is equivalent to the seven successive
        # str(value).replace(...) passes this function used to make
        text = str(value)
        for old, new in dropback_labels:
            text = text.replace(old, new)
        return text

    # acquire the plays csv and save it as a dataframe
    df = get_plays_data()
    # keep only the useful columns for mvp
    df = df[['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
             'offenseFormation', 'personnelO', 'defendersInTheBox', 'numberOfPassRushers',
             'personnelD', 'typeDropback', 'gameClock', 'absoluteYardlineNumber', 'epa',
             'playType', 'passResult', 'playResult']]
    # filter out any data that is not a pass play (copy so the column
    # assignments below act on an independent frame)
    df = df[df.playType == 'play_type_pass'].copy()
    # creates 0 or 1 for traditional/designed dropbacks and scrambles
    df['typeDropback'] = df['typeDropback'].replace({'TRADITIONAL':0,'SCRAMBLE_ROLLOUT_RIGHT':1,
                                                     'SCRAMBLE':1,'DESIGNED_ROLLOUT_RIGHT':0,
                                                     'SCRAMBLE_ROLLOUT_LEFT':1,'DESIGNED_ROLLOUT_LEFT':0,
                                                     'UNKNOWN':0})
    # ranking the teams with the most cumulative passing yards
    df['possessionTeam'] = df['possessionTeam'].replace({'TB': 1, 'PIT': 2, 'KC': 4, 'ATL': 3, 'LA': 5, 'GB': 7, 'PHI': 8,
                                                         'NE': 9, 'NYG': 10, 'CLE': 11, 'IND': 6, 'HOU': 12, 'SF': 17, 'OAK': 16,
                                                         'CAR': 15, 'MIN': 14, 'NO': 13, 'LAC': 19, 'DAL': 18, 'DET': 20, 'CHI': 22,
                                                         'CIN': 24, 'DEN': 23, 'BAL': 21, 'JAX': 25, 'NYJ': 26, 'MIA': 28, 'WAS': 27,
                                                         'TEN': 29, 'BUF': 31, 'ARI': 32, 'SEA': 30})

    # recode the pass result: complete -> 0, incomplete / intercepted -> 1
    # (any other code is left untouched)
    df['passResult'] = df['passResult'].replace({'C': 0,'I' : 1, 'IN' : 1})
    # temporary column holding the "(number) RB, (number) TE, (number) WR" part
    df['tempO'] = df.personnelO.str.extract(r'(\d RB, \d TE, \d WR)')
    # temporary column holding the "(number) DL, (number) LB, (number) DB" part
    df['tempD'] = df.personnelD.str.extract(r'(\d DL, \d LB, \d DB)')
    # keep the rows whose personnelO is exactly the extracted string
    df = df[df.personnelO == df.tempO]
    # keep the rows whose personnelD is exactly the extracted string
    df = df[df.personnelD == df.tempD].copy()
    # split personnelO on comma + space and strip the position labels
    offense_counts = df.personnelO.str.split(', ', expand = True)
    df['RB'] = offense_counts[0].str.replace(' RB', '', regex=False)
    df['TE'] = offense_counts[1].str.replace(' TE', '', regex=False)
    df['WR'] = offense_counts[2].str.replace(' WR', '', regex=False)
    # split the defensive personnel the same way
    defense_counts = df.tempD.str.split(', ', expand = True)
    df['DL'] = defense_counts[0].str.replace(' DL', '', regex=False)
    df['LB'] = defense_counts[1].str.replace(' LB', '', regex=False)
    df['DB'] = defense_counts[2].str.replace(' DB', '', regex=False)
    # create dummies for offensive formation
    formation = pd.get_dummies(df.offenseFormation)
    # Stringify the dropback value and map any remaining raw labels to
    # 'normal' / 'scramble'.
    # NOTE(review): every label targeted here was already recoded to 0/1
    # above, so this step mostly just turns the values into strings; missing
    # values become text as well and therefore survive the final dropna.
    df.typeDropback = df.typeDropback.apply(relabel_dropback)
    df = df.rename(columns = {'typeDropback' : 'QB_under_pressure', 'passResult' : 'pass_stopped', 'possessionTeam': 'team_by_comp_yds'})
    # join the formation dummies onto the plays
    df = pd.concat([df, formation], axis = 1)
    # drop temporary columns and the columns that have been encoded
    df = df.drop(columns = ['personnelO', 'personnelD', 'tempO', 'tempD', 'playType', 'offenseFormation'])
    # Changing datatype from object to int
    df = df.astype({'DL':'int', 'LB':'int','DB':'int'})
    # creating formation columns
    df['four_three'] = np.where((df['DL'] == 4) & (df['LB'] == 3),1,0)
    df['three_four'] = np.where((df['DL'] == 3) & (df['LB'] == 4),1,0)
    df['nickel'] = np.where(df['DB'] == 5, 1, 0)
    df['dime'] = np.where(df['DB'] == 6, 1, 0)
    # renumber the index and drop the old one, then drop rows with nulls
    df = df.reset_index(drop=True)
    df = df.dropna()
    return df

# Status message emitted when this point of the cell is reached. The "Prep.py"
# name presumably refers to the module this code was copied from -- TODO confirm.
print("Prep.py Loaded Successfully")


################################### prep week csv ###########################

def filter_nfl_weeks(input_dir='../input/nfl-big-data-bowl-2021', output_dir='.'):
    '''
    Write a filtered copy of each weekly tracking csv (weeks 1-17) that only
    contains the rows whose event is 'pass_forward'.

    For every week ``week<N>.csv`` is read from input_dir and
    ``week<N>filtered.csv`` is written to output_dir. Null positions are
    filled with 'BALL'. One playId is removed from week 9 and one from
    week 10 because pass_forward was tagged on more than one frame for them.

    Parameters
    ----------
    input_dir : str, optional
        Directory holding the raw week csv's. Defaults to the location that
        used to be hard-coded here.
    output_dir : str, optional
        Directory the filtered csv's are written to. Defaults to the current
        working directory, as before.

    Notes
    -----
    Every file is written without the dataframe index. Previously the week 9
    and week 10 files were re-saved with the index, which gave only those two
    files an extra 'Unnamed: 0' column when read back.
    '''
    # week number -> playId dropped from that week
    # NOTE(review): the playId is removed for every game of the week, not
    # for one specific gameId -- confirm that is acceptable.
    excluded_play_ids = {9: 3640, 10: 2650}

    for i in range(1,18):
        # read a week csv
        df = pd.read_csv(os.path.join(input_dir, 'week' + str(i) + '.csv'))
        # keep only the pass_forward event (copy so the assignment below
        # acts on an independent frame)
        df = df[(df.event == 'pass_forward')].copy()
        # fill null values in position with 'BALL'
        df['position'] = df.position.fillna('BALL')
        # drop the play that was tagged pass_forward on several frames
        if i in excluded_play_ids:
            df = df[df.playId != excluded_play_ids[i]]
        # save the df as a new csv
        df.to_csv(os.path.join(output_dir, 'week' + str(i) + 'filtered.csv'), index=False)
        # print the week number after you run through the above steps
        print(f'{i}')
    

################################## getting new features ###########################
    
    
def combine_week_and_plays(week_num):
    '''
    Combine one filtered week csv with the prepped plays and find, for every
    play, the defender closest to the intended receiver.

    Parameters
    ----------
    week_num : int
        Week whose ``week<week_num>filtered.csv`` (in the working directory)
        is read.

    Returns
    -------
    pandas.DataFrame
        One row per play with the columns playid (gameId and playId
        concatenated as text), closest_dist, closest_x, closest_y,
        defender_receiver and week. When the intended receiver cannot be
        matched to an offensive player, or no defender with usable
        coordinates is present, the row holds 0, 0, 0 and 'unknown'.

    Raises
    ------
    ValueError
        If more than one offensive row in a play matches the intended
        receiver (raised by ``Series.item``).

    Notes
    -----
    Rows are collected in a list and turned into a dataframe once, because
    ``DataFrame.append`` was removed in pandas 2.0. The defender name is now
    initialised for every play; before, it could be unbound or carried over
    from the previous play when a play had no defenders.
    '''
    offense_positions = ['QB', 'RB', 'WR', 'FB', 'HB', 'TE']
    defense_positions = ['CB', 'OLB', 'FS', 'SS', 'ILB', 'MLB', 'LB', 'DB', 'S', 'DL', 'DE', 'NT']
    result_columns = ['playid', 'closest_dist', 'closest_x', 'closest_y', 'defender_receiver', 'week']

    ##############################################################
    #first we load our prepped plays and create a unique identifier
    ##############################################################
    plays = prep_plays_for_weeks()
    #gameId and playId joined as text form the unique identifier
    plays['playid'] = plays.gameId.astype(str) + plays.playId.astype(str)
    #drop old columns and any duplicates
    plays = plays.drop(columns = ['gameId', 'playId']).drop_duplicates()
    ##############################################################
    #second we load our week data and create a unique identifier
    ##############################################################
    #read in week data that contains only the pass_forward rows
    week = pd.read_csv('week' + str(week_num) + 'filtered.csv')
    week['playid'] = week.gameId.astype(str) + week.playId.astype(str)
    week = week.drop(columns = ['gameId', 'playId']).drop_duplicates()
    ##############################################################
    #third we merge the dataframes
    ##############################################################
    #merge plays and week so each tracking row carries a play description
    df = pd.merge(plays, week, on = 'playid', how = 'inner')
    df = df.drop_duplicates()
    ##############################################################
    #fourth we extract the intended receiver from the play description
    ##############################################################
    #names written as "X.Lastname" in the description, comma-joined;
    #the second name is taken to be the intended receiver
    desc = df.playDescription.str.findall(r'(\b[A-Z]+\.\b[A-Z]+\w+)').apply(','.join)
    names = desc.str.split(',', expand = True)
    df['receiver_last_name'] = names[1]
    #rebuild each tracked player's name as first initial dot last name
    name_parts = df.displayName.str.split(' ', expand = True)
    df['player_last_name'] = name_parts[0].astype(str).str[0] + '.' + name_parts[1]
    #filtering out the football
    df = df[df.displayName != 'Football']
    df = df.reset_index(drop=True)
    #hyphenated surnames are cut at the hyphen by the pattern above,
    #so restore the full surname
    df.loc[(df.receiver_last_name == 'J.Smith'),'receiver_last_name'] = 'J.Smith-Schuster'
    df.loc[(df.receiver_last_name == 'A.Seferian'),'receiver_last_name'] = 'A.Seferian-Jenkins'
    df.loc[(df.receiver_last_name == 'R.Seals'),'receiver_last_name'] = 'R.Seals-Jones'
    df.drop_duplicates(inplace=True)
    ##########################################################################
    #fifth we find the defender closest to the intended receiver in each play
    ##########################################################################
    rows = []
    #sort=False keeps the plays in order of first appearance
    for play, current_play in df.groupby('playid', sort=False):
        #sentinel values used when no closest defender can be determined
        closest_distance = 0
        closest_x = 0
        closest_y = 0
        def_name = "unknown"

        offense = current_play[current_play.position.isin(offense_positions)]
        defense = current_play[current_play.position.isin(defense_positions)]
        #offensive row whose rebuilt name equals the intended receiver
        receiver = offense[offense.receiver_last_name == offense.player_last_name]

        if not receiver.empty and not defense.empty:
            #coordinates of the receiver (item() insists on a single match)
            x1 = receiver.x.item()
            y1 = receiver.y.item()
            #distance from every defender to the receiver
            distances = (((defense.x - x1) ** 2 + (defense.y - y1) ** 2) ** (1 / 2)).dropna()
            if not distances.empty:
                #idxmin returns the first minimum, matching the strict
                #"<" comparison of a defender-by-defender scan
                nearest = distances.idxmin()
                closest_distance = distances.loc[nearest]
                closest_x = defense.loc[nearest, 'x']
                closest_y = defense.loc[nearest, 'y']
                def_name = defense.loc[nearest, 'displayName']

        rows.append({'playid': play, 'closest_dist': closest_distance, 'closest_x': closest_x,
                     'closest_y': closest_y, 'defender_receiver': def_name, 'week': week_num})
    return pd.DataFrame(rows, columns = result_columns)


################################ adding new features to original df ##########################


def combine_all_weeks_and_plays():
    '''
    Build the closest-defender features for weeks 1-17 and add them to the
    prepped plays.

    Each week is processed by combine_week_and_plays and its number is
    printed when done. The weekly results are stacked and inner-merged onto
    prep_plays_for_weeks() using a playid key made of gameId and playId
    joined as text.

    Returns
    -------
    pandas.DataFrame
        The prepped plays (without gameId / playId) joined with the columns
        returned by combine_week_and_plays.

    Notes
    -----
    The weekly frames are stacked with a single ``pd.concat`` because
    ``DataFrame.append`` was removed in pandas 2.0.
    '''
    weekly_features = []
    for week_num in range(1, 18):
        #create new features for this week
        weekly_features.append(combine_week_and_plays(week_num))
        #print week number when done
        print(week_num)
    #stack the weeks on top of each other with a fresh index
    df = pd.concat(weekly_features, ignore_index=True)
    #load prepped plays df
    plays = prep_plays_for_weeks()
    #gameId and playId joined as text form the unique identifier
    plays['playid'] = plays.gameId.astype(str) + plays.playId.astype(str)
    #drop old columns and any duplicates
    plays = plays.drop(columns = ['gameId', 'playId']).drop_duplicates()
    #merge new features with old
    total_df = pd.merge(plays, df, on = 'playid', how = 'inner')
    return total_df

def get_weeksnplays_data():

    ''' Return the combined weeks-and-plays dataframe.

    When 'final.csv' exists in the working directory it is read back and its
    saved index column ('Unnamed: 0') is removed. Otherwise the weekly files
    are filtered, the features are rebuilt with combine_all_weeks_and_plays
    and the result is saved to 'final.csv' before being returned.'''

    cache_file = 'final.csv'

    if not os.path.isfile(cache_file):
        # nothing cached yet: rebuild from the raw csv's and keep a copy
        filter_nfl_weeks()
        rebuilt = combine_all_weeks_and_plays()
        rebuilt.to_csv(cache_file)
        return rebuilt

    # cached copy found: load it and discard the index column written by to_csv
    cached = pd.read_csv(cache_file)
    cached = cached.drop(columns = ['Unnamed: 0'])
    print('Dataframe Ready For Use')
    return cached


################################ finding top defenders in NFL ##########################

def top_defenders():
    '''
    Build a dataframe of the defenders who most often stopped a pass while
    being the closest defender to the intended receiver.

    The weekly files are re-filtered and the combined dataframe is rebuilt
    on every call (filter_nfl_weeks + combine_all_weeks_and_plays).

    Returns
    -------
    pandas.DataFrame
        Columns defender, stopped_passes, total_plays and stopped_pass_perc
        (stopped_passes / total_plays rounded to 2 decimals), sorted by
        stopped_pass_perc from highest to lowest. Only defenders that are in
        both the top 10 by stopped passes and the top 100 by total plays are
        included.

    Notes
    -----
    The count tables are named with ``rename_axis`` / ``reset_index(name=)``
    because the column names produced by ``value_counts().reset_index()``
    differ between pandas 1.x and 2.x.
    '''
    # loading whole dataframe
    filter_nfl_weeks()
    df = combine_all_weeks_and_plays()
    # plays where a closest defender could be identified
    known = df[df.defender_receiver != 'unknown']
    # top 100 defenders directly involved in a pass play
    total_plays = (known.defender_receiver.value_counts().head(100)
                   .rename_axis('defender').reset_index(name='total_plays'))
    # top 10 defenders who were directly involved in stopping the pass play
    stopped = known[known.pass_stopped == 1]
    top_10 = (stopped.defender_receiver.value_counts().head(10)
              .rename_axis('defender').reset_index(name='stopped_passes'))
    # merging dataframes to find top defenders
    merged = pd.merge(top_10, total_plays, on='defender', how='inner')
    # finding percentage of passes stopped
    merged['stopped_pass_perc'] = (merged.stopped_passes / merged.total_plays).round(2)
    # sorting values in dataframe
    defenders = merged.sort_values('stopped_pass_perc', ascending = False)
    return defenders

In [None]:
def prep_nfl():
    '''
    Build the final modelling dataframe and save it as 'clean_nfl.csv'.

    The output of clean_season() is left-merged with prep_plays_data() and
    then with get_weeksnplays_data() on a uniqueId made of gameId and playId
    joined as text and cast to int. After each merge the right-hand
    duplicates of shared columns are dropped, the left-hand ones get their
    original names back, and rows with nulls are removed.

    Returns
    -------
    pandas.DataFrame
        The merged dataframe that was written to 'clean_nfl.csv'.
    '''
    # columns present on both sides of the second merge; pandas suffixes them
    # with _x (left) and _y (right). The right copy is dropped and the left
    # copy is renamed back to the bare name.
    shared_columns = ['week', 'playDescription', 'quarter', 'down', 'yardsToGo',
                      'team_by_comp_yds', 'defendersInTheBox', 'numberOfPassRushers',
                      'QB_under_pressure', 'gameClock', 'absoluteYardlineNumber', 'epa',
                      'pass_stopped', 'playResult', 'RB', 'TE', 'WR', 'DL', 'LB', 'DB',
                      'EMPTY', 'I_FORM', 'JUMBO', 'PISTOL', 'SHOTGUN', 'SINGLEBACK',
                      'WILDCAT', 'four_three', 'three_four', 'nickel', 'dime']

    df = clean_season()
    # NOTE(review): the 0.45359237 and .9144 factors look like pound->kilogram
    # and yard->metre conversions -- confirm the units of weight and a.
    df['force_per_second'] = (((df.weight * 0.45359237)/ (9.8)) * (df.a * .9144)).round(2)
    df['uniqueId'] = (df.gameId.astype(str) + df.playId.astype(str)).astype(int)

    # first merge: add the prepped play features
    df2 = prep_plays_data()
    df2['uniqueId'] = (df2.gameId.astype(str) + df2.playId.astype(str)).astype(int)
    df = pd.merge(df, df2, how='left', on='uniqueId')
    df = df.drop(columns = ['playId_y', 'gameId_y', 'pass_stopped_y'])
    df = df.rename(columns = {'gameId_x': 'gameId','playId_x': 'playId', 'pass_stopped_x': 'pass_stopped'})
    df = df.dropna()

    # second merge: add the closest-defender features
    df3 = get_weeksnplays_data()
    # playid is already gameId + playId, so it only needs casting to int
    df3['uniqueId'] = df3.playid.astype(int)
    df = pd.merge(df, df3, how='left', on='uniqueId')
    df = df.drop(columns = [name + '_y' for name in shared_columns] + ['playid'])
    df = df.rename(columns = {name + '_x': name for name in shared_columns})
    df = df.dropna()

    df.to_csv('clean_nfl.csv')
    print('Prep_NFL.py Loaded Successfully')
    return df

def get_nfl_data():

    ''' Return the cleaned NFL dataframe.

    When 'clean_nfl.csv' exists in the working directory it is read back and
    its saved index column ('Unnamed: 0') is removed; otherwise prep_nfl()
    is called to build it. A ready message is printed either way.'''

    cache_file = 'clean_nfl.csv'

    if os.path.isfile(cache_file):
        # cached copy found: load it and discard the index column written by to_csv
        nfl = pd.read_csv(cache_file).drop(columns = ['Unnamed: 0'])
    else:
        # nothing cached yet: build the dataframe from scratch
        nfl = prep_nfl()

    print('Dataframe Ready For Use')
    return nfl

# Status message emitted when this cell runs. The "Prep_NFL.py" name presumably
# refers to the module this cell was copied from -- TODO confirm.
print('Prep_NFL.py Imported Successfully')

In [None]:
########################### ALT Prep Season function ############################

def prep_season():
    '''
    Build the season-long player tracking dataframe and cache it as 'season.csv'.

    Reads players.csv and the 17 weekly tracking csv's, joins them on
    displayName, adds `age`, `height` (in inches), `week`,
    `time_since_last_x` and `force_per_second`, removes the frames whose
    `event` is in the drop list below, encodes the remaining pass outcomes
    in `event` (0 = caught, 1 = incomplete or intercepted) and removes
    frames whose route is 'undefined'.

    Returns the resulting dataframe. It still carries the `index` column
    added by reset_index(), which get_season_data() drops.
    '''
    data_dir = '../input/nfl-big-data-bowl-2021'

    # Acquire the players csv
    players = pd.read_csv(f'{data_dir}/players.csv')
    # Convert the birthdate to datetime to get rid of different date formats
    players.birthDate = pd.to_datetime(players.birthDate)
    # Age in whole years at the start of the 2018 season. Dividing by a
    # mean-year Timedelta replaces .astype('<m8[Y]'), which pandas >= 2.0
    # rejects for timedelta data.
    season_start = pd.to_datetime('09/06/2018')
    players['age'] = np.floor((season_start - players.birthDate) / pd.Timedelta(days=365.2425))

    height_pattern = re.compile(r'(\d+)-(\d+)')

    def conv_height(value):
        '''Convert a "feet-inches" string to total inches; other values pass through.'''
        match = height_pattern.search(value)
        if match is None:
            return value
        feet, inches = match.groups()
        return (int(feet) * 12) + int(inches)

    # Changing height column to equal just inches
    players['height'] = players.height.apply(conv_height)
    players['height'] = players['height'].astype(int)

    # Bringing in the week csv's, tagging each row with its week number
    weekly_frames = []
    for week in range(1, 18):
        tracking = pd.read_csv(f'{data_dir}/week{week}.csv')
        tracking['week'] = week
        weekly_frames.append(tracking)

    # NOTE(review): the join key is displayName, so two players sharing a
    # name would be cross-matched; nflId looks like the safer key - confirm.
    df = pd.merge(pd.concat(weekly_frames), players, how='inner', on='displayName')
    df = df.drop(columns=['position_y', 'nflId_y'])
    df = df.rename(columns={'position_x': 'position', 'nflId_x': 'nflId'})

    # adding columns to measure time taken to travel and force of players
    # (dis / s is inf when s is 0; clean_season() turns those into NaN)
    df['time_since_last_x'] = (df.dis / df.s).round(4)
    # Weight is converted from pounds to kg, divided by 9.8, and multiplied
    # by s / 1.094.
    # NOTE(review): the formula uses speed (s), not acceleration (a);
    # presumably 1.094 converts yards to meters - confirm the intended metric.
    df['force_per_second'] = (((df.weight * 0.45359237) / (9.8)) * (df.s / 1.094)).round(4)

    # replacing the event column with target variable: every frame tagged
    # with one of these events is removed
    dropped_events = [
        'None', 'ball_snap', 'pass_forward', 'pass_arrived', 'tackle',
        'first_contact', 'play_action', 'out_of_bounds', 'line_set',
        'man_in_motion', 'touchdown', 'pass_tipped', 'pass_outcome_touchdown',
        'fumble', 'shift', 'fumble_defense_recovered', 'handoff',
        'pass_shovel', 'penalty_flag', 'fumble_offense_recovered',
        'touchback', 'penalty_accepted', 'field_goal_blocked', 'pass_lateral',
        'lateral', 'snap_direct', 'run_pass_option', 'huddle_break_offense',
        'huddle_start_offense', 'qb_strip_sack', 'timeout_home', 'qb_sack',
        'qb_spike', 'run', 'punt_fake', 'field_goal_fake', 'safety',
        'field_goal_play',
    ]
    df = df[~df['event'].isin(dropped_events)].copy()
    # Assign back rather than calling replace(inplace=True) on the column
    # selection, which is not written to df under pandas copy-on-write.
    df['event'] = df['event'].replace({'pass_outcome_caught': 0,
                                       'pass_outcome_incomplete': 1,
                                       'pass_outcome_interception': 1})
    # keeps the pre-filter row labels in a new 'index' column
    df = df.reset_index()

    # Dropping undefined route
    df = df[df['route'] != 'undefined']
    # Write DataFrame to csv file for future use
    df.to_csv('season.csv')
    print('CSV Successfully Created')
    return df


def get_season_data():
    '''
    Return the season dataframe, reading the cached csv when one exists.

    If 'season.csv' is present in the working directory it is loaded and the
    'Unnamed: 0' and 'index' columns are dropped; otherwise prep_season() is
    called to build the data and only its 'index' column is dropped.
    '''
    if os.path.isfile('season.csv'):
        season = pd.read_csv('season.csv')
        leftover_cols = ['Unnamed: 0', 'index']
    else:
        season = prep_season()
        leftover_cols = ['index']
    season = season.drop(columns=leftover_cols)
    print('Season Data Imported Successfully')
    return season

def clean_season():
    '''
    Acquire the season data via get_season_data() and prepare it for use.

    - missing routes become 'NONE', then any row still holding a null is dropped
    - `event` is renamed to `pass_stopped`
    - `playDirection` is encoded (left = 1, right = 0)
    - `team` is replaced by `is_home` (home = 1, away = 0)
    - infinite `time_since_last_x` values become NaN (those rows are kept)
    - `is_defense` flags defensive positions (1) vs. the rest (0); positions
      missing from the mapping keep their original label

    Returns the cleaned dataframe.
    '''
    df = get_season_data()
    # Assign back instead of fillna(inplace=True) on df.route: under pandas
    # copy-on-write the in-place call would not reach df, and the dropna()
    # below would then discard every row without a route.
    df['route'] = df['route'].fillna('NONE')
    df = df.dropna()
    df = df.rename(columns={'event': 'pass_stopped'})
    # 1 is play shifted to left side of field, 0 is play shifted to right side
    df['playDirection'] = df.playDirection.replace({'left': 1, 'right': 0})
    df['is_home'] = df.team.replace({'home': 1, 'away': 0})
    df = df.drop(columns=['team'])
    # inf comes from dividing by a speed of 0 upstream. The former second
    # replace(...).dropna() assignment was a no-op (the column is realigned
    # on the index, so the NaNs come straight back) and has been removed.
    df['time_since_last_x'] = df.time_since_last_x.replace([np.inf, -np.inf], np.nan)
    df['is_defense'] = df.position.replace({'QB': 0, 'SS': 1, 'WR': 0, 'FS': 1, 'RB': 0, 'MLB': 1, 'CB': 1, 'TE': 0,
                                            'LB': 1, 'FB': 0, 'OLB': 1, 'HB': 0, 'ILB': 1, 'DL': 1, 'DB': 1,
                                            'S': 1, 'NT': 1, 'DE': 1, 'P': 0, 'LS': 0, 'K': 0, 'DT': 1})
    return df

def get_viz(df):
    '''
    Draw four bar charts for the given players: the seven ages, colleges,
    heights and weights with the largest summed `event` value.
    '''
    # (grouping column, chart title, x-axis label, rotate the x ticks?)
    chart_specs = [
        ('age', 'Age and Incompletions', 'age', False),
        ('collegeName', 'College and Incompletions', 'College', False),
        ('height', 'Height and Incompletions', 'height', True),
        ('weight', 'Weight and Incompletions', 'Weight', True),
    ]
    for column, title, x_label, rotate_ticks in chart_specs:
        totals = df.groupby(column)['event'].sum().reset_index()
        top_seven = totals.sort_values(by='event', ascending=False).head(7)
        sns.barplot(data=top_seven, x=column, y='event', palette='mako')
        plt.title(title, fontsize=13)
        plt.xlabel(x_label, fontsize=13)
        plt.ylabel('Incomplete Passes', fontsize=13)
        if rotate_ticks:
            plt.xticks(rotation=30)
        plt.show()
    
    
    
################################ finding top defenders in NFL ##########################

def top_defenders():
    '''
    This function will create a dataframe of the best defenders
    in the NFL in regards to defending the intended receiver.

    Returns one row per defender with the columns `defender`,
    `stopped_passes`, `total_plays` and `stopped_pass_perc`, sorted by
    `stopped_pass_perc` descending. Only defenders that are both in the top
    10 by stopped passes and in the top 100 by total plays are included.
    '''
    # NOTE(review): the return value of filter_nfl_weeks() was always
    # overwritten by the next call; the call is kept in case
    # combine_all_weeks_and_plays() relies on its side effects - confirm.
    filter_nfl_weeks()
    df = combine_all_weeks_and_plays()

    known = df[df.defender_receiver != 'unknown']
    # rename_axis + reset_index(name=...) gives the same column names on
    # every pandas version; renaming 'index' after reset_index() silently
    # stopped working in pandas 2.0, where value_counts names its output.
    # top 100 defenders directly involved in a pass play
    total_plays = (known.defender_receiver.value_counts().head(100)
                   .rename_axis('defender').reset_index(name='total_plays'))
    # top 10 defenders who were directly involved in stopping the pass play
    stopped = known[known.pass_stopped == 1]
    top_10 = (stopped.defender_receiver.value_counts().head(10)
              .rename_axis('defender').reset_index(name='stopped_passes'))
    # merging dataframes to find top defenders (explicit key, not inferred)
    ranked = pd.merge(top_10, total_plays, how='inner', on='defender')
    # finding percentage of passes stopped
    ranked['stopped_pass_perc'] = (ranked.stopped_passes / ranked.total_plays).round(2)
    return ranked.sort_values('stopped_pass_perc', ascending=False)



print('Prep_Season.py Loaded Successfully')

In [None]:
# Turn off warnings
import warnings
warnings.filterwarnings("ignore")

# split_scale
# import split_scale

# libraries needed for preparing the data:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
import sklearn.preprocessing
from sklearn.cluster import KMeans

def train_validate_test(df):
    '''
    Split a dataframe into train (42%), validate (28%) and test (30%)
    samples, stratified on `pass_stopped`, with a fixed random_state of 123.

    For each sample an X dataframe is built by dropping the identifier-style
    columns listed below (the `pass_stopped` column is still present in X),
    and a y dataframe holding only the `pass_stopped` column.

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test.
    '''
    dropped_columns = ['time', 'nflId', 'displayName',
                       'jerseyNumber', 'frameId', 'gameId', 'playId',
                       'route', 'week', 'birthDate']

    # split df into test (30%) and train_validate (70%)
    train_validate, test = train_test_split(
        df, test_size=.3, random_state=123, stratify=df.pass_stopped)
    # split train_validate off into train (60% of 70% = 42%) and validate (40% of 70% = 28%)
    train, validate = train_test_split(
        train_validate, test_size=.4, random_state=123,
        stratify=train_validate.pass_stopped)

    samples = (train, validate, test)
    X_train, X_validate, X_test = (sample.drop(columns=dropped_columns) for sample in samples)
    y_train, y_validate, y_test = (sample[['pass_stopped']] for sample in samples)
    return X_train, y_train, X_validate, y_validate, X_test, y_test


def min_max_scale(X_train, X_validate, X_test):
    '''
    Min-max scale three dataframes that share the same columns.

    The columns listed in `excluded_columns` are dropped from every frame,
    a MinMaxScaler is fitted on the remaining X_train columns only, and all
    three frames are transformed with that scaler.

    Returns three dataframes of scaled values that keep the column names
    and index values of the (reduced) inputs.
    '''
    excluded_columns = ['collegeName', 'position', 'pass_stopped',
                        'playDescription', 'uniqueId', 'gameClock',
                        'playResult', 'defender_receiver']
    X_train, X_validate, X_test = (
        frame.drop(columns=excluded_columns) for frame in (X_train, X_validate, X_test))

    # fit on train only, i.e. the min and max come from the training data
    scaler = MinMaxScaler(copy=True).fit(X_train)

    def rescale(frame):
        # wrap the scaled array back into a dataframe shaped like `frame`
        scaled = pd.DataFrame(scaler.transform(frame), columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    return rescale(X_train), rescale(X_validate), rescale(X_test)

def add_clusters(X_train_scaled, X_validate_scaled, X_test_scaled, X_train, X_validate, X_test,
                 random_state=None):
    '''
    Add four KMeans cluster-label columns to every split.

    For each cluster definition below a KMeans model is fitted on the scaled
    training features only and then used to label the train, validate and
    test rows. The labels are written, in place, to both the scaled frame
    and its unscaled counterpart under the same column name.

    random_state is forwarded to KMeans; the default None keeps the original
    unseeded behaviour, pass an int to make the clusters reproducible.

    Returns X_train_scaled, X_validate_scaled, X_test_scaled (the unscaled
    frames are modified in place but not returned).
    '''
    # (new column, number of clusters, scaled features used for the fit)
    cluster_specs = [
        ('pos_att_cluster', 4,
         ['height', 'weight', 'age', 'RB', 'TE', 'WR', 'DL',
          'LB', 'DB', 's', 'a', 'dis']),
        ('play_cluster', 3,
         ['x', 'y', 'dis', 'o', 'dir', 'playDirection', 'quarter', 'down',
          'yardsToGo', 'numberOfPassRushers', 'QB_under_pressure']),
        ('offvsdef_cluster', 4,
         ['RB', 'TE', 'WR', 'DL', 'LB', 'DB', 'I_FORM', 'JUMBO',
          'PISTOL', 'SHOTGUN', 'SINGLEBACK', 'WILDCAT', 'four_three', 'three_four',
          'nickel', 'dime']),
        ('def_react_cluster', 3,
         ['QB_under_pressure', 'numberOfPassRushers', 'defendersInTheBox', 'force_per_second',
          'time_since_last_x', 's', 'a']),
    ]
    splits = [(X_train_scaled, X_train),
              (X_validate_scaled, X_validate),
              (X_test_scaled, X_test)]

    for cluster_name, n_clusters, features in cluster_specs:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        kmeans.fit(X_train_scaled[features])
        for scaled, unscaled in splits:
            # predict once per split and reuse the labels for both frames
            labels = kmeans.predict(scaled[features])
            scaled[cluster_name] = labels
            unscaled[cluster_name] = labels

    return X_train_scaled, X_validate_scaled, X_test_scaled

print('Wrangle_NFL.py Loaded Successfully')

# Acquire 

- We acquired the data from kaggle.com as several .csv's but the data itself is provided by nextgenstats.nfl.com

In [None]:
nfl = pd.read_csv('../input/nfl-big-data-bowl-2021/plays.csv')

In [None]:
nfl.head()

In [None]:
nfl.shape

In [None]:
nfl.info()

In [None]:
nfl.describe().T

In [None]:
# Histogram of every int64 / float64 column in the plays data
num_cols = nfl.select_dtypes(include=['int64', 'float64']).columns
for numeric_col in num_cols:
    plt.hist(nfl[numeric_col])
    plt.title(numeric_col)
    plt.show()

**Takeaways:**
- There are some null values listed that will need some investigation
- More plays are run in the second and fourth quarters
- Fewer plays are run as the down increases
- Yards to go is skewed right(makes sense)
    - Less likely to lose yards than gain
- Most plays begin between home 20 and away 20
    - Hard to pin your opponent inside 20 for kickoff or punt
- Defenders in the box is a normal distribution
- Number of pass rushers is a normal distribution
- Scores are skewed right
- Play result is skewed right slightly
- epa is fairly normal distribution

# Prepare

- Create a function that will acquire the plays.csv and prepare it for exploration(prep_plays.py)
- Keep only the useful columns that can help us determine the success of a defense(whether a pass was completed or not)
    - `playDescription`, `quarter`, `down`, `yardsToGo`, `possessionTeam`, `offenseFormation`, `personnelO`, `defendersInTheBox`, `numberOfPassRushers`, `personnelD`, `typeDropback`, `gameClock`, `absoluteYardlineNumber`, `epa`, `playType`, `passResult`, `playResult`
- Create a new column called `pass_stopped` 
    - Will change completion into 0
    - Will change incomplete and interception into 1
- Filter out data that is not a pass play(no fake punts, fake field goals, etc)
- Create new columns that extract positions from offensive personnel
    - RB, TE, WR
- Create new columns that extract positions from defensive personnel
    - DL, LB, DB
- Rename `typeDropback` to `QB_under_pressure` and change values into normal or scramble
- Rename `passResult` into `pass_stopped`
- Create formations out of personnel on the field
- Create `closest_dist`, `closest_x`, `closest_y`, and `defender_receiver` from player tracking data
- Merge all dataframes together
- Convert height and age to be uniform
- Create `time_since_last_x` and `force_per_second` 

# Explore

In [None]:
train, validate, test = explore_plays_data()

In [None]:
train.T

In [None]:
alpha = .05

## Does the offense formation matter? i.e. (is a certain offensive formation harder to defend?)

- $H_0$: There is no dependence between offensive formation and pass stopped
- $H_a$: There is a dependence between offensive formation and pass stopped

In [None]:
observed = pd.crosstab(train.offenseFormation, train.pass_stopped)

In [None]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
# Map each formation code to its display label and pass the same order to
# catplot, so the tick labels cannot drift from the bars they describe
# (catplot otherwise orders categories by how they appear in the data).
# NOTE(review): assumes offenseFormation holds these upper-case codes - confirm.
formation_labels = {'SHOTGUN': 'Shotgun', 'EMPTY': 'Empty', 'SINGLEBACK': 'Singleback',
                    'I_FORM': 'I Formation', 'PISTOL': 'Pistol', 'JUMBO': 'Jumbo',
                    'WILDCAT': 'Wildcat'}
sns.catplot(x="offenseFormation", hue="pass_stopped", kind="count", data=train,
            order=list(formation_labels), height=8, aspect=2)._legend.remove()
plt.title('Do certain offensive formations have more passes stopped than others?', size = 30)
plt.xlabel('Offensive Formation', size = 16)
plt.ylabel('Count', size = 20)
plt.legend(labels = ('Pass Completed', 'Pass Stopped'), loc='center right', frameon=False, fontsize='x-large')
plt.xticks(range(len(formation_labels)), list(formation_labels.values()), size = 20)
plt.show()

**Takeaways:**
- There does not seem to be a certain formation that will have its passes stopped more than others
- After a statistical test, we can safely say that there is no dependence between stopping the play and the formation the offense is lined up in.

In [None]:
train.groupby('offenseFormation').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('offenseFormation').pass_stopped.count()

## Are passes stopped dependent on Down?

- $H_0$: There is no dependence between down and pass stopped
- $H_a$: There is a dependence between down and pass stopped

In [None]:
observed = pd.crosstab(train.down, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='down', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Down')
plt.ylabel('Pass Stopped %')
plt.title("Are Passes Stopped dependent on Down?")
plt.show()

**Takeaway:**
- There is a dependence between a pass being stopped and what down it is.
- More passes are stopped on 3rd down, with 4th down right behind it


In [None]:
train.groupby('down').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('down').pass_stopped.count()

## Are EPA values dramatically different for passes stopped vs. passes completed?

- $H_0$: The EPA value is the same for passes completed and passes stopped
- $H_a$: The EPA value is different for passes completed and passes stopped

In [None]:
pass_completed = train[train.pass_stopped == 0]
pass_not_completed = train[train.pass_stopped == 1]

t, p = stats.ttest_ind(pass_completed.epa, pass_not_completed.epa)

In [None]:
if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
plt.rc("figure", figsize=(10, 6))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally for violinplot.
sns.violinplot(x=train.pass_stopped, y=train.epa)
plt.xlabel('')
plt.xticks([0,1], ['Pass Completed', 'Pass Stopped'])
plt.yticks(size = 24)
plt.ylabel('EPA')
plt.title("Are Passes Stopped dependent on EPA?")
plt.show()

In [None]:
print(f"The EPA mean for passes completed is {pass_completed.epa.mean()}.")
print(f"The EPA minimum for passes completed is {pass_completed.epa.min()}.")
print(f"The EPA max for passes completed is {pass_completed.epa.max()}.")

In [None]:
print(f"The EPA mean for passes stopped is {pass_not_completed.epa.mean()}.")
print(f"The EPA minimum for passes stopped is {pass_not_completed.epa.min()}.")
print(f"The EPA max for passes stopped is {pass_not_completed.epa.max()}.")

**Takeaways:**
- On average the EPA is negative for passes stopped and the EPA is positive for passes completed
- The pass is usually stopped when the EPA is negative but not always.
- If the EPA is above 2.5 then it almost guarantees that the pass will be completed

## Are passes stopped dependent on QB pressure?

- $H_0$: There is no dependence between QB pressure and pass stopped
- $H_a$: There is a dependence between QB pressure and pass stopped

In [None]:
observed = pd.crosstab(train.QB_under_pressure, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='QB_under_pressure', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.xticks([0,1], ['No Pressure', 'Pressure Applied'])
plt.title("Are Passes Stopped dependent on Pressure Applied to QB?")
plt.show()

In [None]:
train.groupby('QB_under_pressure').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('QB_under_pressure').pass_stopped.count()

## Are passes stopped dependent on how many Defenders are in the Box?

- $H_0$: There is no dependence between defenders in the box and pass stopped
- $H_a$: There is a dependence between defenders in the box and pass stopped

In [None]:
observed = pd.crosstab(train.defendersInTheBox, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
# Label the bars with the defender counts actually present in train. A
# hard-coded 1-10 mislabels bars whenever a count is missing from the data
# or falls outside that range.
box_counts = sorted(train.defendersInTheBox.dropna().unique())
sns.barplot(data=train, x='defendersInTheBox', y='pass_stopped', order=box_counts).set(ylim=(0, .55))
plt.xlabel('Defenders in the Box')
plt.ylabel('Pass Stopped %')
plt.xticks(range(len(box_counts)), [int(count) for count in box_counts])
plt.title("Are Passes Stopped dependent on the number of Defenders in the Box?")
plt.show()

In [None]:
train.groupby('defendersInTheBox').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('defendersInTheBox').pass_stopped.count()

## Are passes stopped dependent on how many DL?

- $H_0$: There is no dependence between DL and pass stopped
- $H_a$: There is a dependence between DL and pass stopped

In [None]:
observed = pd.crosstab(train.DL, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='DL', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of DL')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped dependent on DL count?")
plt.show()

In [None]:
train.groupby('DL').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DL').pass_stopped.count()

## Are passes stopped dependent on how many LB?

- $H_0$: There is no dependence between LB and pass stopped
- $H_a$: There is a dependence between LB and pass stopped

In [None]:
observed = pd.crosstab(train.LB, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='LB', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of LB')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped dependent on LB count?")
plt.show()

## Are passes stopped dependent on how many DB?

- $H_0$: There is no dependence between DB and pass stopped
- $H_a$: There is a dependence between DB and pass stopped

In [None]:
observed = pd.crosstab(train.DB, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='DB', y='pass_stopped').set(ylim=(0, .55))
plt.xlabel('Number of DB')
plt.ylabel('Pass Stopped %')
# plt.xticks([0,1,2,3,4,5,6,7,8,9], [1,2,3,4,5,6,7,8,9,10])
plt.title("Are Passes Stopped Dependent on DB count?")
plt.show()

In [None]:
train.groupby('DB').pass_stopped.count()

In [None]:
train[train.pass_stopped ==1].groupby('DB').pass_stopped.count()

## Are passes stopped dependent on the defensive formation (Nickel)?

- $H_0$: There is no dependence between Nickel formation and pass stopped
- $H_a$: There is a dependence between Nickel formation and pass stopped

In [None]:
observed = pd.crosstab(train.nickel, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='nickel', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the Nickle Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', 'Nickle Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (Dime)?


- $H_0$: There is no dependence between Dime formation and pass stopped
- $H_a$: There is a dependence between Dime formation and pass stopped

In [None]:
observed = pd.crosstab(train.dime, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='dime', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the Dime Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', 'Dime Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (4-3)?

- $H_0$: There is no dependence between 4-3 formation and pass stopped
- $H_a$: There is a dependence between 4-3 formation and pass stopped

In [None]:
observed = pd.crosstab(train.four_three, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='four_three', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the 4-3 Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', '4-3 Formation'])
plt.show()

## Are passes stopped dependent on the defensive formation (3-4)?

- $H_0$: There is no dependence between 3-4 formation and pass stopped
- $H_a$: There is a dependence between 3-4 formation and pass stopped

In [None]:
observed = pd.crosstab(train.three_four, train.pass_stopped)

chi2, p, degf, expected = stats.chi2_contingency(observed)

if p < alpha:
    print("We reject the null hypothesis")
else:
    print("We fail to reject the null hypothesis")
p

In [None]:
sns.barplot(data=train,x='three_four', y='pass_stopped').set(ylim=(0, .40))
plt.xlabel('')
plt.ylabel('Pass Stopped %')
plt.title("Is the 3-4 Formation better at stopping the pass than other formations?")
plt.xticks([0,1], ['Other Formation', '3-4 Formation'])
plt.show()

## Incomplete Passes by Position

In [None]:
df = prep_season()
defensedf = df[df["position"].isin(["CB", "OLB", "SS","FS","ILB","DE","LB","MLB","S","DT","DL","DB"])]

In [None]:
plt.rcParams['figure.figsize']=(13,7)
# Sum the event column per defensive position, largest total first
posdf = defensedf.groupby('position')['event'].sum().reset_index()
posdf = posdf.sort_values(by=['event'], ascending =False)
sns.set_style("darkgrid")
# The stray sns.color_palette('Blues') call was removed: its return value
# was discarded, so it never affected the chart.
sns.barplot(data=posdf, x='position', y= 'event', palette='mako')
plt.title('Position and Incompletions', fontsize=13)
plt.xlabel('Defensive Position',fontsize=13)
plt.ylabel('Incomplete Passes',fontsize=13)
posdf

In [None]:
# One frame per position that gets its own profile section below
position = defensedf['position']
cbdf = defensedf[position == 'CB']
olbdf = defensedf[position == 'OLB']
ssdf = defensedf[position == 'SS']
fsdf = defensedf[position == 'FS']
ilbdf = defensedf[position == 'ILB']

### Cornerback

In [None]:
# Visualise the cornerback subset. get_viz is defined elsewhere in the notebook;
# presumably it draws the age/college/height/weight breakdowns summarised in the
# takeaway below — TODO confirm.
get_viz(cbdf)

**Takeaway:** 

**Ages:**
-    23, 25, 27, 26, 28

**Colleges:**
-    Ohio State, Florida State, LSU, Alabama, Florida

**Height:**
-    71", 72", 73", 70", 69"

**Weight:**
-    190lbs, 196lbs, 195lbs, 192lbs, 185lbs

### Outside Linebacker

In [None]:
# Visualise the outside-linebacker subset. get_viz is defined elsewhere in the
# notebook; presumably it draws the age/college/height/weight breakdowns
# summarised in the takeaway below — TODO confirm.
get_viz(olbdf)

**Takeaway:**

**Ages:**
- 25, 27, 23, 28, 26

**College:**
- Georgia, Florida State, Southern California, Kentucky

**Height:**
- 75", 73", 76", 74", 77"

**Weight:**
- 250lbs, 255lbs, 265lbs, 240lbs, 235lbs

### Strong Safety

In [None]:
# Visualise the strong-safety subset. get_viz is defined elsewhere in the
# notebook; presumably it draws the age/college/height/weight breakdowns
# summarised in the takeaway below — TODO confirm.
get_viz(ssdf)

**Takeaway:**

**Ages:**
- 27, 24, 26, 30, 25

**College:**
- Ohio State, Boston College, LSU, Georgia, Texas

**Height:**
- 72", 71", 73", 74", 70"

**Weight:**
- 215lbs, 210lbs, 202lbs, 195lbs, 212lbs

### Free Safety

In [None]:
# Visualise the free-safety subset. get_viz is defined elsewhere in the
# notebook; presumably it draws the age/college/height/weight breakdowns
# summarised in the takeaway below — TODO confirm.
get_viz(fsdf)

**Takeaway:**

**Ages:**
- 27, 25, 26, 22, 24

**College:**
- Utah, Rutgers, Alabama, South Carolina, Ohio State

**Height:**
- 73", 71", 72", 70", 74"

**Weight:**
- 205lbs, 195lbs, 212lbs, 202lbs, 14lbs

### Inside Linebacker

In [None]:
# Visualise the inside-linebacker subset. get_viz is defined elsewhere in the
# notebook; presumably it draws the age/college/height/weight breakdowns
# summarised in the takeaway below — TODO confirm.
get_viz(ilbdf)

**Takeaway:**

**Ages:**
- 23, 28, 26, 24, 29

**College:**
- Kentucky, Alabama, Washington, Florida State, Stanford

**Height:**
- 73", 72", 74", 75", 76"

**Weight:**
- 250lbs, 232lbs, 230lbs, 245lbs, 235lbs

## Who are the top defenders?

### Most Stopped Passes

In [None]:
# top_defenders is defined elsewhere in the notebook and is called with no
# arguments; presumably it reports the defenders with the most stopped passes,
# per the section heading — TODO confirm.
top_defenders()

## Defenders with the Least Amount of Separation

# Model

## Prep Data for Modeling

In [None]:
#load data for modeling
# NOTE: this rebinds `df`, which held the prep_season() output during the
# position analysis above. prep_nfl is defined elsewhere in the notebook.
df = prep_nfl()

In [None]:
#prepare data for modeling (all three helpers are defined elsewhere in the notebook)

# feature/target sets for the train, validate and test splits
X_train, y_train, X_validate, y_validate, X_test, y_test = train_validate_test(df)

# scaled versions of the three feature sets
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(
    X_train, X_validate, X_test
)

# add_clusters receives both the scaled and the unscaled feature sets and
# returns the updated scaled sets
X_train_scaled, X_validate_scaled, X_test_scaled = add_clusters(
    X_train_scaled, X_validate_scaled, X_test_scaled,
    X_train, X_validate, X_test,
)

In [None]:
# Columns fed to every model; the identical selection is applied to each split
model_features = ['epa', 'time_since_last_x', 'x', 'a', 'yardsToGo', 'down',
                  'absoluteYardlineNumber', 's', 'y', 'force_per_second',
                  'QB_under_pressure', 'closest_dist', 'closest_x', 'closest_y']

X_train_scaled = X_train_scaled[model_features]
X_validate_scaled = X_validate_scaled[model_features]
X_test_scaled = X_test_scaled[model_features]

## Train

### Gradient Boost

In [None]:
# Hyper-parameter grid for the Gradient Boost grid search.
# Learning rates from .0001 up to 100 were tried in earlier runs and 1 was the
# best parameter, so the grid is pinned to that single value.
boost_params = {'learning_rate': [1]}

In [None]:
#5-fold cross-validated grid search over boost_params, fitted on the training split
search = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=boost_params,
    cv=5,
)
search.fit(X_train_scaled, y_train)

In [None]:
#predicting target variable on the TRAIN set with the fitted grid search
y_pred = search.predict(X_train_scaled)

In [None]:
# score of the fitted Gradient Boost search on the training split
gb_train_accuracy = search.score(X_train_scaled, y_train)
print('>>>>>>>>>> Accuracy of Gradient Boost on TRAIN set: {:.4f}'.format(gb_train_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_pred))
cm

In [None]:
#classification report for the TRAIN predictions, as a DataFrame
class_report = pd.DataFrame(
    data=classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
)
class_report

### KNN

In [None]:
#5-nearest-neighbour classifier with uniform vote weights, fitted on the training split
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
)
knn.fit(X_train_scaled, y_train)

In [None]:
#predicting target variable on the TRAIN set with KNN
# (overwrites the previous model's `y_pred`)
y_pred = knn.predict(X_train_scaled)

In [None]:
# accuracy of the fitted KNN model on the training split
knn_train_accuracy = knn.score(X_train_scaled, y_train)
print('>>>>>>>>>> Accuracy of KNN classifier on TRAIN set: {:.2f}'.format(knn_train_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_pred))
cm

In [None]:
#classification report for the TRAIN predictions, as a DataFrame
report = pd.DataFrame(
    data=classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
)
report

### Logistic Regression

In [None]:
#logistic regression, fitted on the training split
#class_weight weights class 1 ninety-nine times as heavily as class 0
logit = LogisticRegression(
    C=1,
    class_weight={0: 1, 1: 99},
    random_state=123,
    intercept_scaling=1,
    solver='lbfgs',
)
logit.fit(X_train_scaled, y_train)

In [None]:
#predicting target variable on the TRAIN set with logistic regression
y_pred = logit.predict(X_train_scaled)

# accuracy of the fitted logistic regression on the training split
logit_train_accuracy = logit.score(X_train_scaled, y_train)
print('>>>>>>>>>> Accuracy of Logistic Regression classifier on TRAIN set: {:.2f}'.format(logit_train_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_pred))
cm

In [None]:
#classification report for the TRAIN predictions, as a DataFrame
report = pd.DataFrame(
    data=classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
)
report

### Random Forest

In [None]:
#random forest settings collected in one place, then fitted on the training split
rf_settings = {
    'bootstrap': True,
    'class_weight': None,
    'criterion': 'gini',
    'min_samples_leaf': 8,
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 123,
}
rf = RandomForestClassifier(**rf_settings)

rf.fit(X_train_scaled, y_train)

In [None]:
#predicting target variable on the TRAIN set with the random forest
# (overwrites the previous model's `y_pred`)
y_pred = rf.predict(X_train_scaled)

In [None]:
# accuracy of the fitted random forest on the training split
rf_train_accuracy = rf.score(X_train_scaled, y_train)
print('>>>>>>>>>> Accuracy of Random Forest classifier on TRAIN set: {:.2f}'.format(rf_train_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_train, y_pred=y_pred))
cm

In [None]:
#classification report for the TRAIN predictions, as a DataFrame
class_report = pd.DataFrame(
    data=classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
)
class_report

## Validate

### Gradient Boost

In [None]:
#predicting target variable on the VALIDATE set with the fitted Gradient Boost search
y_pred_val = search.predict(X_validate_scaled)

In [None]:
# score of the fitted Gradient Boost search on the validate split
gb_validate_accuracy = search.score(X_validate_scaled, y_validate)
print('>>>>>>>>>> Accuracy of Gradient Boost on VALIDATE set: {:.4f}'.format(gb_validate_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
# The true labels must come first; passing the predictions first transposes the
# matrix relative to the other confusion matrices in this notebook.
cm = pd.DataFrame(confusion_matrix(y_validate, y_pred_val))
cm

In [None]:
#classification report for the VALIDATE predictions, as a DataFrame
class_report = pd.DataFrame(
    data=classification_report(y_true=y_validate, y_pred=y_pred_val, output_dict=True)
)
class_report

### KNN

In [None]:
#predicting target variable on the VALIDATE set with KNN
# (reuses the name `y_pred`, replacing the train-set predictions)
y_pred = knn.predict(X_validate_scaled)

In [None]:
# accuracy of the fitted KNN model on the validate split
knn_validate_accuracy = knn.score(X_validate_scaled, y_validate)
print('>>>>>>>>>> Accuracy of KNN classifier on VALIDATE set: {:.2f}'.format(knn_validate_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_validate, y_pred=y_pred))
cm

In [None]:
#classification report for the VALIDATE predictions, as a DataFrame
report = pd.DataFrame(
    data=classification_report(y_true=y_validate, y_pred=y_pred, output_dict=True)
)
report

### Random Forest

In [None]:
#predicting target variable on the VALIDATE set with the random forest
# (reuses the name `y_pred`, replacing the KNN predictions)
y_pred = rf.predict(X_validate_scaled)

In [None]:
# accuracy of the fitted random forest on the validate split
rf_validate_accuracy = rf.score(X_validate_scaled, y_validate)
print('>>>>>>>>>> Accuracy of Random Forest on VALIDATE set: {:.2f}'.format(rf_validate_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
cm = pd.DataFrame(data=confusion_matrix(y_true=y_validate, y_pred=y_pred))
cm

In [None]:
#classification report for the random forest VALIDATE predictions
report = pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))
# display the report, as every other report cell in this notebook does
report

## Test

### Gradient Boost

In [None]:
#predicting target variable on the TEST set with the fitted Gradient Boost search
# NOTE(review): the result is stored in `y_pred_val`, the same name used for the
# validate-set predictions, so those are overwritten from this point on.
y_pred_val = search.predict(X_test_scaled)

In [None]:
# score of the fitted Gradient Boost search on the held-out test split
gb_test_accuracy = search.score(X_test_scaled, y_test)
print('>>>>>>>>>> Accuracy of Gradient Boost on TEST set: {:.4f}'.format(gb_test_accuracy))

In [None]:
#confusion matrix (rows = actual class, columns = predicted class)
# The true labels must come first; passing the predictions first transposes the
# matrix relative to the other confusion matrices in this notebook.
cm = pd.DataFrame(confusion_matrix(y_test, y_pred_val))
cm

In [None]:
#classification report for the TEST predictions, as a DataFrame
class_report = pd.DataFrame(
    data=classification_report(y_true=y_test, y_pred=y_pred_val, output_dict=True)
)
class_report

### Top Features for Gradient Boost

In [None]:
#pair each model feature with its importance in the best Gradient Boost estimator
# The feature names get an explicit 'feature' column label instead of the
# default integer label 0.
top_feature = pd.DataFrame({
    'feature': list(X_train_scaled.columns),
    'values': search.best_estimator_.feature_importances_,
})
#ranking features from most to least important (showing at most 15 rows)
top_feature.sort_values('values', ascending=False).head(15)

# Conclusions

- Our Gradient Boost Model was 96% accurate at predicting a pass being stopped.
- EPA & closest_dist turned out to be significant features in our model.
    - EPA was provided by Kaggle
    - closest_dist was an engineered feature
- Success in defending the pass truly depends on the defenders' ability to prevent separation from the receiver and on their reaction time.
- When pressure is applied to the quarterback, the completion percentage significantly decreases.
- Dime formation (6 defensive backs) had the best success in stopping the pass.

# Next Steps

- work out mislabels and small bugs for closest_dist feature
- use a similar algorithm to find the distance of all cornerbacks to their defensive assignments, e.g. WR, RB, TE
- further analyze the components of EPA to understand their influence on the model
- explore trick plays to see if the same features carry over from the traditional offensive setup