In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Load the three play-level datasets produced by the "BDB25 - Build Dataset" notebook
motion2, motion2_control, motion2_control2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nfl-bdb25-build-dataset/motion2.csv",
        "/kaggle/input/nfl-bdb25-build-dataset/motion2_control.csv",
        "/kaggle/input/nfl-bdb25-build-dataset/motion2_control2.csv",
    )
)

# Load the competition tables
games, players, player_play = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nfl-big-data-bowl-2025/games.csv",
        "/kaggle/input/nfl-big-data-bowl-2025/players.csv",
        "/kaggle/input/nfl-big-data-bowl-2025/player_play.csv",
    )
)

# Attach game-level information to every play.
# motion2 names its game key 'game_id'; the two control frames already use 'gameId'.
motion2 = motion2.merge(games, left_on='game_id', right_on='gameId')
motion2_control = motion2_control.merge(games, on="gameId")
motion2_control2 = motion2_control2.merge(games, on="gameId")

In [3]:
# Build a temporary per-play dataframe that will be joined onto the main dataframes.
# For every (gameId, playId) it holds the list of routes run on the play, the list of
# passDefensed values, and one 0/1 indicator column per route type.

ROUTE_NAMES = ['GO', 'HITCH', 'FLAT', 'OUT', 'CROSS', 'IN', 'POST', 'SLANT', 'CORNER', 'SCREEN', 'ANGLE', 'WHEEL']

# Collapse the player-level rows into one row per play, keeping each column as a list
tmp = (
    player_play[['gameId', 'playId', 'passDefensed', 'routeRan']]
    .groupby(['gameId', 'playId'])[['passDefensed', 'routeRan']]
    .agg(lambda x: list(x))
    .reset_index()
)

# A route flag is 1 when at least one player on the play ran exactly that route.
# `in` on a list compares whole elements, so "IN" does not match e.g. "ANGLE".
for route_name in ROUTE_NAMES:
    tmp[route_name] = [int(route_name in routes_on_play) for routes_on_play in tmp['routeRan']]

In [4]:
# Join the per-play route indicators onto each main dataframe.
# The raw list of routes is no longer needed once the indicator columns exist.
tmp = tmp.drop(columns=['routeRan'])

# motion2 keys its games by 'game_id'; the control frames share tmp's key names.
motion2 = motion2.merge(tmp, left_on=['game_id', 'playId'], right_on=['gameId', 'playId'])
motion2_control = motion2_control.merge(tmp, on=["gameId", "playId"])
motion2_control2 = motion2_control2.merge(tmp, on=["gameId", "playId"])

In [5]:
# Add an "ispass" column: 1 when passResult is one of the codes below, otherwise 0
# (a missing passResult also yields 0).

vals = ["C","I","S","R","IN"]
for plays in (motion2, motion2_control, motion2_control2):
    plays['ispass'] = plays['passResult'].isin(vals).astype('int64')

In [6]:
# This function turns continuous variables such as passLength into categorical ones by
# binning, so that they can be analyzed using a Chi-Squared test.

def _bin_numeric(values, edges, labels, missing_label):
    """Label each value with the half-open bin [edge_k, edge_k+1) it falls into.

    Parameters
    ----------
    values : array-like
        Values to bin; anything that cannot be read as a number is treated as missing.
    edges : list of float
        Increasing bin edges; must contain one more entry than ``labels``.
    labels : list of str
        Label for each bin, in edge order.
    missing_label : str
        Label used for missing values (and for values outside every bin).

    Returns
    -------
    numpy.ndarray
        Object array of labels, one per input value, in input order.
    """
    numeric = pd.to_numeric(pd.Series(values), errors="coerce")
    # right=False makes every bin closed on the left: lower <= value < upper
    binned = pd.cut(numeric, bins=edges, labels=labels, right=False)
    # Return a plain array so the caller assigns by position, not by index label
    return binned.astype(object).where(binned.notna(), missing_label).to_numpy()


def bin_cont_cols(data):
    """Add binned (categorical) versions of the continuous play columns to ``data``.

    The dataframe is modified in place and also returned. Columns read:
    passLength, yardsGained, penaltyYards, absoluteYardlineNumber, homeTeamAbbr,
    possessionTeam, homeTeamWinProbabilityAdded, visitorTeamWinProbilityAdded,
    timeToThrow, dropbackDistance, gameClock and gameDate.

    Columns written: gameDate (converted to datetime), gameMonth, gameDay_of_week,
    yardsGained_bins, passLen_bins, penaltyYards_bins, endzoneDist_bins,
    offenseWinProbAdded_bins, defenseWinProbAdded_bins, timeToThrow_bins,
    dropBackDist_bins and minutesRemaininginQuarter.

    Missing values get the label "No Pass" (passLength, timeToThrow,
    dropbackDistance) or "N/A" (all other binned columns). A value of exactly 0 is
    a real observation and is binned like any other number.

    Parameters
    ----------
    data : pandas.DataFrame
        Play-level dataframe containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, with the new columns added.
    """
    inf = np.inf
    wpa_edges = [-inf, -0.5, -0.25, 0, 0.25, 0.5, inf]
    wpa_labels = ["< -0.50", "-0.5 to -0.25", "-0.25 to 0", "0 to 0.25", "0.25 to 0.50", "> 0.50"]

    passLength_bins = _bin_numeric(
        data['passLength'],
        [-inf, 0, 10, 20, 30, 40, inf],
        ["Behind_LOS", "0-10 yards", "10-20 yards", "20-30 yards", "30-40 yards", "> 40 yards"],
        "No Pass",
    )

    yardsGained_bins = _bin_numeric(
        data['yardsGained'],
        [-inf, 0, 5, 10, 15, 20, 25, 30, inf],
        ["< 0 yards", "0-5 yards", "5-10 yards", "10-15 yards", "15-20 yards",
         "20-25 yards", "25-30 yards", "> 30 yards"],
        "N/A",
    )

    penaltyYards_bins = _bin_numeric(
        data['penaltyYards'],
        [-inf, -5, 0, 5, 10, 15, 20, inf],
        ["< -5 yards", "0 to -5 yards", "0 to 5 yards", "5 to 10 yards",
         "10 to 15 yards", "15 to 20 yards", "> 20 yards"],
        "N/A",
    )

    endzoneDist_bins = _bin_numeric(
        data['absoluteYardlineNumber'],
        [-inf, 10, 20, 30, 40, 50, 60, 70, 80, 90, inf],
        ["< 10 yards", "10-20 yards", "20-30 yards", "30-40 yards", "40-50 yards",
         "50-60 yards", "60-70 yards", "70-80 yards", "80-90 yards", "> 90 yards"],
        "N/A",
    )

    # Re-express the home/visitor win probability added from the point of view of the
    # offense (the possession team) and of the defense.
    home_has_ball = (data['possessionTeam'] == data['homeTeamAbbr']).to_numpy()
    home_wpa = data['homeTeamWinProbabilityAdded'].to_numpy()
    visitor_wpa = data['visitorTeamWinProbilityAdded'].to_numpy()  # column name is spelled this way
    offenseWinProbAdded = np.where(home_has_ball, home_wpa, visitor_wpa)
    defenseWinProbAdded = np.where(home_has_ball, visitor_wpa, home_wpa)

    offenseWinProbAdded_bins = _bin_numeric(offenseWinProbAdded, wpa_edges, wpa_labels, "N/A")
    defenseWinProbAdded_bins = _bin_numeric(defenseWinProbAdded, wpa_edges, wpa_labels, "N/A")

    timeToThrow_bins = _bin_numeric(
        data['timeToThrow'],
        [-inf, 2, 5, 10, inf],
        ["< 2 seconds", "2-5 seconds", "5-10 seconds", "> 10 seconds"],
        "No Pass",
    )

    # Negative distances get their own label so that every row receives a value
    dropbackDist_bins = _bin_numeric(
        data['dropbackDistance'],
        [-inf, 0, 3, 5, 8, inf],
        ["< 0 yards", "0-3 yards", "3-5 yards", "5-8 yards", "> 8 yards"],
        "No Pass",
    )

    # gameClock is a "MM:SS" string; only the whole minutes are used
    clock_minutes = data['gameClock'].astype(str).str.split(":").str[0]
    minutes_remaining = _bin_numeric(
        clock_minutes,
        [-inf, 2, 4, 6, 8, 10, 12, inf],
        ["< 2 minutes", "2-4 minutes", "4-6 minutes", "6-8 minutes",
         "8-10 minutes", "10-12 minutes", "> 12 minutes"],
        "N/A",
    )

    data['gameDate'] = pd.to_datetime(data['gameDate'])
    data['gameMonth'] = data['gameDate'].dt.month
    data['gameDay_of_week'] = data['gameDate'].dt.day_of_week
    data['yardsGained_bins'] = yardsGained_bins
    data['passLen_bins'] = passLength_bins
    data['penaltyYards_bins'] = penaltyYards_bins
    data['endzoneDist_bins'] = endzoneDist_bins
    data['offenseWinProbAdded_bins'] = offenseWinProbAdded_bins
    data['defenseWinProbAdded_bins'] = defenseWinProbAdded_bins
    data['timeToThrow_bins'] = timeToThrow_bins
    data['dropBackDist_bins'] = dropbackDist_bins
    data['minutesRemaininginQuarter'] = minutes_remaining

    return data

In [7]:
# Add the binned columns to each of the three dataframes

motion2, motion2_control, motion2_control2 = [
    bin_cont_cols(plays) for plays in (motion2, motion2_control, motion2_control2)
]

In [8]:
# Main routine for the Chi-Squared analysis.
# It receives the plays that had a TTT shift, a control dataset of plays that did not,
# and a list of feature columns (a mix of situations and outcomes).

# The routine recurses: each feature value that is frequent enough narrows both datasets,
# and the remaining features are then analysed inside that subset. This surfaces
# significant differences in how often outcomes occur under different situations.

sigs = defaultdict(list)
cache = []

def find_sigs(data, control, analysis_columns, previous):
    """Record feature values whose frequency differs significantly between data and control.

    For each column in ``analysis_columns`` and each of its values, a 2x2 table
    (rows with / without the value, in ``data`` vs. ``control``) is tested with
    ``chi2_contingency``. Results with p < 0.05 are appended to the module-level
    ``sigs`` dict. Whenever all four cells hold at least 20 rows, the function
    recurses into the matching subsets with the remaining columns.

    The module-level ``cache`` list stores every combination already visited,
    with its terms sorted, so the same set of conditions reached in a different
    order is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Test plays.
    control : pandas.DataFrame
        Control plays.
    analysis_columns : list of str
        Columns still to be examined at this level.
    previous : str
        "|"-joined description of the conditions applied so far ("" at the top level).

    Returns
    -------
    None
        Output is written to the module-level ``sigs`` and ``cache``.
    """
    route_columns = ['GO', 'HITCH', 'FLAT', 'OUT', 'CROSS','IN', 'POST', 'SLANT', 'CORNER', 'SCREEN', 'ANGLE', 'WHEEL']
    min_rows = 20

    for column in analysis_columns:
        # Route indicators are only analysed for "route was run" (1)
        if column in route_columns:
            candidate_values = [1]
        else:
            candidate_values = data[column].unique()

        if column not in list(control.columns):
            continue

        values_in_control = control[column].unique()

        for value in candidate_values:
            term = column + "(" + str(value) + ")"
            levels = term if previous == "" else previous + "|" + term

            # Order-independent key for this combination of conditions
            combo_key = "|".join(sorted(levels.split("|")))
            if (value not in values_in_control) or (combo_key in cache):
                continue
            cache.append(combo_key)

            data_with = data[data[column] == value].reset_index(drop=True)
            control_with = control[control[column] == value].reset_index(drop=True)

            n_data_with = data_with.shape[0]
            n_control_with = control_with.shape[0]
            n_data_without = data[data[column] != value].shape[0]
            n_control_without = control[control[column] != value].shape[0]

            # Every cell of the table needs enough rows; otherwise skip test and recursion
            if min(n_data_with, n_control_with, n_data_without, n_control_without) < min_rows:
                continue

            # Rows: test plays, control plays. Columns: with the value, without it.
            observed = [
                [n_data_with, n_data_without],
                [n_control_with, n_control_without],
            ]
            chi2_stat, p_val, dof, expected = chi2_contingency(observed)

            if p_val < 0.05:
                if n_data_with > expected[0][0]:
                    result = "More often than expected"
                else:
                    result = "Less often than expected"

                control_prob = (n_control_with / (n_control_with + n_control_without)) * 100
                test_prob = (n_data_with / (n_data_with + n_data_without)) * 100

                sigs["Analysis"].append(levels)
                sigs["Significance"].append(p_val)
                sigs["Result"].append(result)
                sigs["P(feature)"].append(column)
                sigs["P(val)"].append(value)
                sigs["Control Probability"].append(np.round(control_prob, 2))
                sigs["Test Probability"].append(np.round(test_prob,2))

            # Drill down: keep this condition and analyse the remaining columns
            remaining = analysis_columns.copy()
            remaining.remove(column)
            find_sigs(data_with, control_with, remaining, levels)

    return

In [9]:
# Run find_sigs with all TTT plays as the test dataset and all non-TTT plays
# as the control dataset.

# find_sigs writes into these module-level containers, so start from empty ones
sigs, cache = defaultdict(list), []

analysis_cols = [
    "offenseFormation", "pff_passCoverage", "down", "passLen_bins", "yardsGained_bins", "passResult", "ispass",
    'GO', 'HITCH', 'FLAT', 'OUT', 'CROSS', 'IN', 'POST', 'SLANT', 'CORNER', 'SCREEN', 'ANGLE', 'WHEEL',
]

find_sigs(motion2, motion2_control, analysis_cols, "")

# One row per significant finding
analysis1 = pd.DataFrame(sigs)

In [10]:
# Run find_sigs again with the same test dataset, this time using as control the
# plays that had a 2x2 receiver alignment but no shift.

# find_sigs writes into these module-level containers, so start from empty ones
sigs, cache = defaultdict(list), []

analysis_cols = [
    "offenseFormation", "pff_passCoverage", "down", "passLen_bins", "yardsGained_bins", "passResult", 'ispass',
    'GO', 'HITCH', 'FLAT', 'OUT', 'CROSS', 'IN', 'POST', 'SLANT', 'CORNER', 'SCREEN', 'ANGLE', 'WHEEL',
]

find_sigs(motion2, motion2_control2, analysis_cols, "")

# One row per significant finding
analysis2 = pd.DataFrame(sigs)

In [11]:
# Write the results of both find_sigs runs to their own CSV files

for results, destination in ((analysis1, "analysis1.csv"), (analysis2, "analysis2.csv")):
    results.to_csv(destination)