In [96]:
from matplotlib import pyplot as plt
from scipy.stats import linregress
import numpy as np
from sklearn import datasets
import pandas as pd

In [97]:
player_position_df = pd.read_csv('data/nfl-big-data-bowl-2024/players_basic.csv')
player_position_df.head()

Unnamed: 0,position,displayName
0,QB,Tom Brady
1,T,Jason Peters
2,QB,Aaron Rodgers
3,TE,Marcedes Lewis
4,QB,Matt Ryan


In [98]:
matchups_df = pd.read_csv('data/nfl-big-data-bowl-2024/matchups.csv')
matchups_df.head()

Unnamed: 0,matchup,gameId,week,home,away
0,LA_BUF,2022090800,1,LA,BUF
1,ATL_NO,2022091100,1,ATL,NO
2,CAR_CLE,2022091101,1,CAR,CLE
3,CHI_SF,2022091102,1,CHI,SF
4,CIN_PIT,2022091103,1,CIN,PIT


In [99]:
plays_df = pd.read_csv('data/nfl-big-data-bowl-2024/plays.csv')
plays_df.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
0,2022100908,3537,48723,Parker Hesse,(7:52) (Shotgun) M.Mariota pass short middle t...,4,1,10,ATL,TB,...,0.976785,0.023215,-0.00611,0.00611,2.360609,0.981955,,,,
1,2022091103,3126,52457,Chase Claypool,(7:38) (Shotgun) C.Claypool right end to PIT 3...,4,1,10,PIT,CIN,...,0.160485,0.839515,-0.010865,0.010865,1.733344,-0.263424,,,,
2,2022091111,1148,42547,Darren Waller,(8:57) D.Carr pass short middle to D.Waller to...,2,2,5,LV,LAC,...,0.756661,0.243339,-0.037409,0.037409,1.312855,1.133666,,,,
3,2022100212,2007,46461,Mike Boone,(13:12) M.Boone left tackle to DEN 44 for 7 ya...,3,2,10,DEN,LV,...,0.620552,0.379448,-0.002451,0.002451,1.641006,-0.04358,,,,
4,2022091900,1372,47857,Devin Singletary,(8:33) D.Singletary right guard to TEN 32 for ...,2,1,10,BUF,TEN,...,0.83629,0.16371,0.001053,-0.001053,3.686428,-0.167903,,,,


In [100]:
plays_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12486 entries, 0 to 12485
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   gameId                            12486 non-null  int64  
 1   playId                            12486 non-null  int64  
 2   ballCarrierId                     12486 non-null  int64  
 3   ballCarrierDisplayName            12486 non-null  object 
 4   playDescription                   12486 non-null  object 
 5   quarter                           12486 non-null  int64  
 6   down                              12486 non-null  int64  
 7   yardsToGo                         12486 non-null  int64  
 8   possessionTeam                    12486 non-null  object 
 9   defensiveTeam                     12486 non-null  object 
 10  yardlineSide                      12319 non-null  object 
 11  yardlineNumber                    12486 non-null  int64  
 12  game

In [101]:
plays_df = pd.merge(player_position_df, plays_df, how='left', left_on='displayName', right_on='ballCarrierDisplayName')
plays_df.head()

Unnamed: 0,position,displayName,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,...,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
0,QB,Tom Brady,2022102000.0,2725.0,25511.0,Tom Brady,"(:27) (No Huddle, Shotgun) T.Brady scrambles u...",3.0,2.0,8.0,...,0.766339,0.233661,0.030954,-0.030954,4.522505,-0.680631,,,,
1,QB,Tom Brady,2022093000.0,2687.0,25511.0,Tom Brady,(1:08) (Shotgun) T.Brady scrambles up the midd...,3.0,3.0,3.0,...,0.236263,0.763737,-0.052861,0.052861,0.35447,-1.556648,Offensive Holding,,53027.0,
2,QB,Tom Brady,2022102000.0,3652.0,25511.0,Tom Brady,(7:07) T.Brady up the middle to PIT 29 for no ...,4.0,3.0,1.0,...,0.738573,0.261427,0.027505,-0.027505,3.937956,-1.125045,,,,
3,QB,Tom Brady,2022103000.0,3083.0,25511.0,Tom Brady,(13:22) (Shotgun) T.Brady scrambles left end t...,4.0,3.0,10.0,...,0.331841,0.668159,-0.044003,0.044003,3.793404,-1.062754,,,,
4,T,Jason Peters,,,,,,,,,...,,,,,,,,,,


In [102]:
# Fill non-finite values with 0
plays_df['defendersInTheBox'] = plays_df['defendersInTheBox'].fillna(0)
plays_df['gameId'] = plays_df['gameId'].fillna(0)
# Convert the column to integer
plays_df['defendersInTheBox'] = plays_df['defendersInTheBox'].astype(int)
plays_df['gameId'] = plays_df['gameId'].astype(int)
# Change column type to string for columns: 'yardlineSide', 'possessionTeam', 'defensiveTeam'
plays_df = plays_df.astype({'yardlineSide': 'string', 'position':'string','possessionTeam': 'string', 'defensiveTeam': 'string','offenseFormation': 'string','ballCarrierId': 'string', 'ballCarrierDisplayName': 'string','passResult':'string','playNullifiedByPenalty':'string','playDescription':'string'})
# Change column type to datetime64[ns] for column: 'gameClock'
plays_df = plays_df.astype({'gameClock': 'datetime64[ns]',})
# Drop columns: 'passProbability', 'preSnapHomeTeamWinProbability' and 5 other columns
plays_df_clean = plays_df.drop(columns=['passProbability', 'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability', 'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded', 'expectedPoints', 'expectedPointsAdded','foulName1', 'foulName2', 'foulNFLId1', 'foulNFLId2','prePenaltyPlayResult','penaltyYards','displayName'])
plays_df_clean.head()

Unnamed: 0,position,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,...,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox
0,QB,2022102301,2725.0,25511.0,Tom Brady,"(:27) (No Huddle, Shotgun) T.Brady scrambles u...",3.0,2.0,8.0,TB,...,2024-03-17 00:27:00,14.0,0.0,R,,1.0,N,92.0,SHOTGUN,7
1,QB,2022092512,2687.0,25511.0,Tom Brady,(1:08) (Shotgun) T.Brady scrambles up the midd...,3.0,3.0,3.0,TB,...,2024-03-17 01:08:00,6.0,14.0,R,,-10.0,Y,80.0,EMPTY,5
2,QB,2022101607,3652.0,25511.0,Tom Brady,(7:07) T.Brady up the middle to PIT 29 for no ...,4.0,3.0,1.0,TB,...,2024-03-17 07:07:00,20.0,12.0,,,0.0,N,81.0,SINGLEBACK,7
3,QB,2022102700,3083.0,25511.0,Tom Brady,(13:22) (Shotgun) T.Brady scrambles left end t...,4.0,3.0,10.0,TB,...,2024-03-17 13:22:00,10.0,17.0,R,,1.0,N,23.0,SHOTGUN,5
4,T,0,,,,,,,,,...,NaT,,,,,,,,,0


In [107]:

qb_plays = plays_df_clean[plays_df_clean['position'] == "QB"]
qb_plays.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,position,playDescription,quarter,down,yardsToGo,possessionTeam,...,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox
0,2022102301,2725.0,25511.0,Tom Brady,QB,"(:27) (No Huddle, Shotgun) T.Brady scrambles u...",3.0,2.0,8.0,TB,...,2024-03-17 00:27:00,14.0,0.0,R,,1.0,N,92.0,SHOTGUN,7
1,2022092512,2687.0,25511.0,Tom Brady,QB,(1:08) (Shotgun) T.Brady scrambles up the midd...,3.0,3.0,3.0,TB,...,2024-03-17 01:08:00,6.0,14.0,R,,-10.0,Y,80.0,EMPTY,5
2,2022101607,3652.0,25511.0,Tom Brady,QB,(7:07) T.Brady up the middle to PIT 29 for no ...,4.0,3.0,1.0,TB,...,2024-03-17 07:07:00,20.0,12.0,,,0.0,N,81.0,SINGLEBACK,7
3,2022102700,3083.0,25511.0,Tom Brady,QB,(13:22) (Shotgun) T.Brady scrambles left end t...,4.0,3.0,10.0,TB,...,2024-03-17 13:22:00,10.0,17.0,R,,1.0,N,23.0,SHOTGUN,5
5,2022110603,3064.0,29851.0,Aaron Rodgers,QB,(12:24) (Shotgun) A.Rodgers scrambles up the m...,4.0,3.0,17.0,GB,...,2024-03-17 12:24:00,15.0,6.0,R,,18.0,N,46.0,SHOTGUN,4


In [103]:
plays_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13960 entries, 0 to 13959
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   position                13960 non-null  string        
 1   gameId                  13960 non-null  int64         
 2   playId                  12763 non-null  float64       
 3   ballCarrierId           12763 non-null  string        
 4   ballCarrierDisplayName  12763 non-null  string        
 5   playDescription         12763 non-null  string        
 6   quarter                 12763 non-null  float64       
 7   down                    12763 non-null  float64       
 8   yardsToGo               12763 non-null  float64       
 9   possessionTeam          12763 non-null  string        
 10  defensiveTeam           12763 non-null  string        
 11  yardlineSide            12594 non-null  string        
 12  yardlineNumber          12763 non-null  float6

In [104]:
plays_df_clean = plays_df_clean[[ 'gameId', 'playId', 'ballCarrierId', 'ballCarrierDisplayName', 'position','playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam', 'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore', 'passResult', 'passLength', 'playResult', 'playNullifiedByPenalty', 'absoluteYardlineNumber', 'offenseFormation', 'defendersInTheBox']]


In [105]:
# Filter rows based on column: 'down'
plays_3rd_down_df = plays_df_clean[plays_df_clean['down'] == 3]
plays_3rd_down_df.head()
    

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,position,playDescription,quarter,down,yardsToGo,possessionTeam,...,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox
1,2022092512,2687.0,25511.0,Tom Brady,QB,(1:08) (Shotgun) T.Brady scrambles up the midd...,3.0,3.0,3.0,TB,...,2024-03-17 01:08:00,6.0,14.0,R,,-10.0,Y,80.0,EMPTY,5
2,2022101607,3652.0,25511.0,Tom Brady,QB,(7:07) T.Brady up the middle to PIT 29 for no ...,4.0,3.0,1.0,TB,...,2024-03-17 07:07:00,20.0,12.0,,,0.0,N,81.0,SINGLEBACK,7
3,2022102700,3083.0,25511.0,Tom Brady,QB,(13:22) (Shotgun) T.Brady scrambles left end t...,4.0,3.0,10.0,TB,...,2024-03-17 13:22:00,10.0,17.0,R,,1.0,N,23.0,SHOTGUN,5
5,2022110603,3064.0,29851.0,Aaron Rodgers,QB,(12:24) (Shotgun) A.Rodgers scrambles up the m...,4.0,3.0,17.0,GB,...,2024-03-17 12:24:00,15.0,6.0,R,,18.0,N,46.0,SHOTGUN,4
7,2022091812,2969.0,29851.0,Aaron Rodgers,QB,(6:38) (Shotgun) A.Rodgers right end to GB 13 ...,4.0,3.0,2.0,GB,...,2024-03-17 06:38:00,24.0,10.0,,,4.0,N,101.0,SHOTGUN,6


In [106]:
plays_4th_down_df = plays_df_clean[plays_df_clean['down'] == 4]
plays_4th_down_df.head()

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,position,playDescription,quarter,down,yardsToGo,possessionTeam,...,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox
14,2022092502,3467.0,33084.0,Matt Ryan,QB,(6:39) M.Ryan up the middle to IND 35 for 2 ya...,4.0,4.0,1.0,IND,...,2024-03-17 06:39:00,13.0,17.0,,,2.0,N,43.0,SINGLEBACK,8
71,2022102700,4249.0,37079.0,Julio Jones,WR,(:53) (Shotgun) T.Brady pass short right to J....,4.0,4.0,8.0,TB,...,2024-03-17 00:53:00,16.0,27.0,C,3.0,8.0,N,18.0,SHOTGUN,5
205,2022103000,1533.0,38605.0,Russell Wilson,QB,(5:23) R.Wilson up the middle to JAX 34 for 2 ...,2.0,4.0,1.0,DEN,...,2024-03-17 05:23:00,10.0,0.0,,,2.0,N,46.0,SINGLEBACK,7
209,2022103000,4058.0,38605.0,Russell Wilson,QB,(:36) R.Wilson right guard to JAX 25 for 1 yar...,4.0,4.0,1.0,DEN,...,2024-03-17 00:36:00,17.0,21.0,,,1.0,N,84.0,I_FORM,10
250,2022102304,4412.0,38696.0,Marvin Jones,WR,(:22) (Shotgun) T.Lawrence pass deep middle to...,4.0,4.0,15.0,JAX,...,2024-03-17 00:22:00,17.0,23.0,C,22.0,43.0,N,50.0,EMPTY,4


merged_df = pd.merge(matchups_df, tracking_test_df, how='left', left_on='gameId', right_on='gameId')
merged_df.head()