In [None]:
# Standard library
import os
import warnings

# Third-party
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Collect, then print, the full path of each file below the Kaggle input directory.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)

Game data: The games.csv contains the teams playing in each game. The key variable is gameId.

Play data: The plays.csv file contains play-level information from each game. The key variables are gameId and playId.

Player data: The players.csv file contains player-level information from players that participated in any of the tracking data files. The key variable is nflId.

Tracking data: Files tracking[season].csv contain player tracking data from season [season]. The key variables are gameId, playId, and nflId.

PFF Scouting data: The PFFScoutingData.csv file contains play-level scouting information for each game. The key variables are gameId and playId.

In [None]:

# Load the PFF scouting table; the per-season tracking files below stay commented out.
scouting_path = '/kaggle/input/nfl-big-data-bowl-2022/PFFScoutingData.csv'
scoutingData = pd.read_csv(scouting_path)
# tracking2018 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/tracking2018.csv')
# tracking2019 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/tracking2019.csv')
# tracking2020 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/tracking2020.csv')

In [None]:

# Report the (rows, columns) size of the scouting table.
scouting_shape = scoutingData.shape
print('The shape of dataset for Scouting :', scouting_shape)
# print('The shape of dataset for 2018 Season :', tracking2018.shape)
# print('The shape of dataset for 2019 Season :', tracking2019.shape)
# print('The shape of dataset for 2020 Season :', tracking2020.shape)

## Processing and Exploring Player data

In [None]:
# Load the player table and print a quick overview of it.
players_path = '/kaggle/input/nfl-big-data-bowl-2022/players.csv'
players = pd.read_csv(players_path)
divider = '*' * 50  # separator line printed between sections

print('data loading complete...')
print(divider)
print('The shape of dataset for players :', players.shape)
print(divider)
# Transposed previews of the first and last rows: one column per player.
for preview in (players.head(), players.tail()):
    display(preview.T)
    print(divider)
print('Unique values in player height :', players['height'].unique())
print(divider)

# Convert every height to whole inches.
# A value without '-' is taken to be inches already; 'A-B' becomes A*12 + B.
height = [
    int(parts[0]) if len(parts) == 1 else (int(parts[0]) * 12) + int(parts[1])
    for parts in (raw_height.split('-') for raw_height in players['height'])
]
players['height'] = height

print('Height in inches of all players :', players['height'].unique())
print('*' * 50)

# Summarise where players come from and which positions they play.
print('Total Number of Colleges : ', players['collegeName'].nunique())
print('*'*50)
print('Top 20 Colleges contributing to player pool : \n', players['collegeName'].value_counts()[:20])
print('*'*50)
# Only the LAST expression of a cell is rendered automatically. This statement sits in
# the middle of the cell, so without an explicit display() its result is discarded.
display(players['Position'].value_counts())

# Parse the birth dates once and reuse the parsed values for both derived columns.
# NOTE(review): if the raw column mixes several date formats, newer pandas releases may
# need an explicit `format=` argument here - confirm against the installed version.
birth_dates = pd.to_datetime(players['birthDate'])

# Store the dates back as 'YYYY-MM-DD' strings.
players['birthDate'] = birth_dates.dt.strftime('%Y-%m-%d')
display(players.tail(10).T)

# Year component of each birth date.
players['birthYear'] = birth_dates.dt.year

In [None]:
# Number of players per college, largest first, as a two-column frame (Name, Count).
# The original call passed a stray positional `0`, which `Series.sort_values` reads as
# `axis`. These arguments are keyword-only from pandas 2.0 onward, so the positional
# form raises TypeError there. Only `ascending` is actually needed.
college = (
    players.groupby('collegeName')
    .size()
    .sort_values(ascending=False)
    .reset_index()
)
college.columns = ['Name', 'Count']

In [None]:
# Overview figure of the player table: three numeric distributions, the 20 largest
# colleges and the number of players per field position.
f, axs = plt.subplots(3,2, figsize = (20,15))

plt.subplots_adjust(left=0.1,bottom=0.5,right=0.9, top=2, wspace=0.4)

# Numeric columns: one-unit-wide histogram bins with a KDE overlay.
for axis, column, label in (
    (axs[0,0], 'height', 'Height(in inches)'),
    (axs[0,1], 'weight', 'Weight(in pounds)'),
    (axs[1,0], 'birthYear', 'Year of Birth'),
):
    sns.histplot(players[column], binwidth = 1, kde = True, ax = axis)
    axis.set_xlabel(label)

# `college` is already sorted by Count, so the first 20 rows are the top 20.
sns.barplot(x = 'Name', y = 'Count', data = college[:20], ax = axs[1,1])
axs[1,1].set_xlabel('Top 20 colleges')
# Rotate the labels seaborn already placed instead of re-supplying them by hand.
axs[1,1].tick_params(axis = 'x', labelrotation = 50)

# Position is categorical, so count the rows per category (a KDE over category labels
# has no meaning). The tick labels must come from the plotted categories: the original
# passed the whole `players['Position']` column, which assigns labels in row order and
# therefore mislabels the bars (or fails on a label-count mismatch).
sns.countplot(x = 'Position', data = players, ax = axs[2,0])
axs[2,0].set_xlabel('Field Position')
axs[2,0].tick_params(axis = 'x', labelrotation = 50)

# The sixth grid slot is not used; hide its empty frame.
axs[2,1].axis('off')

## Processing and Exploring Games data

In [None]:
# Load the games table and report its (rows, columns) size.
games_path = '/kaggle/input/nfl-big-data-bowl-2022/games.csv'
games = pd.read_csv(games_path)
print('The shape of dataset for games :', games.shape)

In [None]:
# Parse the game dates once and derive all date-based columns from the parsed values.
game_dates = pd.to_datetime(games['gameDate'])

games['gameDate'] = game_dates.dt.strftime('%Y-%m-%d')  # stored back as 'YYYY-MM-DD' text
games['dayofweek'] = game_dates.dt.dayofweek            # pandas convention: Monday=0 ... Sunday=6
games['gameMonth'] = game_dates.dt.month                # calendar month number

# Transposed previews of the first and last rows.
for preview in (games.head(), games.tail()):
    display(preview.T)

In [None]:
# Number of games in each season, with the count written above every bar.
fig , ax = plt.subplots(figsize = (10,10))
# Name the column via `x=` and target `ax` explicitly: passing the Series as the first
# positional argument was deprecated in seaborn 0.11 and is no longer accepted as `x`
# from 0.12 on (the global warnings filter hides the deprecation message).
sns.countplot(x = 'season', data = games, ax = ax)
for p in ax.patches:
    # Centre each label on its bar whatever the bar width is.
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+0.5),
                ha='center', va='top', color='black', size=14)

In [None]:
# Games hosted (top panel) and games played away (bottom panel) per team.
fig , ax = plt.subplots(2,1, figsize = (20,20))

for axis, column, label in (
    (ax[0], 'homeTeamAbbr', 'Teams Hosting'),
    (ax[1], 'visitorTeamAbbr', 'Teams Visiting'),
):
    # Keyword `x=` / `data=`: passing the Series positionally was deprecated in
    # seaborn 0.11 and is no longer interpreted as `x` from 0.12 on.
    sns.countplot(x = column, data = games, ax = axis)
    axis.set_xlabel(label)
    for p in axis.patches:
        # Centre each count label on its bar.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+0.5),
                      ha='center', va='top', color='black', size=14)


In [None]:
# Games per day of the week, one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (20,20))

fig.suptitle('Games on different days of the week')

# Index = pandas `dt.dayofweek` value (Monday=0 ... Sunday=6).
day_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

for axis, season in zip(ax, (2018, 2019, 2020)):
    season_games = games.loc[games['season'] == season]
    # Derive the tick labels from the days that actually occur in this season.
    # Hard-coded label lists silently mislabel the bars (or fail on a length
    # mismatch) as soon as the set of game days differs from the assumed one.
    days_played = sorted(season_games['dayofweek'].dropna().unique())
    sns.countplot(x = 'dayofweek', data = season_games, order = days_played, ax = axis)
    axis.set_xlabel('Season {}'.format(season))
    axis.set_xticklabels([day_names[int(day)] for day in days_played])
    for p in axis.patches:
        # Centre each count label on its bar.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+9),
                      ha='center', va='top', color='black', size=14)

In [None]:
# Games per calendar month, one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (20,20))

fig.suptitle('Games on different months')

# Index = month number - 1.
month_names = ['January','February','March','April','May','June',
               'July','August','September','October','November','December']

for axis, season in zip(ax, (2018, 2019, 2020)):
    season_games = games.loc[games['season'] == season]
    # Build the tick labels from the months present in this season rather than from a
    # hard-coded list, which mislabels bars whenever the assumed months do not match.
    months_played = sorted(season_games['gameMonth'].dropna().unique())
    sns.countplot(x = 'gameMonth', data = season_games, order = months_played, ax = axis)
    axis.set_xlabel('Season {}'.format(season))
    axis.set_xticklabels([month_names[int(month) - 1] for month in months_played])
    for p in axis.patches:
        # Centre each count label on its bar.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+3),
                      ha='center', va='top', color='black', size=14)

In [None]:
# Number of games in each week of the season (all seasons pooled).
fig , ax = plt.subplots(figsize = (20,10))

# Keyword `x=` / `data=` plus an explicit `ax`: passing the Series positionally was
# deprecated in seaborn 0.11 and is no longer interpreted as `x` from 0.12 on.
sns.countplot(x = 'week', data = games, ax = ax)
ax.set_xlabel('Games per week of the season')
for p in ax.patches:
    # Centre each count label on its bar.
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+1),
                ha='center', va='top', color='black', size=14)

In [None]:
# Kick-off time (Eastern) counts, one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (20,20))

fig.suptitle('Games Timings during different seasons')

season_panels = (
    (ax[0], 2018, 'Season 2018'),
    (ax[1], 2019, 'Season 2019'),
    (ax[2], 2020, 'Season 2020'),
)
for axis, season, label in season_panels:
    season_games = games.loc[games['season'] == season]
    sns.countplot(x = 'gameTimeEastern', data = season_games, ax = axis)
    axis.set_xlabel(label)
    # Write each bar's count just above it.
    for p in axis.patches:
        label_xy = (p.get_x()+0.4, p.get_height()+5)
        axis.annotate('{:.1f}'.format(p.get_height()), label_xy, ha='center', va='top',
                      color='black', size=14)

## Processing and Exploring Plays data

In [None]:
# Load the plays table and report its (rows, columns) size.
plays_path = '/kaggle/input/nfl-big-data-bowl-2022/plays.csv'
plays = pd.read_csv(plays_path)
print('The shape of dataset for plays :', plays.shape)

In [None]:
# Attach the game-level columns to every play (join key: gameId).
# The cell overwrites `plays`, so running it a second time used to merge the game
# columns in again, producing `_x`/`_y` duplicates and breaking later cells that look
# up the plain column names. Only bring in columns `plays` does not already have, which
# makes a re-run a no-op. A column present in both frames (other than gameId) keeps the
# `plays` version.
game_columns = [col for col in games.columns if col == 'gameId' or col not in plays.columns]
plays = plays.merge(games[game_columns], on = 'gameId')

In [None]:
# Flag each play: 1 when the team in possession is the home team, otherwise 0.
homeTeam = [
    int(home_abbr == possession_abbr)
    for home_abbr, possession_abbr in zip(plays['homeTeamAbbr'], plays['possessionTeam'])
]

plays['HomeTeamPossesion'] = homeTeam
plays.head().T

In [None]:
# Number of plays in each quarter, one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (20,20))

# Title typo fixed ("Quater" -> "Quarter").
fig.suptitle('Number of plays per Quarter')

# One loop replaces three copy-pasted panels that differed only in the season.
for axis, season in zip(ax, (2018, 2019, 2020)):
    sns.countplot(x = 'quarter', data = plays.loc[plays['season'] == season], ax = axis)
    axis.set_xlabel('Season {}'.format(season))
    for p in axis.patches:
        # Centre each count label on its bar.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+10),
                      ha='center', va='top', color='black', size=14)

In [None]:
# Horizontal bars: number of plays per possession team, most frequent team first.
fig , ax = plt.subplots(figsize = (20,10))

teams_by_play_count = plays['possessionTeam'].value_counts().index
sns.countplot(data = plays, y = 'possessionTeam', order = teams_by_play_count, ax = ax)
ax.set_xlabel('Number of plays')
ax.set_ylabel('Team in Possession')



In [None]:
# Number of plays per possession team, one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (25,20))

fig.suptitle('Plays made by teams per season')

for axis, season in zip(ax, (2018, 2019, 2020)):
    season_plays = plays.loc[plays['season'] == season]
    # Alphabetical order in every panel: the default (order of first appearance)
    # differs from season to season, which makes the panels hard to compare.
    team_order = sorted(season_plays['possessionTeam'].dropna().unique())
    sns.countplot(x = 'possessionTeam', data = season_plays, order = team_order, ax = axis)
    axis.set_xlabel('Season {}'.format(season))
    for p in axis.patches:
        # Centre the label on the bar; the previous fixed 0.25 offset sat left of centre.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+8),
                      ha='center', va='top', color='black', size=14)

In [None]:
# Number of plays in each quarter split by the HomeTeamPossesion flag
# (1 = home team in possession, 0 = visiting team), one panel per season.
fig , ax = plt.subplots(3, 1, figsize = (20,20))

# The previous title was copied from the un-split figure (including the "Quater" typo)
# and did not mention the home/visitor split shown here.
fig.suptitle('Number of plays per Quarter, split by home-team possession')

for axis, season in zip(ax, (2018, 2019, 2020)):
    sns.countplot(x = 'quarter', data = plays.loc[plays['season'] == season],
                  hue = 'HomeTeamPossesion', ax = axis)
    axis.set_xlabel('Season {}'.format(season))
    for p in axis.patches:
        # Centre each count label on its (half-width, hue-split) bar.
        axis.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+8),
                      ha='center', va='top', color='black', size=14)

In [None]:
# Distribution of the yardline number at which plays start, with a KDE overlay.
fig , ax = plt.subplots(figsize = (20,10))
sns.histplot(plays['yardlineNumber'], kde = True, ax = ax)
# The axis labels had been copied from the plays-per-team chart: the x axis is the
# yardline number and the y axis is the number of plays in each bin.
ax.set_xlabel('Yardline number')
ax.set_ylabel('Number of plays')
for p in ax.patches:
    # Centre each bin's count on the bin, whatever bin width seaborn chose.
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+250),
                ha='center', va='top', color='black', size=14)


In [None]:
# Distinct values of the special-teams result column, shown as the cell output.
special_teams_results = plays['specialTeamsResult'].unique()
special_teams_results

In [None]:
# Number of plays per special-teams result, most frequent result first.
fig , ax = plt.subplots(figsize = (20,10))

result_order = plays['specialTeamsResult'].value_counts().index
sns.countplot(x = 'specialTeamsResult', data = plays, order = result_order, ax = ax)
ax.set_xlabel('Result of the play')
ax.set_ylabel('Count of Plays')
# Label the bars with the very order they were drawn in. The previous hand-typed list
# assumed a particular frequency ranking and would mislabel bars if the counts ranked
# differently (or fail if the number of categories changed).
ax.set_xticklabels(result_order, rotation = 50)
for p in ax.patches:
    # Centre each count label on its bar.
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2, p.get_height()+140),
                ha='center', va='top', color='black', size=14)


In [None]:
# Histogram of kick lengths with a KDE overlay.
fig , ax = plt.subplots(figsize = (20,10))
sns.histplot(data = plays, x = 'kickLength', kde = True, ax = ax)
ax.set_xlabel('Distance of Kicks')
ax.set_ylabel('Count')

In [None]:
# Total number of plays, and how many of them have no passResult recorded.
total_plays = plays.shape[0]
plays_without_pass_result = plays['passResult'].isna().sum()

print('Total Number of Plays :', total_plays)
print('*' * 50)
print('Total Number of non pass plays :', plays_without_pass_result)

## Processing 2018 Tracking data

- x : 0 to 120 (in yards)
- y : 0 to 53.3 (in yards)
- s : speed (in yards/sec)
- a : acceleration (in yards/sec^2)
- dis : Distance travelled from previous point (in yards)
- o : player Orientation (degree)
- dir : Direction of movement (degree) 

In [None]:
# tracking2018 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2022/tracking2018.csv')
# print('The shape of dataset for 2018 Season :', tracking2018.shape)
# print('*'*50)
# tracking2018.head().T