In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Introduction
- The National Football League (NFL) is a professional American football league 
- The NFL is one of the four major North American professional sports leagues, the highest professional level of American football in the world.
- The NFL's eighteen-week regular season runs from early September to early January, with each team playing seventeen games and having one bye week.
- The NFL was formed in 1920 as the American Professional Football Association (APFA) before renaming itself the National Football League for the 1922 season. 

**Will keep updating**

![](http://unsplash.com/photos/0ndQ1W0w99Q)

# 2. Games Dataset

In [None]:
# Load the games table (games.csv) from the Big Data Bowl 2022 input folder.
df_game = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')

# Dataset Description 

|**Column Name**|**Description**|
| :-- | :-- |
|**gameId**|Game identifier, unique|
|**season**|Season of game|
|**gameDate**|Game Date (time, mm/dd/yyyy)|
|**gameTimeEastern**|Start time of game (time, HH:MM:SS, EST)|
|**homeTeamAbbr**|Home team three-letter code|
|**visitorTeamAbbr**|Visiting team three-letter code|
|**week**|Week of game|

In [None]:
# Preview the first rows of the games table.
df_game.head()

**Dataset Shape**

In [None]:
# (rows, columns) of the games table.
df_game.shape

**Samples total - 764 Dimensionality - 7**

**Dataset Type**

In [None]:
# Column dtypes and non-null counts for the games table.
df_game.info()

**Dataset has a mixture of 4 categorical and 3 numerical data**

**Checking for missing values**

In [None]:
# Missing values per column in the games table.
df_game.isna().sum()

**There are no missing values**

# 3. EDA

**Number of games played in each season**

In [None]:
# Number of games recorded for each season.
df_game['season'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Take the slice sizes straight from the data instead of hard-coding them,
# so the chart cannot drift away from what games.csv actually contains.
season_counts = df_game['season'].value_counts()
labels = season_counts.index.astype(str).tolist()
sizes = season_counts.tolist()
colors = ['gold','lightskyblue', 'green']
# value_counts() sorts descending, so position 0 is the season with the most
# games; only that slice is pulled out of the pie.
explode = [0.1 if position == 0 else 0 for position in range(len(sizes))]

plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)

# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

**256 regular season games were played in 2020**

**Feature Engineering**

In [None]:
# Month number: the text before the first '/' in gameDate, as an integer.
df_game['month'] = df_game['gameDate'].str.split('/').str[0].astype('int64')

In [None]:
# Day of month: the second '/'-separated field of gameDate, as an integer.
df_game['day'] = df_game['gameDate'].str.split('/').str[1].astype('int64')

In [None]:
# Kick-off hour: the text before the first ':' in gameTimeEastern, as an integer.
df_game['hour'] = df_game['gameTimeEastern'].str.split(':').str[0].astype('int64')

In [None]:
# Number of games in each calendar month.
df_game['month'].value_counts()

In [None]:
import seaborn as sns

# Games per calendar month, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(data=df_game, x="month", ax=ax)

In [None]:
# Games per day of the month.
fig, ax = plt.subplots(figsize=(16, 7))
sns.countplot(data=df_game, x="day", ax=ax)

In [None]:
# Games per kick-off hour (Eastern time).
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df_game, x="hour", ax=ax)

In [None]:
# Games per exact kick-off time string.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df_game, x="gameTimeEastern", ax=ax)

**Majority of games were played at 13:00 hrs, 16:00 hrs, and 20:00 hrs onwards**

In [None]:
# Games per week of the season.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df_game, x="week", ax=ax)

**All 256 regular season games were played within the original 17-week span**

In [None]:
# Pairwise correlation between the numeric columns of the games table.
# The frame also holds text columns (dates, times, team codes); recent pandas
# versions raise instead of silently dropping them, so select the numeric
# columns explicitly. select_dtypes is used because it behaves the same on
# old and new pandas releases.
x = df_game.select_dtypes(include='number').corr()
x

In [None]:
# Annotated heatmap of the correlation matrix computed in the previous cell.
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(x, annot=True, ax=ax)

# 4. Players

|**Column Name**|**Description**|
| :-- | :-- |
|**nflId**|Player identification number, unique across players|
|**height**|Player height|
|**weight**|Player weight|
|**birthDate**|Date of birth (YYYY-MM-DD)|
|**collegeName**|Player college|
|**position**|Player position|
|**displayName**|Player name|

In [None]:
# Load the players table (players.csv) from the Big Data Bowl 2022 input folder.
df_player=pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')

**Dataset Shape**

In [None]:
# (rows, columns) of the players table.
df_player.shape

**Samples total - 2732 Dimensionality - 7**

**Dataset Type**

In [None]:
# Column dtypes and non-null counts for the players table.
df_player.info()

**Dataset has a mixture of 5 categorical and 2 numerical data**

**Checking for missing values**

In [None]:
# Missing values per column in the players table.
df_player.isna().sum()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df_player.describe(include='all')

In [None]:
# Frequency of each raw height value; shows which formats occur in the column.
df_player['height'].value_counts()

**Feature Engineering**

In [None]:
def _height_to_feet(raw_height):
    """Convert raw roster heights to feet.

    Parameters
    ----------
    raw_height : pandas.Series
        Heights written either as ``feet-inches`` (e.g. ``"6-2"``) or as a
        plain number of inches (e.g. ``"74"``).

    Returns
    -------
    pandas.Series
        float32 heights in feet, indexed like ``raw_height``. Missing or
        unparseable entries become NaN.
    """
    # Group 0 is the leading number; group 1 is the part after '-', if any.
    parts = raw_height.astype(str).str.strip().str.extract(
        r'^(\d+(?:\.\d+)?)(?:-(\d+))?$'
    )
    leading = pd.to_numeric(parts[0], errors='coerce')
    trailing = pd.to_numeric(parts[1], errors='coerce')
    # Without a '-' part the leading number is already inches; otherwise it is
    # feet and has to be combined with the trailing inches.
    inches = leading.where(trailing.isna(), leading * 12 + trailing)
    return inches.astype(np.float32) / 12


# Convert only while the column still holds the raw text values, so that
# re-running this cell does not divide the heights by 12 a second time.
if not pd.api.types.is_numeric_dtype(df_player['height']):
    df_player['height'] = _height_to_feet(df_player['height'])

In [None]:
# Display the players table after the height conversion.
df_player

In [None]:
# Display the converted height column.
df_player['height']

In [None]:
# Distribution of player height. sns.distplot is deprecated, so the same
# picture (density-normalised histogram plus KDE curve) is drawn with histplot.
sns.histplot(df_player['height'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='b')

In [None]:
# Distribution of player weight. sns.distplot is deprecated, so the same
# picture (density-normalised histogram plus KDE curve) is drawn with histplot.
sns.histplot(df_player['weight'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='g')

In [None]:
# The 20 colleges with the most players in the table.
college_counts = df_player['collegeName'].value_counts().sort_values(ascending=False)
college_counts.iloc[:20]

In [None]:
# Bar chart of the five colleges with the most players in the table.
top_colleges = df_player['collegeName'].value_counts().sort_values(ascending=False).iloc[:5]
top_colleges.plot.bar()

In [None]:
# Bar chart of the five most frequent display names.
top_names = df_player['displayName'].value_counts().sort_values(ascending=False).iloc[:5]
top_names.plot.bar()

In [None]:
# Name vs. college for the first five rows of the players table
# (row order of the file; no ranking is applied).
first_players = df_player.iloc[:5]
fig, ax = plt.subplots(figsize=(14, 7))
sns.scatterplot(data=first_players, x='displayName', y='collegeName', ax=ax)

# Football Position Abbreviations
- **WR** = Wide Receiver
- **CB** = Cornerback
- **DE** = Defensive End
- **OLB** = Outside Linebacker
- **TE** = Tight End
- **T** = Tackle
- **RB** = Running Back
- **G** = Guard
- **DT** = Defensive Tackle
- **ILB** = Inside Linebacker
- **FS** = Free Safety
- **SS** = Strong Safety
- **C** = Center
- **K** = Placekicker
- **P** = Punter
- **NT** = Nose Tackle
- **LS** = Long Snapper
- **LB** = Linebacker
- **DB** = Defensive Back
- **MLB** = Middle Linebacker
- **FB** = Fullback
- **OT** = Offensive Tackle
- **QB** = Quarterback
- **OG** = Offensive Guard
- **S** = Safety
- **HB** = Halfback

In [None]:
# Height per playing position (bar plot of the height column grouped by Position).
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=df_player, x='Position', y='height', ax=ax)

In [None]:
# Show the raw birthDate column before it is parsed into datetimes.
print("birthDate :",df_player['birthDate'])

In [None]:
from datetime import datetime

In [None]:
from datetime import date

In [None]:
# Parse birthDate into datetimes; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
df_player['birthDate'] = pd.to_datetime(df_player['birthDate'], errors='coerce')

In [None]:
# Birth year of each player. `.dt.year` yields numbers rather than the text
# that strftime("%Y") produced, so the column can be plotted and aggregated
# as a quantity; rows whose birthDate is NaT become NaN.
df_player['formatted_df'] = df_player["birthDate"].dt.year
df_player['formatted_df']

In [None]:
# Distribution of birth years. The column may hold the year as text, so it is
# coerced to numbers first (unparseable entries become NaN and are ignored).
# sns.distplot is deprecated; histplot draws the equivalent histogram + KDE.
birth_year = pd.to_numeric(df_player['formatted_df'], errors='coerce')
sns.histplot(birth_year, kde=True, stat='density',
             kde_kws=dict(cut=3), color='y')

**Minimum year - 1972 and Maximum year - 1999**

# 5. Plays

**Dataset Description**

|**Column Name**|**Description**|
| :-- | :-- |
|**gameId**|Game identifier, unique|
|**playId**|Play identifier, not unique across games|
|**playDescription**|Description of play|
|**quarter**|Game quarter|
|**down**|Down|
|**yardsToGo**|Distance needed for a first down|
|**possessionTeam**|Team punting, placekicking or kicking off the ball|
|**specialTeamsPlayType**|Formation of play: Extra Point, Field Goal, Kickoff or Punt|
|**specialTeamsPlayResult**|Special Teams outcome of play dependent on play type: Blocked Kick Attempt, Blocked Punt, Downed, Fair Catch, Kick Attempt Good, Kick Attempt No Good, Kickoff Team Recovery, Muffed, Non-Special Teams Result, Out of Bounds, Return or Touchback|
|**kickerId**|nflId of placekicker, punter or kickoff specialist on play|
|**returnerId**|nflId(s) of returner(s) on play if there was a special teams return. Multiple returners on a play are separated by a ;|
|**kickBlockerId**|nflId of blocker of kick on play if there was a blocked field goal or blocked punt|
|**yardlineSide**|3-letter team code corresponding to line-of-scrimmage|
|**yardlineNumber**|Yard line at line-of-scrimmage|
|**gameClock**|Time on clock of play (MM:SS)|
|**penaltyCodes**|NFL categorization of the penalties that occurred on the play. Multiple penalties on a play are separated by a ;|
|**penaltyJerseyNumber**|Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ;|
|**penaltyYards**|yards gained by possessionTeam by penalty|
|**preSnapHomeScore**|Home score prior to the play|
|**preSnapVisitorScore**|Visiting team score prior to the play|
|**passResult**|Scrimmage outcome of the play if specialTeamsPlayResult is "Non-Special Teams Result" (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, R: Scramble)|
|**kickLength**|Kick length in air of kickoff, field goal or punt|
|**kickReturnYardage**|Yards gained by return team if there was a return on a kickoff or punt|
|**playResult**|Net yards gained by the kicking team, including penalty yardage|
|**absoluteYardlineNumber**|Location of ball downfield in tracking data coordinates|


In [None]:
# Load the plays table (plays.csv) from the Big Data Bowl 2022 input folder.
df_plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')

In [None]:
# (rows, columns) of the plays table.
df_plays.shape

In [None]:
# Column dtypes and non-null counts for the plays table.
df_plays.info()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df_plays.describe(include='all')

There are too many null values in the **penaltyCodes, penaltyJerseyNumbers, penaltyYards, passResult and kickReturnYardage** columns.

In [None]:
# Missing values per column in the plays table.
df_plays.isna().sum()

**Which is the highest scoring quarter in NFL?**

In [None]:
# Number of plays recorded in each quarter.
df_plays['quarter'].value_counts()

**A quarter in football is the main unit of time measurement for the game. The game of football is played in four quarters, each with a 15 minute game clock.**

In [None]:
import matplotlib.pyplot as plt

# Take the slice sizes straight from the data instead of hard-coding them,
# so the chart cannot drift away from what plays.csv actually contains.
# sort_index() keeps the slices in quarter order.
quarter_counts = df_plays['quarter'].value_counts().sort_index()
labels = quarter_counts.index.astype(str).tolist()
sizes = quarter_counts.tolist()
colors = ['gold','lightskyblue', 'green', 'blue', 'pink']
# Pull out the smallest slice so that it stays visible next to the others.
fewest_plays = quarter_counts.min()
explode = [0.1 if count == fewest_plays else 0 for count in sizes]

plt.figure(figsize=(14,7))
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)

# Equal aspect ratio keeps the pie circular.
plt.axis('equal')
plt.show()

**After the chess match that is the first quarter of an NFL game, it's time for the action to start in the second quarter. This is typically the highest scoring quarter of the game and the one where one team may start to pull away. Note that the chart above counts the plays recorded in each quarter (quarter 5 is overtime), not the points scored.**

**How many downs does a team get to earn a first down?**

In [None]:
# Distribution of the `down` column. sns.distplot is deprecated; histplot is
# its replacement. discrete=True centres one bar on each integer value.
# NOTE(review): assumes `down` holds whole numbers only - confirm with df_plays.info().
plt.figure(figsize=(14,7))
sns.histplot(df_plays['down'], discrete=True, stat='density', color='r')

A team's offense is given **four downs** (plays) to move ten yards toward the opponent's end zone.

**How many yards to go to get a first down in the NFL?**

In [None]:
# Number of plays for each yards-to-go value.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=df_plays, x='yardsToGo', ax=ax)

**Each time the offense gets the ball, it has four downs in which to gain 10 yards. If the offensive team successfully moves the ball 10 or more yards, it earns a first down and another set of four downs. If the offense fails to gain 10 yards, it loses possession of the ball.**

**Home VS Visitor Score**

In [None]:
# Distribution of the home team's score before the snap. sns.distplot is
# deprecated; histplot draws the equivalent density histogram with a KDE curve.
plt.figure(figsize=(15,7))
sns.histplot(df_plays['preSnapHomeScore'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='g')

In [None]:
# Distribution of the visiting team's score before the snap. sns.distplot is
# deprecated; histplot draws the equivalent density histogram with a KDE curve.
plt.figure(figsize=(15,7))
sns.histplot(df_plays['preSnapVisitorScore'], kde=True, stat='density',
             kde_kws=dict(cut=3), color='b')

In [None]:
# Distribution of playResult in 15 bins. sns.distplot is deprecated; histplot
# draws the equivalent density histogram with a KDE curve.
plt.figure(figsize=(15,7))
sns.histplot(df_plays['playResult'], bins=15, kde=True, stat='density',
             kde_kws=dict(cut=3), color='y')

# 6. Tracking 2018 Data

In [None]:
# Load the 2018 tracking table (tracking2018.csv) and display it.
df_track_1= pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2018.csv')
df_track_1

In [None]:
# (rows, columns) of the 2018 tracking table.
df_track_1.shape

In [None]:
# Column dtypes and memory usage of the 2018 tracking table.
df_track_1.info()

In [None]:
# Missing values per column in the 2018 tracking table.
df_track_1.isna().sum()

In [None]:
# Number of tracking rows recorded for each distinct timestamp.
df_track_1['time'].value_counts()

In [None]:
# A count plot over the raw `time` column draws one bar per distinct
# timestamp, which is unreadable and very expensive when the column holds
# many distinct values (expected for frame-level tracking data - confirm with
# the value_counts() output). Aggregate first and plot only the most
# frequent timestamps.
TOP_N_TIMESTAMPS = 20
busiest_times = df_track_1['time'].value_counts().head(TOP_N_TIMESTAMPS)
# sort_values() puts the largest bar at the top of the horizontal chart.
ax = busiest_times.sort_values().plot.barh(figsize=(15, 7))
ax.set(xlabel='count', ylabel='time')