## Analyzing the NFL Games 

Game data: The games.csv file contains the teams playing in each game. The key variable is gameId.

- gameId: Game identifier, unique (numeric)
- gameDate: Game Date (time, mm/dd/yyyy)
- gameTimeEastern: Start time of game (time, HH:MM:SS, EST)
- homeTeamAbbr: Home team three-letter code (text)
- visitorTeamAbbr: Visiting team three-letter code (text)
- week: Week of game (numeric)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

In [None]:
# Load the games table from the Kaggle competition input directory
games_df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/games.csv')

In [None]:
# Preview the first rows of the games table
games_df.head()

In [None]:
# (number of rows, number of columns) of the games table
games_df.shape

In [None]:
# Count the missing values in each column
games_df.isnull().sum()

In [None]:
# Inspect the dtype pandas assigned to each column
games_df.dtypes

In [None]:
# The formats are given explicitly (mm/dd/yyyy and HH:MM:SS, as listed in the
# data description) so pandas does not have to guess the layout of the text.

# Converting to datetime.date values
games_df['gameDate'] = pd.to_datetime(games_df['gameDate'], format='%m/%d/%Y').dt.date

# Converting to datetime.time values
games_df['gameTimeEastern'] = pd.to_datetime(games_df['gameTimeEastern'], format='%H:%M:%S').dt.time

In [None]:
# Preview the rows again after the date/time conversion
games_df.head()

In [None]:
# Number of games on each distinct game date, shown as a DataFrame
games_df['gameDate'].value_counts().reset_index()

There are a total of 50 different game dates.

In [None]:
# Count how many games fall on each game date
game_date_counts = games_df['gameDate'].value_counts()

# Turn the counts into a two-column table and give the columns readable names
date_dist = game_date_counts.reset_index()
date_dist.columns = ['date', 'frequency']

In [None]:
# Preview the date/frequency table
date_dist.head()

In [None]:
# Index the table by date and order it chronologically
sorted_date_dist = date_dist.set_index('date').sort_index()

# Show the five earliest dates
sorted_date_dist.head()

In [None]:
def find_dist(df, col_name, figsize=(20, 4)):
    """Plot how often each distinct value of a column occurs.

    Counts the occurrences of every value in ``df[col_name]``, orders the
    counts by the column value (ascending) and draws them as a bar plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the column to summarise.
    col_name : str
        Name of the column whose value frequencies are plotted.
    figsize : tuple, optional
        Figure size handed to the plot call; defaults to (20, 4).

    Returns
    -------
    bool
        Always True once the plot call has completed.
    """
    # value_counts() is indexed by the distinct values, so sorting the index
    # orders the bars by value rather than by frequency
    sorted_counts = df[col_name].value_counts().sort_index()

    # Name the index after the column and the counts 'frequency' so the
    # x-axis label and the legend are the same on every pandas version
    sorted_dist = sorted_counts.rename_axis(col_name).to_frame('frequency')

    # Plotting a bar plot
    sorted_dist.plot(kind='bar', figsize=figsize)

    # Return a boolean indicating the function was successfully executed
    return True

# Bar plot of the number of games played on each date
find_dist(df=games_df, col_name='gameDate')

In [None]:
# Preview the games table before looking at the next column
games_df.head()

In [None]:
# Bar plot of the number of games per kick-off time
find_dist(df=games_df, col_name='gameTimeEastern')

In [None]:
# Bar plot of the number of games per week
find_dist(df=games_df, col_name='week')

In [None]:
# Preview the games table before adding the day-of-week column
games_df.head()

In [None]:
# Deriving the weekday name (strftime '%A') of every game date and storing it in a new column
games_df['gameDay'] = [game_date.strftime('%A') for game_date in games_df['gameDate']]

In [None]:
# Preview the table with the new gameDay column
games_df.head()

In [None]:
# Count plot of games per day of the week, with the count written above each bar
plt.figure(figsize=(16,6))
graph = sns.countplot(data=games_df, x='gameDay', palette='plasma')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.3, bar_height, ha="center")

plt.show()

## Analyzing the Players

Player data: The players.csv file contains player-level information from players that participated in any of the tracking data files. 
The key variable is nflId.

- nflId: Player identification number, unique across players (numeric)
- height: Player height (text)
- weight: Player weight (numeric)
- birthDate: Date of birth (YYYY-MM-DD)
- collegeName: Player college (text)
- position: Player position (text)
- displayName: Player name (text)

In [None]:
# Load the players table from the Kaggle competition input directory
players_df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/players.csv')

In [None]:
# Preview the first rows of the players table
players_df.head()

In [None]:
# (number of rows, number of columns) of the players table
players_df.shape

In [None]:
# Count the missing values in each column
players_df.isnull().sum()

In [None]:
# Parse the birth dates once and derive both columns from the parsed values
parsed_birth_dates = pd.to_datetime(players_df['birthDate'])

# Storing plain datetime.date values
players_df['birthDate'] = parsed_birth_dates.dt.date

# Storing the year of birth
players_df['birthYear'] = parsed_birth_dates.dt.year

In [None]:
# Preview the table with the converted birthDate and the new birthYear column
players_df.head()

In [None]:
# Age of each player relative to the year 2018: reference year minus birth year
REFERENCE_YEAR = 2018
players_df['age'] = REFERENCE_YEAR - players_df['birthYear']

players_df.head()

In [None]:
def find_dist(df, col_name, figsize=(20, 4)):
    """Plot how often each distinct value of a column occurs.

    Counts the occurrences of every value in ``df[col_name]``, orders the
    counts by the column value (ascending) and draws them as a bar plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the column to summarise.
    col_name : str
        Name of the column whose value frequencies are plotted.
    figsize : tuple, optional
        Figure size handed to the plot call; defaults to (20, 4).

    Returns
    -------
    bool
        Always True once the plot call has completed.
    """
    # value_counts() is indexed by the distinct values, so sorting the index
    # orders the bars by value rather than by frequency
    sorted_counts = df[col_name].value_counts().sort_index()

    # Name the index after the column and the counts 'frequency' so the
    # x-axis label and the legend are the same on every pandas version
    sorted_dist = sorted_counts.rename_axis(col_name).to_frame('frequency')

    # Plotting a bar plot
    sorted_dist.plot(kind='bar', figsize=figsize)

    # Return a boolean indicating the function was successfully executed
    return True

In [None]:
# Bar plot of the number of players per age, drawn with the helper function
find_dist(df=players_df, col_name='age')

In [None]:
# Seaborn count plot of players per age, with the count written above each bar
plt.figure(figsize=(16,6))
graph = sns.countplot(data=players_df, x='age', palette='plasma')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.5, bar_height, ha="center")

plt.show()

In [None]:
# Seaborn count plot of players per birth year, with the count written above each bar
plt.figure(figsize=(16,6))
graph = sns.countplot(data=players_df, x='birthYear', palette='viridis')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.5, bar_height, ha="center")

plt.show()

In [None]:
# Preview the players table before looking at the colleges
players_df.head()

In [None]:
# Count plot of players per college, limited to the 15 colleges with the most players
top_colleges = players_df.collegeName.value_counts().iloc[:15].index

plt.figure(figsize=(16,6))
graph = sns.countplot(data=players_df, x='collegeName', order=top_colleges, palette='coolwarm')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.3, bar_height, ha="center")

plt.show()

In [None]:
# Bar plot of the number of players per position, drawn with the helper function
find_dist(df=players_df, col_name='position')

In [None]:
# Seaborn count plot of players per position, with the count written above each bar
plt.figure(figsize=(16,6))
graph = sns.countplot(data=players_df, x='position')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.3, bar_height, ha="center")

plt.show()

Next, we look at the age distribution of players in the CB (Cornerback) and WR (Wide Receiver) positions. 
For each position, we select the matching players and then plot their age distribution.

In [None]:
# Selecting position = CB
# (the result is only displayed; players_df itself is not reassigned)
players_df.query('position == "CB"')

In [None]:
# Bar plot of the age distribution of players in the CB position
cornerbacks = players_df.query('position == "CB"')
find_dist(df=cornerbacks, col_name='age')

In [None]:
# Bar plot of the age distribution of players in the WR position
wide_receivers = players_df.query('position == "WR"')
find_dist(df=wide_receivers, col_name='age')

In [None]:
# Preview the first ten rows before the height column is converted
players_df.head(10)

In [None]:
# Fixing the inconsistency by converting all data to inches
def _height_to_inches(raw_height):
    """Return a height as a whole number of inches.

    Accepts either a "feet-inches" value such as "6-2" or a plain number of
    inches such as "74". The inches part is read in full, so "6-10" becomes
    82; reading a single character would silently drop the second digit.
    Values that are already integers pass through unchanged, which keeps
    the cell safe to re-run.
    """
    text = str(raw_height).strip()
    if '-' in text:
        feet, inches = text.split('-', 1)
        return int(feet) * 12 + int(inches)
    return int(text)


players_df['height'] = players_df['height'].apply(_height_to_inches)

In [None]:
# Preview the first ten rows after the height conversion
players_df.head(10)

In [None]:
# Summary statistics of the players table
players_df.describe()

In [None]:
# Extracting the height values as an array (displayed only, not stored)
players_df['height'].values

In [None]:
# Pulling the height and weight columns out as arrays for plotting
height, weight = players_df['height'].values, players_df['weight'].values

In [None]:
# Plotting a joint plot of weight (x-axis) against height (y-axis).
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.jointplot(x=weight, y=height)

## Analyzing the NFL Plays 

Play data: The plays.csv file contains play-level information from each game. The key variables are gameId and playId

- gameId: Game identifier, unique (numeric)

- playId: Play identifier, not unique across games (numeric)

- playDescription: Description of play (text)

- quarter: Game quarter (numeric)

- down: Down (numeric)

- yardsToGo: Distance needed for a first down (numeric)

- possessionTeam: Team on offense (text)

- playType: Outcome of dropback: sack or pass (text)

- yardlineSide: 3-letter team code corresponding to line-of-scrimmage (text)

- yardlineNumber: Yard line at line-of-scrimmage (numeric)

- offenseFormation: Formation used by possession team (text)

- personnelO: Personnel used by offensive team (text)

- defendersInTheBox: Number of defenders in close proximity to line-of-scrimmage (numeric)

- numberOfPassRushers: Number of pass rushers (numeric)

- personnelD: Personnel used by defensive team (text)

- typeDropback: Dropback categorization of quarterback (text)

- preSnapHomeScore: Home score prior to the play (numeric)

- preSnapVisitorScore: Visiting team score prior to the play (numeric)

- gameClock: Time on clock of play (MM:SS)

- absoluteYardlineNumber: Distance from end zone for possession team (numeric)

- penaltyCodes: NFL categorization of the penalties that occurred on the play. For purposes of this contest, the most important penalties are Defensive Pass Interference (DPI), Offensive Pass Interference (OPI), Illegal Contact (ICT), and Defensive Holding (DH). Multiple penalties on a play are separated by a ; (text)

- penaltyJerseyNumber: Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ; (text)

- passResult: Outcome of the passing play (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, text)

- offensePlayResult: Yards gained by the offense, excluding penalty yardage (numeric)

- playResult: Net yards gained by the offense, including penalty yardage (numeric)

- epa: Expected points added on the play, relative to the offensive team. Expected points is a metric that estimates the average of every next scoring outcome given the play's down, distance, yardline, and time remaining (numeric)

- isDefensivePI: An indicator variable for whether or not a DPI penalty occurred on a given play (TRUE/FALSE)

In [None]:
# Load the plays table from the Kaggle competition input directory
plays_df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/plays.csv')

In [None]:
# Preview the first rows of the plays table
plays_df.head()

In [None]:
# (number of rows, number of columns) of the plays table
plays_df.shape

In [None]:
# Number of plays per quarter (left) and per down (right)
f, ax = plt.subplots(1, 2, figsize=(13, 5))

# The column is named through x= / data=: recent seaborn releases read a
# lone positional argument as `data`, not as the variable to count
sns.countplot(x="quarter", data=plays_df, ax=ax[0])
ax[0].set_title("Quarter distribution")

sns.countplot(x="down", data=plays_df, ax=ax[1])
ax[1].set_title("Down distribution")

plt.show()

In [None]:
# Number of plays per yards-to-go value (left) and per possession team (right)
f, ax = plt.subplots(1, 2, figsize=(16, 5))

# The column is named through x= / data=: recent seaborn releases read a
# lone positional argument as `data`, not as the variable to count
g1 = sns.countplot(x="yardsToGo", data=plays_df, ax=ax[0])
# Rotate the tick labels so the many categories stay readable
g1.set_xticklabels(g1.get_xticklabels(), rotation=90)

g2 = sns.countplot(x="possessionTeam", data=plays_df, ax=ax[1])
g2.set_xticklabels(g2.get_xticklabels(), rotation=90)

plt.show()

In [None]:
# Number of plays per offense formation (left) and per defenders-in-the-box count (right)
f, ax = plt.subplots(1, 2, figsize=(15, 4))

# The column is named through x= / data=: recent seaborn releases read a
# lone positional argument as `data`, not as the variable to count
g1 = sns.countplot(x="offenseFormation", data=plays_df, ax=ax[0])
g1.set_xticklabels(g1.get_xticklabels(), rotation=90)

g2 = sns.countplot(x="defendersInTheBox", data=plays_df, ax=ax[1])
g2.set_xticklabels(g2.get_xticklabels(), rotation=90)

# Write the count above every bar of both plots
for axis in (g1, g2):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2., bar_height + 0.3, bar_height, ha="center")

plt.show()

In [None]:
# Seaborn count plot of plays per pass result, with the count written above each bar
plt.figure(figsize=(10,5))
graph = sns.countplot(data=plays_df, x='passResult')

for bar in graph.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits slightly above its top
    bar_center = bar.get_x()+bar.get_width()/2.
    graph.text(bar_center, bar_height + 0.3, bar_height, ha="center")

plt.show()