<h1><center>NFL Big Data Bowl 2021. EDA.</center></h1>

<center><img src="https://operations.nfl.com/media/3607/big-data-bowl_baked.jpg?anchor=center&mode=crop&width=1417&height=740&rnd=131956651790000000&quality=85"></center>

In [None]:
import numpy as np
import pandas as pd
        
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

<a id="1"></a>
<h2 style='background:transparent; border:0; color:black'><center>1. Game Data</center></h2>

### **Game data:** The games.csv contains the teams playing in each game. The key variable is gameId.

* **gameId:** Game identifier, unique (numeric)

* **gameDate:** Game Date (time, mm/dd/yyyy)

* **gameTimeEastern:** Start time of game (time, HH:MM:SS, EST)

* **homeTeamAbbr:** Home team three-letter code (text)

* **visitorTeamAbbr:** Visiting team three-letter code (text)

* **week:** Week of game (numeric)

In [None]:
# Load the games table and display it as the cell output.
games = pd.read_csv(
    filepath_or_buffer='../input/nfl-big-data-bowl-2022/games.csv'
)
games

In [None]:
# Count games per date; rows are ordered by ascending game count.
check = (
    games['gameDate']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='games')
    .sort_values('games')
)

# Horizontal bar chart: one bar per date.
fig = px.bar(
    data_frame=check,
    x="games",
    y='date',
    orientation='h',
    title='Number of games for every date',
    width=800,
    height=900,
)
fig.show()

In [None]:
# Count games per kickoff time; rows are ordered by ascending game count.
check = (
    games['gameTimeEastern']
    .value_counts()
    .rename_axis('time')
    .reset_index(name='games')
    .sort_values('games')
)

# Horizontal bar chart: one bar per start time.
fig = px.bar(
    data_frame=check,
    x="games",
    y='time',
    orientation='h',
    title='Number of games for every time',
    width=800,
    height=400,
)
fig.show()

In [None]:
# Count games per home team; rows are ordered by ascending game count.
check = (
    games['homeTeamAbbr']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='games')
    .sort_values('games')
)

# Horizontal bar chart: one bar per home team.
fig = px.bar(
    data_frame=check,
    x="games",
    y='team',
    orientation='h',
    title='Number of games for every team (home)',
    width=800,
    height=700,
)
fig.show()

In [None]:
# Count games per visiting team; rows are ordered by ascending game count.
check = (
    games['visitorTeamAbbr']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='games')
    .sort_values('games')
)

# Horizontal bar chart: one bar per visiting team.
fig = px.bar(
    data_frame=check,
    x="games",
    y='team',
    orientation='h',
    title='Number of games for every team (away)',
    width=800,
    height=700,
)
fig.show()

In [None]:
# Count games per week; rows are ordered by ascending game count.
check = (
    games['week']
    .value_counts()
    .rename_axis('week')
    .reset_index(name='games')
    .sort_values('games')
)

fig = px.bar(
    check,
    y='week',
    x="games",
    orientation='h',
    title='Number of games for every week',
    height=500,
    width=800
)

# Week values are numeric, so plotly would otherwise place the bars on a
# continuous axis. Declaring the axis categorical keeps one evenly spaced bar
# per week in the sorted row order, without appending a '-' to every label.
fig.update_yaxes(type='category')

fig.show()

<a id="2"></a>
<h2 style='background:transparent; border:0; color:black'><center>2. Player Data</center></h2>

### **Player data:** The players.csv file contains player-level information for players who participated in any of the tracking data files. The key variable is nflId.

* **nflId:** Player identification number, unique across players (numeric)

* **height:** Player height (text)

* **weight:** Player weight (numeric)

* **birthDate:** Date of birth (YYYY-MM-DD)

* **collegeName:** Player college (text)

* **position:** Player position (text)

* **displayName:** Player name (text)

In [None]:
# Load the players table and display it as the cell output.
players = pd.read_csv(
    filepath_or_buffer='../input/nfl-big-data-bowl-2022/players.csv'
)
players

Convert all heights to feet.

In [None]:
# Split each height string on the first '-'. Values containing a '-' are read
# as "<first>-<second>" and converted to first * 12 + second; values without a
# '-' are kept as the single number they already are.
# reindex() guarantees both columns exist even when no value contains a '-',
# which would otherwise make the column rename below fail.
check = (
    players['height']
    .str.split('-', n=1, expand=True)
    .reindex(columns=[0, 1])
)

check.columns = [
    'first',
    'second'
]

# Work on numeric copies instead of writing integers back into a string
# column; invalid numbers still raise, as the original integer cast did.
first_number = pd.to_numeric(check['first'])
second_number = pd.to_numeric(check['second'])
has_second_part = check['second'].notna()

# 'first' ends up holding one numeric value per player on a common scale.
check['first'] = first_number.where(
    ~has_second_part,
    first_number * 12 + second_number
)

In [None]:
# Replace the raw height column with check['first'] cast to float32 and
# divided by 12, then display the updated table.
players['height'] = check['first'].astype(np.float32) / 12

players

In [None]:
# Side-by-side histograms of player height and weight.
fig = make_subplots(rows=1, cols=2)

# (column name, number of bins) for each subplot, left to right.
histogram_specs = [('height', 20), ('weight', 50)]

for subplot_col, (column, bins) in enumerate(histogram_specs, start=1):
    # add_trace(row=, col=) is the supported replacement for the deprecated
    # append_trace; both histograms live in the single row of the grid.
    fig.add_trace(
        go.Histogram(
            x=players[column],
            nbinsx=bins,
            name=column
        ),
        row=1,
        col=subplot_col
    )

fig.update_layout(
    title_text='Height & weight distributions',
    height=400,
    width=800
)

fig.show()

In [None]:
# Count players per college, order by ascending count and keep the last 50
# rows (the colleges with the most players).
check = (
    players['collegeName']
    .value_counts()
    .rename_axis('college')
    .reset_index(name='players')
    .sort_values('players')
    .tail(50)
)

# Horizontal bar chart: one bar per college.
fig = px.bar(
    data_frame=check,
    x="players",
    y='college',
    orientation='h',
    title='Top 50 colleges by number of players',
    width=800,
    height=900,
)
fig.show()

### Player position abbreviations

* WR: Wide Receiver;
* CB: Cornerback;
* RB: Running Back;
* TE: Tight End;
* OLB: Outside Linebacker;
* QB: Quarterback;
* FS: Free Safety;
* LB: Linebacker;
* SS: Strong Safety;
* ILB: Inside Linebacker;
* DE: Defensive End;
* DB: Defensive Back;
* MLB: Middle Linebacker;
* DT: Defensive Tackle;
* FB: Fullback;
* P: Punter;
* LS: Long Snapper;
* S: Safety; 
* K: Kicker;
* HB: Halfback;
* NT: Nose Tackle

<a id="3"></a>
<h2 style='background:transparent; border:0; color:black'><center>3. Play Data</center></h2>

### **Play data:** The plays.csv file contains play-level information from each game. The key variables are gameId and playId.

* **gameId:** Game identifier, unique (numeric)

* **playId:** Play identifier, not unique across games (numeric)

* **playDescription:** Description of play (text)

* **quarter:** Game quarter (numeric)

* **down:** Down (numeric)

* **yardsToGo:** Distance needed for a first down (numeric)

* **possessionTeam:** Team on offense (text)

* **playType:** Outcome of dropback: sack or pass (text)

* **yardlineSide:** 3-letter team code corresponding to line-of-scrimmage (text)

* **yardlineNumber:** Yard line at line-of-scrimmage (numeric)

* **offenseFormation:** Formation used by possession team (text)

* **personnelO:** Personnel used by offensive team (text)

* **defendersInTheBox:** Number of defenders in close proximity to line-of-scrimmage (numeric)

* **numberOfPassRushers:** Number of pass rushers (numeric)

* **personnelD:** Personnel used by defensive team (text)

* **typeDropback:** Dropback categorization of quarterback (text)

* **preSnapHomeScore:** Home score prior to the play (numeric)

* **preSnapVisitorScore:** Visiting team score prior to the play (numeric)

* **gameClock:** Time on clock of play (MM:SS)

* **absoluteYardlineNumber:** Distance from end zone for possession team (numeric)

* **penaltyCodes:** NFL categorization of the penalties that occurred on the play. For purposes of this contest, the most important penalties are Defensive Pass Interference (DPI), Offensive Pass Interference (OPI), Illegal Contact (ICT), and Defensive Holding (DH). Multiple penalties on a play are separated by a ; (text)

* **penaltyJerseyNumber:** Jersey number and team code of the player committing each penalty. Multiple penalties on a play are separated by a ; (text)

* **passResult:** Outcome of the passing play (C: Complete pass, I: Incomplete pass, S: Quarterback sack, IN: Intercepted pass, text)

* **offensePlayResult:** Yards gained by the offense, excluding penalty yardage (numeric)

* **playResult:** Net yards gained by the offense, including penalty yardage (numeric)

* **epa:** Expected points added on the play, relative to the offensive team. Expected points is a metric that estimates the average of every next scoring outcome given the play's down, distance, yardline, and time remaining (numeric)

* **isDefensivePI:** An indicator variable for whether or not a DPI penalty occurred on a given play (TRUE/FALSE)

In [None]:
# Load the plays table and display it as the cell output.
plays = pd.read_csv(
    filepath_or_buffer='../input/nfl-big-data-bowl-2022/plays.csv'
)
plays

In [None]:
# Count plays per quarter; rows are ordered by ascending play count.
check = (
    plays['quarter']
    .value_counts()
    .rename_axis('quarter')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Pie chart: one slice per quarter, sized by number of plays.
fig = px.pie(
    data_frame=check,
    values="plays",
    names='quarter',
    title='Number of plays of every quarter',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Count plays per down; rows are ordered by ascending play count.
check = (
    plays['down']
    .value_counts()
    .rename_axis('down')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Pie chart: one slice per down, sized by number of plays.
fig = px.pie(
    data_frame=check,
    values="plays",
    names='down',
    title='Number of plays of every down',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Count plays per yards-to-go value; rows are ordered by ascending play count.
check = (
    plays['yardsToGo']
    .value_counts()
    .rename_axis('yardsToGo')
    .reset_index(name='plays')
    .sort_values('plays')
)

fig = px.bar(
    check,
    y='yardsToGo',
    x="plays",
    orientation='h',
    title='Number of plays for every yards to go category',
    height=800,
    width=800
)

# yardsToGo is numeric, so plotly would otherwise place the bars on a
# continuous axis. Declaring the axis categorical keeps one evenly spaced bar
# per value in the sorted row order, without appending a '-' to every label.
fig.update_yaxes(type='category')

fig.show()

In [None]:
# Count plays per possession team; rows are ordered by ascending play count.
check = (
    plays['possessionTeam']
    .value_counts()
    .rename_axis('team')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Horizontal bar chart: one bar per team.
fig = px.bar(
    data_frame=check,
    x="plays",
    y='team',
    orientation='h',
    title='Number of plays for every team',
    width=800,
    height=800,
)
fig.show()

In [None]:
# Count plays per yardlineSide value; rows are ordered by ascending play count.
check = (
    plays['yardlineSide']
    .value_counts()
    .rename_axis('yardlineSide')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Horizontal bar chart: one bar per yardline side.
fig = px.bar(
    data_frame=check,
    x="plays",
    y='yardlineSide',
    orientation='h',
    title='Number of plays for every team yard side',
    width=800,
    height=800,
)
fig.show()

In [None]:
# Count plays per yardlineNumber value; rows are ordered by ascending play count.
check = (
    plays['yardlineNumber']
    .value_counts()
    .rename_axis('yardline')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Vertical bar chart: yardline on the x axis, number of plays on the y axis.
fig = px.bar(
    data_frame=check,
    x='yardline',
    y="plays",
    title='Number of plays for every yardline',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Histogram of the absoluteYardlineNumber column.
fig = px.histogram(
    data_frame=plays,
    x='absoluteYardlineNumber',
    nbins=50,
    title='Absolute Yardline Number distribution',
    height=500,
    width=800,
)
fig.show()

In [None]:
# Count plays per pass result; rows are ordered by ascending play count.
check = (
    plays['passResult']
    .value_counts()
    .rename_axis('passResult')
    .reset_index(name='plays')
    .sort_values('plays')
)

# Pie chart: one slice per pass result, sized by number of plays.
fig = px.pie(
    data_frame=check,
    values="plays",
    names='passResult',
    title='Number of plays for every pass result',
    width=800,
    height=500,
)
fig.show()

In [None]:
# Histogram of the playResult column.
fig = px.histogram(
    data_frame=plays,
    x="playResult",
    nbins=50,
    title='Play result distribution',
    height=500,
    width=800,
)
fig.show()

In [None]:
# Histogram of the preSnapVisitorScore column.
fig = px.histogram(
    data_frame=plays,
    x="preSnapVisitorScore",
    nbins=50,
    title='Pre Snap Visitor Score distribution',
    height=500,
    width=800,
)
fig.show()

In [None]:
# Histogram of the preSnapHomeScore column.
fig = px.histogram(
    data_frame=plays,
    x="preSnapHomeScore",
    nbins=50,
    title='Pre Snap Home Score distribution',
    height=500,
    width=800,
)
fig.show()

<a id="4"></a>
<h2 style='background:transparent; border:0; color:black'><center>4. Football field visualization</center></h2>

For the play analysis I will use the football field visualization shared in <a href="https://www.kaggle.com/robikscube/nfl-big-data-bowl-plotting-player-position">this perfect kernel by @robikscube</a>.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def create_football_field(
    linenumbers=True,
    endzones=True,
    highlight_line=False,
    highlight_line_number=50,
    highlighted_name='Line of Scrimmage',
    fifty_is_los=False,
    figsize=(12, 6.33)
):
    """Draw a football field on a new matplotlib figure and return it.

    The field is drawn in a 120 x 53.3 coordinate system (presumably yards):
    x runs from 0 to 120 and y from 0 to 53.3. All drawing goes through the
    returned axes rather than pyplot's implicit "current axes".

    Args:
        linenumbers: If true, write the yard numbers (10 ... 50 ... 10) along
            both sidelines.
        endzones: If true, shade a 10-unit-wide rectangle at each end of the
            field and restrict the hash marks to x = 11..109; otherwise the
            hash marks cover x = 1..119.
        highlight_line: If true, draw a yellow vertical line with a label.
        highlight_line_number: Position of the highlighted line; 10 is added
            to it before plotting (the left end-zone rectangle is 10 wide).
        highlighted_name: Label text shown next to the highlighted line.
        fifty_is_los: If true, draw a gold line at x = 60 labelled
            '<- Player Yardline at Snap'.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """
    fig, ax = plt.subplots(1, figsize=figsize)

    # Green playing surface, including both end-zone areas.
    turf = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0)
    ax.add_patch(turf)

    # Field outline and the vertical lines every 10 units, drawn as a single
    # polyline that zig-zags between the two sidelines.
    ax.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
             80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
            [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
             53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
            color='white')

    if fifty_is_los:
        ax.plot([60, 60], [0, 53.3], color='gold')
        ax.text(62, 50, '<- Player Yardline at Snap', color='gold')

    if endzones:
        # Rectangle takes (xy, width, height). Both end zones are 10 wide;
        # the right one starts at x = 110 and must not extend past x = 120.
        for left_edge in (0, 110):
            ax.add_patch(patches.Rectangle((left_edge, 0), 10, 53.3,
                                           linewidth=0.1,
                                           edgecolor='r',
                                           facecolor='blue',
                                           alpha=0.2,
                                           zorder=0))

    ax.set_xlim(0, 120)
    ax.set_ylim(-5, 58.3)
    ax.axis('off')

    if linenumbers:
        for x in range(20, 110, 10):
            # Mirror the numbering around x = 60 and subtract the 10-unit
            # end-zone offset, giving 10, 20, ..., 50, ..., 20, 10.
            label = str((x if x <= 50 else 120 - x) - 10)
            ax.text(x, 5, label,
                    horizontalalignment='center',
                    fontsize=20,
                    color='white')
            # Same number on the opposite sideline, rotated upside down.
            ax.text(x - 0.95, 53.3 - 5, label,
                    horizontalalignment='center',
                    fontsize=20,
                    color='white', rotation=180)

    # Short tick marks at every unit: one pair along the sidelines and one
    # pair in the interior of the field.
    hash_range = range(11, 110) if endzones else range(1, 120)
    hash_spans = ([0.4, 0.7], [53.0, 52.5], [22.91, 23.57], [29.73, 30.39])
    for x in hash_range:
        for y_span in hash_spans:
            ax.plot([x, x], y_span, color='white')

    if highlight_line:
        hl = highlight_line_number + 10
        ax.plot([hl, hl], [0, 53.3], color='yellow')
        ax.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                color='yellow')

    return fig, ax

# Draw the field with all default options and display it.
field_fig, field_ax = create_football_field()
plt.show()