In [None]:
import pandas as pd
import numpy as np
import calendar
from datetime import datetime, timedelta
from collections import Counter
import base64
from datetime import datetime
from datetime import timedelta

import seaborn as sns
#pal_hls = sns.hls_palette(100, l=.3, s=.8).as_hex() # Seaborn Color Palette for colors
import matplotlib as mpl
import matplotlib.pyplot as plt
#sns.set_style('darkgrid')

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.graph_objects as go
init_notebook_mode()

# Ignore Warnings
import warnings
#warnings.filterwarnings("ignore")

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

############################################################
### https://www.kaggle.com/c/nfl-big-data-bowl-2022/data ###
############################################################

# Welcome to my Kaggle notebook


If you like what you see, please give it an upvote; if you don't, feel free to leave any constructive criticism you may have about it =)

The graphs are made with Plotly, which means they are interactive: hover over a graph to get more details.

## Data inspection

We are given quite a bit of data to play with and we can join the files in different ways:

* The "game", "PFF Scouting data" and "play" CSV files can be joined on gameId ("play" and "PFF Scouting data" additionally share playId)
* "play" and "player" on kickerId = nflId
* "player" and "tracking" on nflId

There are probably other ways too; we should be able to find lots of insights with these different combinations.

# 1. Games

Let's start by looking at the games.csv file and checking for missing values, etc.

In [None]:
# Load the schedule and print a quick health check (missing values per column, then the shape).
games_df = pd.read_csv("../input/nfl-big-data-bowl-2022/games.csv")

print(f"NaN check:\n\n{games_df.isna().sum()} \n\nData shape :\n{games_df.shape}")

# Friendlier labels for the date / time / team columns, then title-case every
# column name (so "gameId" ends up as "Gameid").
games_df = games_df.rename(
    columns={
        'gameDate' : "Date",
        'gameTimeEastern' : "Time",
        'homeTeamAbbr' : "Home Team",
        'visitorTeamAbbr' : "Visitor Team",
    }
)
games_df.columns = [_label.title() for _label in games_df.columns]

# Real datetimes for the date column, plus the weekday name of every game.
games_df["Date"] = pd.to_datetime(games_df["Date"])
games_df["Day"] = [calendar.day_name[_game_date.weekday()] for _game_date in games_df["Date"]]

In [None]:
# Number of games per team on each side of the schedule, sorted by team abbreviation.
visitors = sorted(games_df["Visitor Team"].value_counts().items())
home = sorted(games_df["Home Team"].value_counts().items())

# Compare the two sides team by team, keyed on the abbreviation: a team that is
# missing from one side counts as 0 there instead of shifting every following
# pair, which is what pairing the two sorted lists by position would do.
_visitor_counts = dict(visitors)
_home_counts = dict(home)

for _team in sorted(set(_visitor_counts) | set(_home_counts)):
    _gap = _visitor_counts.get(_team, 0) - _home_counts.get(_team, 0)
    if _gap == 0:
        continue
    _noun = "game" if abs(_gap) == 1 else "games"
    _side = "as a visitor" if _gap > 0 else "at home"
    print(f"{_team} played {abs(_gap)} more {_noun} {_side}")


Not all teams have played an equal number of games at home and as visitors.

Let's look at how the games are distributed over the week: on what day and at what time is the NFL usually on?

In [None]:
# Stacked histogram of games per weekday, one colour per kick-off time.
# The last three characters of the time are dropped before it is used as the colour key.
fig = px.histogram(data_frame=games_df,
        x="Day",
        color=games_df["Time"].str[:-3],
        labels={"color" : "Time :"}    # to change the legend title
        )

# The x axis carries the weekday, so the hover box is labelled "Day".
fig.update_traces(hovertemplate='<br>Total : <b>%{y:.0f}<br></b>' +
                                '<br>Day : <b>%{x}<br></b>',
                  )

fig.update_yaxes(title="Count")
fig.update_xaxes(categoryorder='total descending')

fig.update_layout(
    # The subtitle tag is now closed with </sup>; it used to be opened twice.
    title=f"<b> Game day distribution and their start time </b><br><sup>{games_df['Gameid'].nunique()} games in total</sup>",
    title_x=0.5,
    showlegend=True,
    paper_bgcolor="darkgray",
    barmode="stack"
    )

fig.show()

Over 50% of all games (401, to be precise) took place on Sundays at 1 PM and 119 on Sundays at 4:25 PM; together, these two slots already account for 68% of all games.

# 2. Players

I don't know anything about American football, and there are many more players on the field at a time than I thought. What are the different positions, and what kind of player occupies each of them?

To find out, I'll convert all imperial values to metric ones, since calculations will be easier that way. I'll also change the position names from abbreviations to their full names.

In [None]:
# Load the roster, then turn the birth dates into real datetimes.
players_df = pd.read_csv("../input/nfl-big-data-bowl-2022/players.csv")
players_df = players_df.assign(birthDate=pd.to_datetime(players_df["birthDate"]))

In [None]:
# Some of the heights are in inches only, some in foot / inch

def _height_to_cm(height) -> float:
    """Convert one raw height to centimetres.

    Accepts either "feet-inches" (two integers separated by a dash) or a plain
    number of inches. A missing value is returned as NaN instead of raising.
    """
    if pd.isna(height):
        return float("nan")
    _text = str(height).strip()
    if '-' in _text:
        _feet, _inches = _text.split('-', 1)
        return int(_feet) * 30.48 + int(_inches) * 2.54
    return int(_text) * 2.54


# Convert the whole column in one pass and store it as float directly, instead of
# writing floats one row at a time into a column that still holds strings.
players_df["height"] = players_df["height"].map(_height_to_cm).astype("float")

# Imperial to metric for the weight as well (factor 0.453592), rounded to 2 decimals.
players_df["weight"] = (players_df["weight"] * 0.453592).round(2)

position_dic = {"T" : "Tackle", "OT" : "Offensive tackle",
                "TE" : "Tight end", "G" : "Guard",
                "OG" : "Offensive guard", "DE" :"Defensive ends", 
                "C" : "Center", "QB" : "QuarterBack", 
                "DT" : "Defensive tackle", "NT" : "Nose tackle", 
                "LS" : "Long snapper", "OLB" : "Outside linebacker", 
                "P" : "Punter", "LB" : "Linebacker", 
                "ILB" : "Inside linebacker", "MLB" : "Middle linebacker", 
                "FB" : "Fullback", "WR" : "Wide receiver", 
                "DB" : "Defensive back", "SS" : "Strong safety",
                "FS" : "Free safety", "S" : "Safety", "K" : "Kicker", 
                "CB" : "Cornerback", "RB" : "Running back", "HB" : "Halfback"
                 }

# Abbreviation -> full name. A value that is not a key of the dictionary is kept
# as it is rather than raising a KeyError.
players_df["Position"] = players_df["Position"].map(position_dic).fillna(players_df["Position"])

In [None]:
# Some D.O.B are missing: they are NaT in "birthDate" and simply come out as NaN
# in "age", so no try / except is needed and genuine errors are no longer hidden.

# Age in whole years, using the same 365.25-day year as before.
_seconds_per_year = 365.25 * 24 * 60 * 60
_time_since_birth = pd.Timestamp.now() - players_df["birthDate"]
players_df["age"] = _time_since_birth.dt.total_seconds() // _seconds_per_year

In [None]:
# Mean of each measure per position, rounded to 2 decimals; one small table per measure.
_height_mean, _weight_mean, _age_mean = (
    players_df.pivot_table(values=_measure, index='Position', aggfunc='mean')
    .reset_index()
    .round(2)
    for _measure in ('height', 'weight', 'age')
)

# Put the three tables side by side, keeping only positions present in all of
# them, and order the result from the tallest position to the shortest.
h_w_df = (
    pd.concat(
        [_table.set_index('Position') for _table in (_height_mean, _weight_mean, _age_mean)],
        axis=1,
        join='inner',
    )
    .reset_index()
    .sort_values(by="height", ascending=False)
    .reset_index(drop=True)
)

# The mean age is shown as a whole number of years.
h_w_df["age"] = h_w_df["age"].map(lambda _mean_age : round(_mean_age))
h_w_df = h_w_df.rename(
    columns={"height" : "Height mean (cm)", "weight" : "Weight mean (kg)", "age" : "Age mean"}
)

In [None]:
# Grouped bars: mean height and mean weight per position. The mean age travels
# along as customdata so that it can be shown in the hover box of both bars and
# printed inside the height bars.
fig = go.Figure(data=[
    go.Bar(
        x=h_w_df["Position"], 
        y=h_w_df["Height mean (cm)"], 
        name='Height (cm)',
        customdata=h_w_df["Age mean"],
        hovertemplate="<br>".join([
                "Position : <b>%{x}</b>",
                "Height : <b>%{y}</b> cm",
                "Age : <b>%{customdata}</b>" 
            ]),
        texttemplate='%{customdata}',
        textposition="inside"
    ),
    go.Bar(
        x=h_w_df["Position"], 
        y=h_w_df["Weight mean (kg)"], 
        name="Weight (kg)", 
        customdata=h_w_df["Age mean"],
        hovertemplate="<br>".join([
                "Position : <b>%{x}</b>",
                "Weight : <b>%{y}</b> kg",
                "Age : <b>%{customdata}</b>" 
            ]),
    )
])

fig.update_layout(
    # The subtitle tag is now closed with </sup>; it used to be opened twice.
    title_text="<b>Players mean weight, height and age by field position</b><br><sup>sorted by height descending</sup>",
    title_x=0.5, 
    yaxis_title="Cm - Kg",
    barmode='group',
    showlegend=True, 
    paper_bgcolor="darkgray", 
    bargap=0.2,
    bargroupgap=0.1
)

# Show the chart together with the first five rows of the table behind it.
display(fig, h_w_df.head(5))

Whoa, some big boys out there! Here is some useful information for a guy like me who knows nothing about the NFL:

* I'd hate to be chased by a Tackle!
* Halfbacks are on average the lightest, shortest and youngest players of the team.
* Quarterbacks and Punters need to be both tall and light!
* Defensive and Nose Tackles, on the other hand, are tall and heavy (about the same weight as Tackles but 8 cm shorter).

Another thing that could be fun is looking at the first name frequency amongst the players

In [None]:
# First word of every display name; missing or blank names are skipped instead of crashing.
_first_names = players_df["displayName"].dropna().str.split().str[0].dropna().tolist()

_name_counter = Counter(_first_names) 

# Only names of 4 to 8 characters are drawn. Each one is passed with its number of
# occurrences so that the cloud reflects how common a first name is; joining the
# distinct names into a text gave every name the same count of one.
_name_frequencies = {
    _name : _count
    for _name, _count in _name_counter.most_common()
    if 4 <= len(_name) <= 8
}

icon_path = '../input/picture/clipart1905967.png'
icon = Image.open(icon_path)

# White canvas of the icon's size with the icon pasted on top, using the icon as
# its own paste mask (presumably relies on the PNG having transparency - confirm).
_mask = Image.new("RGB", icon.size, (255, 255, 255))
_mask.paste(icon,icon)
_mask = np.array(_mask)

wc = WordCloud(
        background_color="black", 
        max_words=300, 
        mask=_mask,
        max_font_size=150)

wc.generate_from_frequencies(_name_frequencies)

plt.figure(figsize=(25,18))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)

plt.show()

In [None]:
# The ten most frequent first names with their number of occurrences.
pd.DataFrame.from_records(_name_counter.most_common(10), columns=["Name", "Count"])

In [None]:
# Play-level tables. games.csv is read a second time into games_df2 so that the
# original column names are available (games_df had its columns renamed).
plays_df = pd.read_csv("../input/nfl-big-data-bowl-2022/plays.csv")
games_df2 = pd.read_csv("../input/nfl-big-data-bowl-2022/games.csv")
pffs_df = pd.read_csv("../input/nfl-big-data-bowl-2022/PFFScoutingData.csv")

# One tracking table per season, read in chronological order.
df_2018, df_2019, df_2020 = (
    pd.read_csv(_tracking_csv)
    for _tracking_csv in (
        "../input/nfl-big-data-bowl-2022/tracking2018.csv",
        "../input/nfl-big-data-bowl-2022/tracking2019.csv",
        "../input/nfl-big-data-bowl-2022/tracking2020.csv",
    )
)

# Game ids present in each season's tracking table, used to find which table holds a game.
df_2018_games, df_2019_games, df_2020_games = (
    _season_tracking["gameId"].unique()
    for _season_tracking in (df_2018, df_2019, df_2020)
)

In [None]:
def game_df_generator(game_id : int):
    """Gather everything the notebook has loaded about one game.

    Parameters
    ----------
    game_id : int
        Value of "gameId" to look up in the games, plays, PFF scouting and
        tracking tables.

    Returns
    -------
    tuple
        ``(_game_info, game_moves, _missing_players, _players_in_game)`` where

        * ``_game_info`` holds the row(s) of ``games_df2`` for the game,
        * ``game_moves`` is plays merged with the PFF data and then with the
          tracking rows on ("gameId", "playId"), with "time" parsed as datetime,
        * ``_missing_players`` is a sentence giving how many merged rows have no
          "nflId" (empty string when none is missing),
        * ``_players_in_game`` holds the ``players_df`` rows of every player
          tracked in the game, in order of first appearance in ``game_moves``.

    Raises
    ------
    ValueError
        If the game is not present in any of the three tracking tables.
    """
    _game_info = games_df2[games_df2["gameId"] == game_id].reset_index(drop=True)
    _plays = plays_df[plays_df["gameId"] == game_id].reset_index(drop=True)
    _pffs = pffs_df[pffs_df["gameId"] == game_id].reset_index(drop=True)

    # Find the season whose tracking table contains the game.
    for _season_games, _season_tracking in (
        (df_2018_games, df_2018),
        (df_2019_games, df_2019),
        (df_2020_games, df_2020),
    ):
        if game_id in _season_games:
            _tracking = _season_tracking[_season_tracking["gameId"] == game_id].reset_index(drop=True)
            break
    else:
        # Without this, the merge below failed on an unbound local variable.
        raise ValueError(f"gameId {game_id} was not found in any tracking table")

    game_moves = pd.merge(_plays, _pffs, on=["gameId", "playId"])
    game_moves = pd.merge(game_moves, _tracking, on=["gameId", "playId"])
    game_moves["time"] = pd.to_datetime(game_moves["time"])

    _known_ids = game_moves["nflId"].dropna()
    unique_players_per_game = _known_ids.astype("int").unique()

    # A single inner merge replaces growing a frame with concat inside a loop.
    # The ids on the left are unique, so the rows come out in their order.
    _players_in_game = (
        pd.DataFrame({"nflId": unique_players_per_game})
        .merge(players_df, on="nflId", how="inner")
        [list(players_df.columns)]
    )

    _total_moves = len(game_moves)
    _missing_moves = _total_moves - len(_known_ids)
    if _missing_moves:
        _missing_players = (f"""We are missing {_missing_moves} moves, that's {round(_missing_moves * 100 / _total_moves, ndigits=2)}% of total""")
    else:
        _missing_players = ""

    return _game_info, game_moves, _missing_players, _players_in_game

In [None]:
# Try the generator on one game. The four values are, in order: the game info,
# the merged moves, the missing-moves message and the players seen in the game.
d, e, f, g = game_df_generator(2018101406)

#display(d, e, f, g)

# 3. Tracking

We have plenty of data about the players' positions on the field at any given moment, so we can recreate their movements and analyse them.

The code in the cell below is inspired by the work of [JARON_MICHAL](https://www.kaggle.com/jaronmichal/tracking-data-visualization)
and [DATAROHITINGOLE](https://www.kaggle.com/datarohitingole/nfl-data-analysis-and-plotly-animation#4.-Tracking-Data) may their names be praised for the generations to come

In [None]:
# Picture of the field, opened once at cell level; animated_game places it
# below the markers as the background image of the figure.
fb_ground_filename = '../input/football-ground/football_ground.png'
fb_groud_plotly = Image.open(fb_ground_filename)

def animated_game(gameId, playId, speed):
    """Build an animated Plotly scatter of one play.

    Every row of the tracking table for the play becomes a marker at its (x, y)
    position, coloured by "team", with one animation frame per distinct "time".

    Parameters
    ----------
    gameId : int
        Game to look up in ``games_df2`` and in the per-season tracking tables.
    playId : int
        Play of that game to animate.
    speed : int
        Frame duration assigned to the first button of the animation menu
        (Plotly expresses this duration in milliseconds).

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, not yet shown.

    Raises
    ------
    ValueError
        If the game is not in ``games_df2``, is not in any tracking table, or
        has no tracking row for ``playId``.
    """
    _game_info = games_df2[games_df2["gameId"] == gameId].reset_index(drop=True)
    if _game_info.empty:
        raise ValueError(f"gameId {gameId} was not found in the games table")

    _home_team = _game_info["homeTeamAbbr"].iloc[0]
    _visitor_team = _game_info["visitorTeamAbbr"].iloc[0]
    _date_game = _game_info["gameDate"].iloc[0]
    _time_game = _game_info["gameTimeEastern"].iloc[0]

    # Every date part comes from the parsed date rather than from slices of the
    # raw string; the day number therefore has no leading zero to strip.
    var_date = datetime.strptime(_date_game, '%m/%d/%Y')
    _day = calendar.day_name[var_date.weekday()]
    _month = calendar.month_name[var_date.month]
    _day_number = var_date.day

    # Same selection for all three seasons: the first table that knows the game wins.
    for _season_games, _season_tracking in (
        (df_2018_games, df_2018),
        (df_2019_games, df_2019),
        (df_2020_games, df_2020),
    ):
        if gameId in _season_games:
            _play_mask = (_season_tracking['gameId'] == gameId) & (_season_tracking['playId'] == playId)
            _tracking_df = (
                _season_tracking.loc[_play_mask, ['x', 'y', 'time', 'nflId', 'team', 'displayName']]
                .fillna(0.)
                .sort_values(['team', 'time'])
            )
            break
    else:
        raise ValueError(f"gameId {gameId} was not found in any tracking table")

    if _tracking_df.empty:
        # An empty frame produces a figure without animation menu, which would
        # only fail later with an obscure IndexError.
        raise ValueError(f"no tracking data for play {playId} of game {gameId}")

    # Shorten the timestamp used as frame label: drop the first 14 and the last
    # 2 characters (presumably leaving minutes, seconds and tenths - confirm
    # against the format of the "time" column).
    _tracking_df["time"] = _tracking_df["time"].str[14:-2]

    fig = px.scatter(
        _tracking_df,
        x='x',
        y='y',
        animation_frame='time',
        color='team',
        animation_group="nflId",
        hover_name="displayName"
    )

    fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey')),
                      selector=dict(mode='markers'))

    fig.update_layout(
        # The subtitle tag is closed with </sup> and the leftover debug text is gone.
        title=f"<b>{_home_team} (home) versus {_visitor_team} on {_day} the {_day_number} of {_month} at {str(_time_game)[:-3]}</b><br><sup>Animation of play : {playId} from game : {gameId}</sup>",
        title_x=0.5, 
        autosize=False,
        width=1000,
        height=600,
        images= [dict(source=fb_groud_plotly,
                      xref="paper", yref="paper",x=0.01, y=1, sizex=1, sizey=1,
                      layer="below", opacity=0.8)],
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False), 
        showlegend=True, 
        paper_bgcolor="darkgray", 
        margin=dict(
                    l=20,
                    r=20,
                    b=50,
                    t=50,
                    pad=0
                ),
    )

    fig.update_xaxes(title="")
    fig.update_yaxes(title="")

    # First button of the first menu created by the animation: set how long each frame lasts.
    fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = speed

    return fig

In [None]:
_gameId = 2018090907
_playId = 522
_speed = 60

animated_game(_gameId, _playId, _speed)

## more to come !