In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load Datasets

In [None]:
# All competition files live in the same input folder; keeping the path in one
# place means it only has to be edited once when the notebook runs elsewhere.
DATA_DIR = '../input/nfl-big-data-bowl-2022'

games_df = pd.read_csv(f'{DATA_DIR}/games.csv')
PFFScoutingData_df = pd.read_csv(f'{DATA_DIR}/PFFScoutingData.csv')
players_df = pd.read_csv(f'{DATA_DIR}/players.csv')
plays_df = pd.read_csv(f'{DATA_DIR}/plays.csv')
tracking2018_df = pd.read_csv(f'{DATA_DIR}/tracking2018.csv')
tracking2019_df = pd.read_csv(f'{DATA_DIR}/tracking2019.csv')
tracking2020_df = pd.read_csv(f'{DATA_DIR}/tracking2020.csv')

## Features Summarization

In [None]:
# Function for making feature summary
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Feature' (column name), 'Data Type', 'Num of null',
        'Num of unique' (NaN is not counted), and 'First value' / 'Second value' /
        'Third value' (the values of the first three rows, NaN when ``df`` has
        fewer rows).
    """
    print(f'Shape : {df.shape}')
    # rename_axis gives the column-name column its final label directly, whatever
    # name the columns index carries.
    summary = df.dtypes.rename_axis('Feature').reset_index(name='Data Type')
    # Number of missing values per feature
    summary['Num of null'] = df.isnull().sum().values
    # Number of distinct values per feature (NaN ignored)
    summary['Num of unique'] = df.nunique().values
    # First three rows, taken by POSITION: label-based df.loc[0] fails as soon as the
    # index is not 0..n (filtered or re-indexed frames) or the frame has fewer than 3 rows.
    for position, label in enumerate(('First value', 'Second value', 'Third value')):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan
    return summary

In [None]:
resumetable(games_df)

In [None]:
resumetable(PFFScoutingData_df)

In [None]:
resumetable(players_df)

In [None]:
resumetable(plays_df)

In [None]:
resumetable(tracking2018_df)

In [None]:
resumetable(tracking2019_df)

In [None]:
resumetable(tracking2020_df)

---
# _**Analysis on Punt**_

In [None]:
# Libraries importation
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

pd.set_option('display.max_columns', None)

import ipywidgets as wg
from IPython.core.display import HTML
import matplotlib.animation as anim
import matplotlib.pyplot as plt
import seaborn as sns

**A punter (P) in the NFL is a special teams player who receives the snapped ball directly from the line of scrimmage and then punts (kicks) the football to the opposing team so as to limit any field position advantage. This generally happens on a fourth down in American football and a third down in Canadian football. In this NFL study part, we extensively study the punt by providing significant analysis and its correlation among other features.**

**We first studied the different play types and observed 4 distinct types of plays: field goal, extra point, punt and kickoff. Among those play types, kickoff is the most common, followed by punt, as displayed in the bar chart below.**

In [None]:
plays_df['specialTeamsPlayType'].value_counts().plot.barh(figsize=(8,4), title="Special teams plays by type")

**Since we are interested in punt play type, we will filter the data according to punt. Therefore, from now, our new data will be related to punt**

In [None]:
punt_data = plays_df.loc[plays_df['specialTeamsPlayType'] == 'Punt']
punt_data.head()

**In the following part of this analysis, we filter the data and only consider the plays of the punt play type. What are the most common special teams results on punts? The analysis shows that punts most often end in a “Return” result. Therefore, we filter the rest of the data to the “Return” results.**

In [None]:
punt_data['specialTeamsResult'].value_counts().plot.barh(figsize=(9,4), title="Punt by result")

In [None]:
punt_returns_data = punt_data.loc[punt_data['specialTeamsResult'] == 'Return']
punt_returns_data.head()

**Yards gained are important in NFL games. Based on the above filters, we analyzed the yards gained on punt returns. The histogram below shows that the most frequent bin, with a count of almost 1400, lies between 0 and 20 yards, mostly around 10. The boxplot shows that most punt returns gain around 10 yards and none reaches 100 yards, which suggests that punt returns seldom, if ever, lead to touchdowns. This is further confirmed by data exploration showing that only 5 of 2286 entries resulted in touchdowns, i.e. about 0.22%. That is too small to consider punt returns a winning touchdown strategy.**

In [None]:
punt_returns_data[['kickReturnYardage']].plot.hist(figsize=(14,6), title="Punt Return Yards gained")

In [None]:
ax = punt_returns_data[['possessionTeam', 'kickReturnYardage']].boxplot(by='possessionTeam', figsize=(14,10), vert=False)
ax.set_xlabel(None)
ax.set_title(None)

**The above boxplot demonstrates that most punt returns gain around 10 yards and none reaches 100 yards, which suggests that punt returns rarely lead to touchdowns.**

In [None]:
# Returned punts whose play description mentions a touchdown.
# na=False treats a missing description as "no touchdown"; without it the mask would
# contain NaN and boolean indexing would fail. regex=False does a plain substring match.
# NOTE(review): this matches any description containing the word, presumably including
# touchdowns that were later overturned - confirm against the data.
is_touchdown = punt_returns_data['playDescription'].str.contains('TOUCHDOWN', regex=False, na=False)
touch_downs_returns = punt_returns_data.loc[is_touchdown]
touch_downs_returns.head()

**The above results suggest that punt returns rarely lead to touchdowns. This is confirmed by the dataframe above, which shows that only 5 of 2286 entries resulted in touchdowns, i.e. about 0.22%. That is too small to consider punt returns a winning touchdown strategy.**

In [None]:
ax = touch_downs_returns.groupby('possessionTeam').size().plot.barh(figsize=(14,6), title="Punt returns for a TouchDown")
ax.set_ylabel('Kicking Team')

### Refer to one of the tracking data files (tracking2018.csv) so we have the details for the animation:

In [None]:
tracking2018_df.head()

In [None]:
def getPlayAnimation(df, playId=2626, gameId=2018091000, color_dict={'home': 'red', 'away': 'blue', 'football': 'brown'}):
    """Render the tracking data of one play as an HTML5 video of a top-down field.

    Parameters
    ----------
    df : pandas.DataFrame
        Tracking data; the columns 'gameId', 'playId', 'frameId', 'x', 'y' and
        'team' are read.
    playId, gameId : int
        Together they select the play to animate.
    color_dict : dict
        Maps each value of the 'team' column to a matplotlib colour. It is only
        read, never modified, so the shared default dict is safe.

    Returns
    -------
    IPython.core.display.HTML
        Wraps the markup produced by ``FuncAnimation.to_html5_video``.

    Raises
    ------
    ValueError
        If ``df`` contains no rows for the requested play.
    """
    # Get df of tracking data for one play
    one_play = df[(df['playId'] == playId) & (df['gameId'] == gameId)]
    if one_play.empty:
        raise ValueError(f'No tracking rows for gameId={gameId}, playId={playId}')

    # plotFrame(i) draws frameId i + 1, so the frame count must equal the highest
    # frameId for the final frame to be rendered (max - 1 silently dropped it).
    # Assumes frameId starts at 1 - TODO confirm.
    n_frames = int(one_play['frameId'].max())

    fig, ax = plt.subplots(figsize=(12,6))

    # Don't show y-axis ticks or tick labels
    ax.axes.get_yaxis().set_visible(False)

    # Set background colors
    fig.patch.set_facecolor('xkcd:leaf green')
    ax.set_facecolor('xkcd:leaf green')

    # Set the field border color and line width
    for spine in ax.spines.values():
        spine.set_edgecolor('white')
        spine.set_linewidth(2.5)

    # function to draw one frame
    def plotFrame(i):
        one_frame = one_play[one_play['frameId'] == i + 1]
        ax.cla()

        # Constants from frame to frame (cla() wipes them, so they are redrawn each time).
        # The explicit `ax` is used instead of plt.xticks so the ticks always land on
        # this figure, whatever the "current" pyplot axes happens to be.
        ax.set_xticks(range(0, 121, 10))
        ax.set_xticklabels(['', 'G', '10', '20', '30', '40', '50', '40', '30', '20', '10', 'G', ''], fontsize=15)
        ax.tick_params(axis='x', colors='white')
        ax.set_xlim([0, 120])
        ax.set_ylim([0, 53.3])
        for yardline in range(10, 111, 10):
            ax.plot([yardline, yardline], [0, 53.3], c='white')

        # What changes
        ax.scatter(one_frame['x'], one_frame['y'], c=one_frame['team'].map(color_dict), s=64)
        ax.set_title('Frame: {}'.format(i + 1), fontsize=15, color='white')

    play_anim = anim.FuncAnimation(fig, plotFrame, frames=n_frames, interval=100)
    html = play_anim.to_html5_video()
    # Close the figure so the notebook shows only the video, not a static copy as well
    plt.close(fig)
    return HTML(html)

In [None]:
animation = touch_downs_returns.loc[329]['playDescription']
animation

In [None]:
team_colors = {'home': 'xkcd:light turquoise',
               'away': 'xkcd:cobalt',
               'football': 'xkcd:mud'}
getPlayAnimation(tracking2018_df, playId=2626, gameId=2018091000, color_dict=team_colors)

### Extract information from scouting data:

In [None]:
PFFScoutingData_df.head()

In [None]:
# Counts over the whole (unfiltered) scouting table, titled after the column actually plotted
PFFScoutingData_df['kickContactType'].value_counts().plot.barh(figsize=(9,4), title="Kick Contact Type")

**Clean catch from air is the common kick contact type.**

In [None]:
# Keep the scouting columns used in the punt analysis: the join keys
# (gameId, playId) plus the kick, return and coverage details.
# NOTE(review): `scouting_data_2` is assigned again in the Kickoff section with a
# different column list, so the punt cells that read it must run before that one.
punts_cols = ['gameId', 'playId', 'hangTime', 'kickType', 'kickDirectionIntended', 'kickDirectionActual', 'missedTackler',
                'returnDirectionIntended', 'returnDirectionActual', 'kickoffReturnFormation', 'specialTeamsSafeties','snapDetail',
               'operationTime','gunners','puntRushers']
scouting_data_2 = PFFScoutingData_df[punts_cols]
scouting_data_2.head()

In [None]:
scouting_data_2.columns

In [None]:
# Merge the scouting data with the punts that were returned.
# The guard makes the cell safe to re-run: merging a second time would duplicate every
# scouting column with _x/_y suffixes and break the cells below that read them by name.
scouting_only_cols = [col for col in scouting_data_2.columns if col not in ('gameId', 'playId')]
if not set(scouting_only_cols).issubset(punt_returns_data.columns):
    punt_returns_data = punt_returns_data.merge(scouting_data_2, how='left', on=['gameId', 'playId'])
punt_returns_data.head()

In [None]:
punt_returns_data['kickType'].value_counts()

In [None]:
ax = punt_returns_data[['kickType', 'kickReturnYardage']].boxplot(by='kickType', figsize=(14,6), vert=False)
ax.set_xlabel(None)
ax.set_title(None)

**“Normal” (the standard punt style) is the most played kick type.**

In [None]:
punt_returns_data['kickoffReturnFormation'].isna().sum()

In [None]:
punt_returns_data['returnDirectionIntended'].value_counts()

**Further analysis of punts shows that a clean catch from the air is the most common way a punt is fielded and that the normal (standard) punt style is the most played. From the kicking team’s perspective, the dominant punt direction is center, followed by right and then left.**

In [None]:
# Returned punts by intended return direction
punt_returns_data['returnDirectionIntended'].value_counts().plot.barh(figsize=(9,4), title="Intended Return Direction")

In [None]:
punt_returns_data['kickDirectionIntended'].value_counts()

In [None]:
# Returned punts by intended kick direction
punt_returns_data['kickDirectionIntended'].value_counts().plot.barh(figsize=(9,4), title="Intended Kick Direction")

In [None]:
punt_returns_data['kickDirectionActual'].value_counts()

In [None]:
# Returned punts by actual kick direction
punt_returns_data['kickDirectionActual'].value_counts().plot.barh(figsize=(9,4), title="Actual Kick Direction")

In [None]:
punt_returns_data['puntRushers'].value_counts()

In [None]:
# NOTE(review): sort_values().head(20) keeps the 20 smallest 'puntRushers' values in
# sort order, not a representative sample, so the counts plotted below describe only
# those 20 rows. The title does not describe this chart either. The next cell plots the
# counts over all returned punts - confirm whether this cell is still needed.
kickoff_scouting_df_plot = punt_returns_data['puntRushers'].sort_values().head(20)

kickoff_scouting_df_plot.value_counts().head().plot.barh(figsize=(15,14), title="Punt by result")

In [None]:
# Five most frequent punt-rusher groups among the returned punts
punt_returns_data['puntRushers'].value_counts().head().plot.barh(figsize=(19,14), title="Most Common Punt Rushers (Top 5)")

In [None]:
punt_returns_data['gunners'].value_counts()

In [None]:
# NOTE(review): sort_values().head() keeps only the 5 smallest 'gunners' values of the
# unfiltered scouting table, so the chart below counts just those 5 rows; its title does
# not describe it either. The next cell plots the gunners of the returned punts -
# confirm whether this cell is still needed.
kickoff_scouting_df_plot = scouting_data_2['gunners'].sort_values().head()

kickoff_scouting_df_plot.value_counts().plot.barh(figsize=(15,14), title="Punt by result")

In [None]:
# Five most frequent gunner groups among the returned punts.
# value_counts() takes `normalize` as its first argument, so value_counts(5) did not
# mean "top 5": it switched the result to proportions. head() does the top-5 selection.
punt_returns_data['gunners'].value_counts().head().plot.barh(figsize=(19,14), title="Most Common Gunners (Top 5)")

In [None]:
punt_returns_data.columns

In [None]:
punt_returns_data['puntRushers'].head()

In [None]:
punt_returns_data['kickReturnYardage'].head()

In [None]:
# scouting_data_2 is not filtered by play type, so this is the hang time of every scouted play
scouting_data_2['hangTime'].plot.kde(figsize=(14, 3), title="Distribution of Hang Times (s) - All Scouted Plays")

In [None]:
# punt_returns_data holds returned punts only, so the title says punt rather than kickoff
punt_returns_data.plot.scatter(x='hangTime', y='kickReturnYardage', figsize=(14,6), title="Punt Return Yards vs. Punt Hang Time (s)")

In [None]:
punt_returns_data['kickDirectionIntended'].value_counts().plot.barh(figsize=(9,3), title="Intended Kick Direction")

In [None]:

punt_returns_data['kickDirectionActual'].value_counts().plot.barh(figsize=(9,3), title="Actual Kick Direction")

In [None]:
(punt_returns_data['kickDirectionIntended'] == punt_returns_data['kickDirectionActual']).sum() / len(punt_returns_data)

In [None]:
# Punts that did not go where they were aimed.
# NaN != NaN evaluates to True, so a plain `!=` would also count every play whose
# direction was never recorded; both values must be present for a real mismatch.
intended_direction = punt_returns_data['kickDirectionIntended']
actual_direction = punt_returns_data['kickDirectionActual']
both_recorded = intended_direction.notna() & actual_direction.notna()
missed_direction = punt_returns_data[both_recorded & (intended_direction != actual_direction)]
missed_direction['kickReturnYardage'].plot.kde(figsize=(14,3), title="Distribution of Return Yards on Missed Directional punts")

In [None]:
missed_tackles_df = punt_returns_data[punt_returns_data['missedTackler'].notnull()]
missed_tackles_df['kickReturnYardage'].plot.kde(figsize=(14,3), title="Distribution of Return Yards on Missed Tackles")

In [None]:
# Number of missed tacklers per return: 'missedTackler' is split on ';' and the pieces counted.
# assign() builds a new frame instead of writing a column into a filtered slice of
# punt_returns_data (SettingWithCopyWarning), and keeps the cell safe to re-run.
missed_tackles_df = missed_tackles_df.assign(
    missedTackles=missed_tackles_df['missedTackler'].str.split(';').str.len()
)
missed_tackles_df['missedTackles'].value_counts().plot.barh(figsize=(9,4), title="Returns with Missed Tackles")

In [None]:
ax = missed_tackles_df[['missedTackles', 'kickReturnYardage']].boxplot(by='missedTackles', figsize=(14,8), vert=False)
ax.set_xlabel(None)
ax.set_title(None)

In [None]:
punt_returns_data[['gunners', 'puntRushers','specialTeamsSafeties','kickType','kickDirectionActual','kickReturnYardage']].head()

In [None]:
punt_returns_data['kickReturnYardage'].max()

In [None]:
ax = punt_returns_data[['specialTeamsSafeties', 'kickReturnYardage']].head(100).boxplot(by='specialTeamsSafeties', figsize=(14,8), vert=False)
ax.set_xlabel(None)
ax.set_title(None)

**By studying the correlation between kick return yardage and hang time, we notice that the correlation coefficient is below 0, i.e. the two are not positively correlated: a longer hang time does not go with a longer return. According to some NFL game statistics, the average hang time for a coffin punt in the NFL was 4.40 seconds, while the ideal hang time is 4.44 seconds. The scatter plot below shows that for that duration, most of the time, the ball barely exceeds fifty yards.**

In [None]:
# Pearson correlation between return yardage and hang time
punt_returns_data_corr = punt_returns_data[['kickReturnYardage','hangTime']].corr()

plt.figure(figsize=(10,4))
# Plot the correlation matrix itself. Calling .corr() on it a second time correlates
# the two columns of the 2x2 matrix with each other, which no longer reflects how
# strongly the original variables are related.
sns.heatmap(punt_returns_data_corr, annot=False, cmap='summer')
plt.show()

In [None]:
punt_returns_data[['kickReturnYardage','hangTime']].corr()

In [None]:
punt_returns_data[['gunners', 'puntRushers','kickDirectionIntended','kickType','kickDirectionActual','kickReturnYardage','hangTime']].head()

In [None]:
p = punt_returns_data[['kickReturnYardage','hangTime']]
ax = p.plot.bar(figsize=(13,13),rot=0)

In [None]:
sns.scatterplot(data=p, y="kickReturnYardage", x="hangTime")

**In this first step, we analyzed the importance of the punt in NFL games. To take the analysis further, we will answer the questions below and explore more play types (field goal, kickoff, extra point):**
 - **What is the best punt strategy?**
 - **Who are the best punters?**
 - **How can punts impact the game?**

---
# _**Analysis on Field Goal**_

**Extract Field Goal Play:**

In [None]:
# .copy() makes this an independent frame: the cells below add columns to it, which on
# a plain filtered view of plays_df triggers SettingWithCopyWarning.
play_field_goal = plays_df.loc[plays_df["specialTeamsPlayType"] == "Field Goal"].copy()
print("Total number of Field Goal Play: ", play_field_goal.shape[0])

**By referring to games.csv, we can add 2 features (homeTeamAbbr, visitorTeamAbbr) to the play dataset to determine the home team and away team of each play. We can then tell whether the possession team is the home team or the away team by comparing them with possessionTeam.**

In [None]:
# Look up both team abbreviations by gameId in one pass, instead of filtering games_df
# once per play and reading the result by column position (5 and 6).
# drop_duplicates keeps the first row per game, as the per-play lookup did.
games_by_id = games_df.drop_duplicates("gameId").set_index("gameId")
play_field_goal["homeTeamAbbr"] = play_field_goal["gameId"].map(games_by_id["homeTeamAbbr"])
play_field_goal["visitorTeamAbbr"] = play_field_goal["gameId"].map(games_by_id["visitorTeamAbbr"])

# Same values as plain lists, kept under the names the loop used to fill
home_team = play_field_goal["homeTeamAbbr"].tolist()
visitor_team = play_field_goal["visitorTeamAbbr"].tolist()
play_field_goal.head()

**Add 1 feature about the possession team for the ease of further analysis:**
* **possessionTeamScore: the current score situation of the possession team, i.e., Leading, Tied or Trailing.**

In [None]:
# Score situation of the team in possession: "Leading", "Trailing" or "Tied".
# Computed for all rows at once and by column NAME where the name is known. The row
# loop addressed homeTeamAbbr as column -3 and the result as column -1, which points
# at the wrong columns as soon as further columns are appended and the cell is re-run.
possession_is_home = play_field_goal["possessionTeam"] == play_field_goal["homeTeamAbbr"]

# Columns 18 and 19 are read by position, as before - presumably the home and
# visitor scores before the snap; TODO confirm and switch to column names.
score_home = play_field_goal.iloc[:, 18]
score_visitor = play_field_goal.iloc[:, 19]

# Express both scores from the possession team's point of view
own_score = score_home.where(possession_is_home, score_visitor)
opponent_score = score_visitor.where(possession_is_home, score_home)

# Rows where a score is missing compare False on both conditions and fall back to
# "Tied", exactly as the if/elif/else chain did.
play_field_goal["possessionTeamScore"] = np.select(
    [own_score > opponent_score, own_score < opponent_score],
    ["Leading", "Trailing"],
    default="Tied",
)

# reset_index returns a new frame; the result has to be assigned to take effect
play_field_goal = play_field_goal.reset_index(drop=True)
play_field_goal.head()

**Until this step, we have gathered enough information to do some analysis about the field goal. We will focus on players (field-goal kickers) analysis for the following reasons:**
* **An important reference in deciding line-up and strategy of game.**
* **Invaluable information for NFL transfer market.**

**Specifically, we will analyse 3 types of field-goal kickers:**
* **Highest Mentality : High successful field-goal rate under stress (tied/ trailing)**
* **Highest Accuracy : High successful field-goal rate overall(without consider the score situation)**
* **Long-range Expert : Long kick-length with high accuracy**

### Field-Goal Kickers with Highest Mentality

**Extract play which the kicker will face high pressure:**

In [None]:
# Field-goal plays where the kicking side is tied or trailing and the kicker is known.
# Built as one chain that returns a new frame, rather than dropping rows in place on a
# filtered slice of play_field_goal (SettingWithCopyWarning).
play_field_goal_HighPressure = (
    play_field_goal
    .loc[play_field_goal["possessionTeamScore"].isin(["Tied", "Trailing"])]
    .dropna(axis='rows', subset=['kickerId'])
    .reset_index(drop=True)
)
play_field_goal_HighPressure.head()

**Extract the playerID of those kickers who were under pressure**:

In [None]:
field_goal_kicker = play_field_goal_HighPressure.kickerId.unique()
field_goal_kicker

* **Filter out the kickers with fewer than 25 kicks (only consider the more experienced kickers)**
* **Calculate their successful field-goal rate/ accuracy**

In [None]:
# Per-kicker tally under pressure: {kickerId: [good kicks, attempts, accuracy]},
# limited to kickers with at least 25 attempts.
kicker_performance = {}
for kicker in field_goal_kicker:
    kicker_play = play_field_goal_HighPressure[play_field_goal_HighPressure["kickerId"] == kicker]
    total_kicks = len(kicker_play)
    if total_kicks < 25:
        continue
    good_kicks = int((kicker_play["specialTeamsResult"] == "Kick Attempt Good").sum())
    successful_ratio = good_kicks / total_kicks
    kicker_performance[kicker] = [good_kicks, total_kicks, successful_ratio]

# One row per kicker, with readable column names
kicker_performance_df = pd.DataFrame(kicker_performance).T.rename(
    columns={0: 'Number of Good Kick', 1: 'Total Number of Field-Goal Kick', 2: 'Accuracy'}
)
kicker_performance_df.head()

**Sort the kickers based on their Successful Field-Goal Rate/ Accuracy in descending order:**

In [None]:
# Rank kickers from most to least accurate (index 2 of each stats list is the accuracy)
sorted_kicker_performance = dict(
    sorted(kicker_performance.items(), key=lambda entry: entry[1][2], reverse=True)
)

# Parallel lists in ranking order: the value in column 6 of the kicker's players_df row
# (used as the player name) and the accuracy.
top_kicker = [
    players_df.loc[players_df["nflId"] == kicker_id].values[0][6]
    for kicker_id in sorted_kicker_performance
]
top_kicker_goal_rate = [stats[2] for stats in sorted_kicker_performance.values()]

**Obtain the Top 10 Kickers with Best Mentality:**

In [None]:
list_tuples = list(zip(top_kicker[:10], top_kicker_goal_rate[:10]))  
top_kicker_df = pd.DataFrame(list_tuples,columns=['Player', 'Accuracy'])
top_kicker_df

**Visualize the Top 10 Kickers with Best Mentality**

In [None]:
import plotly.express as px
fig = px.bar(top_kicker_df, x='Player', y='Accuracy')
fig.update_layout(
    title="Kickers with Best Mentality - TOP 10",
    title_x=0.5,
    xaxis_title="Player Name",
    yaxis_title="Accuracy Under Stress",
    font=dict(
        family="Old Standard TT",
        size=18,
        color="Black"
    )
)

fig.show()

### Field-Goal Kickers with Highest Accuracy

**In this part, we will not consider the mentality of kickers based on the score. Instead,  we will rank the kickers based on their successful field-goal rate/ accuracy ONLY.**

**Extract the playerID of all field-goal kickers**:

In [None]:
# Keep only the field-goal plays with a known kicker. Re-assigning, rather than
# dropping in place, does not mutate a frame that earlier cells displayed.
play_field_goal = play_field_goal.dropna(axis='rows', subset=['kickerId'])
field_goal_kicker = play_field_goal.kickerId.unique()
field_goal_kicker

* **Filter out the kickers with fewer than 25 kicks (only consider the more experienced kickers)**
* **Calculate their successful field-goal rate/ accuracy**

In [None]:
# Per-kicker tally over all field-goal plays: {kickerId: [good kicks, attempts, accuracy]},
# limited to kickers with at least 25 attempts.
kicker_performance = {}
for kicker in field_goal_kicker:
    kicker_play = play_field_goal[play_field_goal["kickerId"] == kicker]
    total_kicks = len(kicker_play)
    if total_kicks < 25:
        continue
    good_kicks = int((kicker_play["specialTeamsResult"] == "Kick Attempt Good").sum())
    successful_ratio = good_kicks / total_kicks
    kicker_performance[kicker] = [good_kicks, total_kicks, successful_ratio]

# One row per kicker, with readable column names
kicker_performance_df = pd.DataFrame(kicker_performance).T.rename(
    columns={0: 'Number of Good Kick', 1: 'Total Number of Field-Goal Kick', 2: 'Accuracy'}
)
kicker_performance_df.head()

**Sort the kickers based on their Successful Field-Goal Rate/ Accuracy in descending order:**

In [None]:
# Rank kickers from most to least accurate (index 2 of each stats list is the accuracy)
sorted_kicker_performance = dict(
    sorted(kicker_performance.items(), key=lambda entry: entry[1][2], reverse=True)
)

# Parallel lists in ranking order: the value in column 6 of the kicker's players_df row
# (used as the player name) and the accuracy.
top_kicker = [
    players_df.loc[players_df["nflId"] == kicker_id].values[0][6]
    for kicker_id in sorted_kicker_performance
]
top_kicker_goal_rate = [stats[2] for stats in sorted_kicker_performance.values()]

**Obtain the Top 10 Kickers with Best Accuracy:**

In [None]:
list_tuples = list(zip(top_kicker[:10], top_kicker_goal_rate[:10]))  
top_kicker_df = pd.DataFrame(list_tuples,columns=['Player', 'Accuracy'])
top_kicker_df

**Visualize the Top 10 Kickers with Best Accuracy**

In [None]:
import plotly.express as px
fig = px.bar(top_kicker_df, x='Player', y='Accuracy')
fig.update_layout(
    title="Kickers with Best Accuracy - TOP 10",
    title_x=0.5,
    xaxis_title="Player Name",
    yaxis_title="Accuracy",
    font=dict(
        family="Old Standard TT",
        size=18,
        color="Black"
    )
)

fig.show()

### Field-Goal Kickers (Long-range Expert)

**In this part, we will analyse the long-range expert with high accuracy.**

**We need to add a feature 'average_kicklength', the average kick length of the successful kicks of each experienced kicker (number of kicks > 24), and set a threshold so that only kickers whose accuracy is above it are considered; we then rank the remaining kickers by their average kick length.**

**Threshold:**
* **Experience/ Number of Field-Goal kicks > 24**
* **Accuracy > 0.8**

In [None]:
# Long-range ranking input:
# {kickerId: [good kicks, attempts, accuracy, mean length of the good kicks]}
# for kickers with at least 25 attempts and an accuracy above 0.8.
kicker_performance = dict()
for kicker in field_goal_kicker:
    kicker_play = play_field_goal.loc[play_field_goal["kickerId"] == kicker]
    total_kicks = kicker_play.shape[0]
    if total_kicks < 25:
        continue

    kicker_play_good_kick = kicker_play.loc[kicker_play["specialTeamsResult"] == "Kick Attempt Good"]
    good_kicks = kicker_play_good_kick.shape[0]
    successful_ratio = good_kicks/total_kicks
    if successful_ratio > 0.8:
        # Averaged only after the accuracy check: good_kicks is then > 0, so a kicker
        # without a single good kick can no longer cause a division by zero.
        # .mean() skips missing kick lengths instead of turning the average into NaN.
        average_kicklength = kicker_play_good_kick['kickLength'].mean()
        kicker_performance[kicker] = [good_kicks,total_kicks,successful_ratio,average_kicklength]

kicker_performance_df = (pd.DataFrame(kicker_performance)).T
kicker_performance_df = kicker_performance_df.rename(columns={0: 'Number of Good Kick', 1: 'Total Number of Field-Goal Kick', 2: 'Accuracy', 3: 'Average Kicklength (Yards)'})
kicker_performance_df.head()

**Sort the kickers based on their Average Kicklength (Yards) in descending order:**

In [None]:
# Rank kickers by average kick length, longest first (index 3 of each stats list)
sorted_kicker_performance = dict(
    sorted(kicker_performance.items(), key=lambda entry: entry[1][3], reverse=True)
)

# Parallel lists in ranking order: the value in column 6 of the kicker's players_df row
# (used as the player name), the accuracy and the average kick length.
top_kicker = [
    players_df.loc[players_df["nflId"] == kicker_id].values[0][6]
    for kicker_id in sorted_kicker_performance
]
top_kicker_goal_rate = [stats[2] for stats in sorted_kicker_performance.values()]
top_kicker_kicklength = [stats[3] for stats in sorted_kicker_performance.values()]

**Obtain the Top 10 Long-range Expert:**

In [None]:
list_tuples = list(zip(top_kicker[:10], top_kicker_goal_rate[:10], top_kicker_kicklength[:10]))  
top_kicker_df = pd.DataFrame(list_tuples,columns=['Player', 'Accuracy', 'Average Kicklength (Yards)'])
top_kicker_df

**Visualize the Top 10 Long-Range Expert**

In [None]:
import plotly.express as px
fig = px.bar(top_kicker_df, x='Player', y='Average Kicklength (Yards)')
fig.update_layout(
    title="Long-range Expert - TOP 10",
    title_x=0.5,
    xaxis_title="Player Name",
    yaxis_title="Average Kicklength (Yards)",
    font=dict(
        family="Old Standard TT",
        size=18,
        color="Black"
    )
)

fig.show()

**From the analysis on Field-Goal Kickers, we can list out some high quality players:**
* **Player with best mentality (Top 3): Nick Folk, Younghoe Koo, Josh Lambo**
* **Player with best accuracy (Top 3): Josh Lambo, Graham Gano, Justin Tucker**
* **Long-range Expert (Top 3): Matt Bryant, Graham Gano, Brandon McManus**

**Besides, we noticed that some names appeared in different categories which show their invaluable value, such as:**
* **Josh Lambo: Best Mentality - Top 3, Best Accuracy - Top 1, Long-range Expert - Top 10**
* **Graham Gano: Best Mentality - Top 6, Best Accuracy - Top 2, Long-range Expert - Top 2**

**Their teams should be proud and confident to have them on the field!**

### Additional： Promising Young Talents

**Due to the lack of data, we consider a player to be young if they are aged 27 or under (age is computed as of 2020).**

In [None]:
# Attach each kicker's birth date by nflId in one vectorised lookup.
# drop_duplicates keeps the first players_df row per id, as the per-play lookup did.
birth_date_by_id = players_df.drop_duplicates("nflId").set_index("nflId")["birthDate"]

# A kicker missing from players_df, or one without a recorded date, gets 0 - the
# sentinel the age computation in the next cell tests for (the per-play lookup raised
# IndexError on a missing kicker).
# Only this column is filled: filling the whole frame with 0 also turned every other
# missing value (kick lengths, yardage, ...) into a real-looking zero.
play_field_goal['birthDate'] = play_field_goal['kickerId'].map(birth_date_by_id).fillna(0)

# Same values as a plain list, kept under the name the loop used to fill
every_birthDate = play_field_goal['birthDate'].tolist()
play_field_goal.head()

In [None]:
def _age_in_2020(birthdate):
    """Return 2020 minus the birth year, or 1000 when no year can be read.

    Two string layouts are handled: year first when the parts are separated by '-',
    year third when they are separated by '/'. Anything else - including the 0
    sentinel used for a missing date - yields 1000, which is far above the age
    threshold used in the following cell.
    """
    if isinstance(birthdate, str):
        if '-' in birthdate:
            return 2020 - int(birthdate.split("-")[0])
        if '/' in birthdate:
            return 2020 - int(birthdate.split("/")[2])
    return 1000

# Exactly one age per play. The if/elif chain appended nothing for a date in an
# unexpected layout, which left the list shorter than the frame and made the column
# assignment fail.
age = [_age_in_2020(birthdate) for birthdate in play_field_goal.birthDate]
play_field_goal['age'] = age
play_field_goal.head()

In [None]:
# Per-kicker tally for young kickers: {kickerId: [good kicks, attempts, accuracy, age]},
# limited to kickers with at least 25 attempts who are 27 or younger.
kicker_performance = {}
for kicker in field_goal_kicker:
    kicker_play = play_field_goal[play_field_goal["kickerId"] == kicker]
    # The age is the same on every play of a kicker, so the first row is enough
    age = int(kicker_play["age"].iloc[0])
    total_kicks = len(kicker_play)
    if total_kicks < 25 or age > 27:
        continue
    good_kicks = int((kicker_play["specialTeamsResult"] == "Kick Attempt Good").sum())
    successful_ratio = good_kicks / total_kicks
    kicker_performance[kicker] = [good_kicks, total_kicks, successful_ratio, age]

# One row per kicker, with readable column names
kicker_performance_df = pd.DataFrame(kicker_performance).T.rename(
    columns={0: 'Number of Good Kick', 1: 'Total Number of Field-Goal Kick', 2: 'Accuracy', 3: 'Age (till 2020)'}
)
kicker_performance_df.head()

In [None]:
# Rank the young kickers from most to least accurate (index 2 of each stats list)
sorted_kicker_performance = dict(
    sorted(kicker_performance.items(), key=lambda entry: entry[1][2], reverse=True)
)

# Parallel lists in ranking order: the value in column 6 of the kicker's players_df row
# (used as the player name), the accuracy and the age.
top_kicker = [
    players_df.loc[players_df["nflId"] == kicker_id].values[0][6]
    for kicker_id in sorted_kicker_performance
]
top_kicker_goal_rate = [stats[2] for stats in sorted_kicker_performance.values()]
top_kicker_age = [stats[3] for stats in sorted_kicker_performance.values()]

In [None]:
list_tuples = list(zip(top_kicker[:10], top_kicker_goal_rate[:10], top_kicker_age[:10]))  
top_kicker_df = pd.DataFrame(list_tuples,columns=['Player', 'Accuracy', 'Age (till 2020)'])
top_kicker_df

**Visualize Young Talents with Highest Accuracy - TOP 10**

In [None]:
import plotly.express as px

# Layout settings for the chart: centred title, axis captions and a shared font.
layout_options = {
    "title": "Young Talents with Highest Accuracy - TOP 10",
    "title_x": 0.5,
    "xaxis_title": "Player Name",
    "yaxis_title": "Accuracy",
    "font": {"family": "Old Standard TT", "size": 18, "color": "Black"},
}

# One bar per player in top_kicker_df, bar height = accuracy.
fig = px.bar(top_kicker_df, x='Player', y='Accuracy')
fig.update_layout(**layout_options)

fig.show()

---
# _**Analysis on Kickoff**_

In [None]:
# Keep only the plays whose specialTeamsPlayType is 'Kickoff'.
is_kickoff_play = plays_df['specialTeamsPlayType'].eq('Kickoff')
kickoff = plays_df[is_kickoff_play]
kickoff.head()

**In this section, we analyse kickoffs from the plays dataset.  
Therefore, we first extract the kickoff plays from the original dataset.**

In [None]:
# Scouting columns to carry over to the kickoff plays: the two merge keys first,
# then the kick, tackle, return and formation descriptors.
kickoff_col = [
    'gameId', 'playId',
    'hangTime', 'kickType',
    'kickDirectionIntended', 'kickDirectionActual',
    'tackler',
    'returnDirectionIntended', 'returnDirectionActual',
    'kickoffReturnFormation', 'specialTeamsSafeties',
]
scouting_data_2 = PFFScoutingData_df.loc[:, kickoff_col]
scouting_data_2.head()

In [None]:
# Rebuild the kickoff plays from plays_df before merging so this cell can be re-run safely:
# merging into an already-merged `kickoff` would add the scouting columns a second time
# with _x/_y suffixes. A left join keeps every kickoff play, even without scouting data.
kickoff = (
    plays_df.loc[plays_df['specialTeamsPlayType'] == 'Kickoff']
    .merge(scouting_data_2, how='left', on=['gameId', 'playId'])
)
kickoff.head()

**Select some features from the PFFScouting dataset which may have an impact on kickoff performance.  
Merge those selected features into the kickoff dataset based on gameId and playId.**

In [None]:
# Number of kickoffs per specialTeamsResult value, drawn as horizontal bars.
kickoff_result_counts = kickoff['specialTeamsResult'].value_counts()
kickoff_result_counts.plot(kind='barh', figsize=(9,4), title="Kickoff by result")

**As the result shows, touchback is the most frequent outcome and return is the second.  
The reason for this distribution is that taking a touchback when receiving the ball is the safest option.  
However, if the team wants to make some progress, returning the ball will be the choice.**


## Analysis on Return Action

In [None]:
# Kickoffs whose specialTeamsResult is 'Return'.
was_returned = kickoff['specialTeamsResult'].eq('Return')
kickoff_return = kickoff[was_returned]
kickoff_return.head()

In [None]:
# Horizontal boxplots of kickLength, one box per possessionTeam.
ax = kickoff_return.boxplot(column='kickLength', by='possessionTeam', figsize=(14,10), vert=False)
ax.set_xlabel(None)
ax.set_title('KickLength of Each Team')

In [None]:
# Horizontal boxplots of kickReturnYardage, one box per possessionTeam.
ax = kickoff_return.boxplot(column='kickReturnYardage', by='possessionTeam', figsize=(14,10), vert=False)
ax.set_xlabel(None)
ax.set_title('KickReturnYardage of Each Team')

In [None]:
# Horizontal boxplots of playResult, one box per possessionTeam.
ax = kickoff_return.boxplot(column='playResult', by='possessionTeam', figsize=(14,10), vert=False)
ax.set_xlabel(None)
ax.set_title('PlayResult of Each Team')

**The goal of a kickoff is to kick the ball and stop the returning side as far away as possible.  
Therefore, the feature playResult, which is a linear combination of the features kickLength and kickReturnYardage, becomes the performance metric of a kickoff.  
As the results show, team TB has the highest median kickLength, team JAX has the lowest median kickReturnYardage and team ARI has the highest median playResult.**

In [None]:
# Horizontal boxplots of playResult, one box per kickType.
ax = kickoff_return.boxplot(column='playResult', by='kickType', figsize=(14,10), vert=False)
ax.set_xlabel(None)
ax.set_title('PlayResult of Each KickType')

**The above boxplot shows that kickType 'D' has the highest median playResult.  
It's worth noting that kickTypes 'S' and 'O' have a low median playResult because these kicks serve a different purpose.  
The main purpose of those kickTypes is to attempt an onside kick, so playResult is not that meaningful for them.**

In [None]:
# Returned kickoffs whose play description contains the literal text 'TOUCHDOWN'.
# regex=False matches the text literally; na=False treats a missing description as "no match"
# so the mask stays purely boolean and can be used for indexing.
is_touchdown = kickoff_return['playDescription'].str.contains('TOUCHDOWN', regex=False, na=False)
touch_downs_returns = kickoff_return.loc[is_touchdown]

n_returns = len(kickoff_return)
n_touchdown_returns = len(touch_downs_returns)
# Guard the percentage against an empty kickoff_return frame.
touchdown_rate = round(n_touchdown_returns / n_returns * 100, 3) if n_returns else 0.0

print(f"number of kickoff return: {n_returns}")
print(f"number of kickoff return touch down: {n_touchdown_returns}")
print(f"kickoff touch down success rate: {touchdown_rate} %")

In [None]:
# Number of touchdown returns per possessionTeam, drawn as horizontal bars.
touchdowns_per_team = touch_downs_returns.groupby('possessionTeam').size()
ax = touchdowns_per_team.plot(kind='barh', figsize=(14,6), title="Kickoff returns for a TouchDown")
ax.set_ylabel('Kicking Team')

In [None]:
# Kickoffs whose specialTeamsResult is 'Kickoff Team Recovery', counted here as onside kicks.
onside_kick = kickoff.loc[kickoff['specialTeamsResult'] == 'Kickoff Team Recovery']

n_kickoffs = len(kickoff)
n_onside_kicks = len(onside_kick)
# Guard the percentage against an empty kickoff frame.
onside_rate = round(n_onside_kicks / n_kickoffs * 100, 3) if n_kickoffs else 0.0

print(f"number of kickoff: {n_kickoffs}")
print(f"number of onside kick: {n_onside_kicks}")
print(f"onside kick success rate: {onside_rate} %")

In [None]:
# Number of rows in onside_kick per possessionTeam, drawn as horizontal bars.
onside_per_team = onside_kick.groupby('possessionTeam').size()
ax = onside_per_team.plot(kind='barh', figsize=(14,6), title="Kickoff returns for a onside kick")
ax.set_ylabel('Kicking Team')

**The results above show the success rates of return touchdowns and onside kicks, which are the most exciting moments of the game.  
However, the success rates are too low to support any further analysis, so the only thing we can do is plot the frequency for each team.**

In [None]:
# Number of returned kickoffs per possessionTeam, drawn as horizontal bars.
returns_per_team = kickoff_return['possessionTeam'].value_counts()
returns_per_team.plot(kind='barh', figsize=(20,10), title='Return Times of Each Team')

**As mentioned above, taking a return is riskier than taking a touchback, but it can give the team some advantage.  
The following section analyses the return action.**

In [None]:
# Share of each possessionTeam's kickoffs that ended in a return, largest share first.
kick_return = kickoff_return['possessionTeam'].value_counts()
total = kickoff['possessionTeam'].value_counts()
return_rate = kick_return.div(total).sort_values(ascending=False)
ax = return_rate.plot(kind='barh', figsize=(20,10), title='Return rate of Each Team')
# Write each bar's value next to it.
ax.bar_label(ax.containers[0])

**The plot above shows that team DET has the highest probability (57.9%) of taking the return action, while team CAR has the lowest (14.2%).**

In [None]:
# Number of returned kickoffs per kickDirectionActual value, drawn as horizontal bars.
returns_per_direction = kickoff_return['kickDirectionActual'].value_counts()
returns_per_direction.plot(kind='barh', figsize=(14,6), title='Return Times of Each KickDirectionActual')

In [None]:
# Horizontal boxplots of playResult, one box per kickDirectionActual value.
ax = kickoff_return.boxplot(column='playResult', by='kickDirectionActual', figsize=(14,6), vert=False)
ax.set_xlabel(None)
ax.set_title('PlayResult of Each KickDirectionActual')

## Analysis of returning over 40 yards

In [None]:
# Returned kickoffs with playResult above 40.
# The mask is built from kickoff_return itself so its index matches the frame being filtered;
# a mask taken from the larger kickoff frame only works through implicit index alignment.
return_over40 = kickoff_return.loc[kickoff_return['playResult'] > 40]

**In the next part, we take a look at the plays that have a playResult over 40 yards.  
In this part, we define a successful return as a return over 40 yards, because taking a touchback guarantees the return team 40 yards of playResult.  
Therefore, considering a return over 40 yards as a successful return seems reasonable.**

In [None]:
# Fraction of returns with playResult over 40, per kickDirectionActual value, largest first.
over40 = return_over40['kickDirectionActual'].value_counts()
total = kickoff_return['kickDirectionActual'].value_counts()
over40_rate = over40.div(total).sort_values(ascending=False)
ax = over40_rate.plot(kind='barh', figsize=(14,6), title='Return over 40 yard success rate of Each KickDirectionActual')
# Write each bar's value next to it.
ax.bar_label(ax.containers[0])

In [None]:
# Fraction of returns with playResult over 40, per returnDirectionActual value, largest first.
over40 = return_over40['returnDirectionActual'].value_counts()
total = kickoff_return['returnDirectionActual'].value_counts()
# The title names the column that is actually plotted (returnDirectionActual).
ax = (over40 / total).sort_values(ascending=False).plot.barh(figsize=(14,6), title='Return over 40 yard success rate of Each ReturnDirectionActual')
# Write each bar's value next to it.
ax.bar_label(ax.containers[0])

In [None]:
# Fraction of returns with playResult over 40, per kickType, largest first.
total = kickoff_return['kickType'].value_counts()
# Kick types with no over-40 return are absent from the counts; give them an explicit 0
# so their rate is 0 rather than NaN after the division.
over40 = return_over40['kickType'].value_counts().reindex(total.index, fill_value=0)
ax = (over40 / total).sort_values(ascending=False).plot.barh(figsize=(14,6), title='Return over 40 yard success rate of Each KickType')
# Write each bar's value next to it.
ax.bar_label(ax.containers[0])

**The results above show that kicking the ball to the left or right lowers the return team's success rate.  
However, not kicking the ball to the center risks sending the ball out of bounds.  
Moreover, kicking the ball with an unusual kick type can also prevent the return team from returning over 40 yards.**

In [None]:
# Fraction of returns with playResult over 40, per possessionTeam, largest first.
over40 = return_over40['possessionTeam'].value_counts()
total = kickoff_return['possessionTeam'].value_counts()
over40_rate = over40.div(total).sort_values(ascending=False)
ax = over40_rate.plot(kind='barh', figsize=(20,10), title='Return over 40 yard success rate of Each Team')
# Write each bar's value next to it.
ax.bar_label(ax.containers[0])

**The plot above indicates that team HOU has the highest return success rate (65.7%), which means that they have difficulty stopping the returner before he gains 40 yards.  
On the other hand, if you are the return team, returning the ball is recommended when facing team HOU.**

In [None]:
# Kickoffs that have a recorded tackler (rows with a missing 'tackler' value are dropped).
tackle = kickoff.dropna(subset=["tackler"])

n_kickoffs = len(kickoff)
n_tackles = len(tackle)
# Guard the percentage against an empty kickoff frame.
tackle_rate = round(n_tackles / n_kickoffs * 100, 3) if n_kickoffs else 0.0

print(f"number of kickoff: {n_kickoffs}")
print(f"number of tackle: {n_tackles}")
# The printed label describes the tackle share that is actually computed here.
print(f"kickoff tackle rate: {tackle_rate} %")

---