In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from kaggle.competitions import nflrush

# make_env() can only be called once, so keep the returned handle in `env`.
# NOTE(review): `env` is not referenced by any other cell visible in this notebook.
env = nflrush.make_env()

In [None]:
# Load the competition training data into `train_df`.
TRAIN_CSV = '/kaggle/input/nfl-big-data-bowl-2020/train.csv'
train_df = pd.read_csv(TRAIN_CSV, low_memory=False)

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# train_df[train_df['PlayId']==20181007011551]

**1----- GameId - a unique game identifier**

In [None]:
# Number of rows recorded for each GameId.
train_df["GameId"].value_counts()

There are 512 games.

The number of records per game varies: the maximum is 1870 and the minimum is 616.

In [None]:
# The first six characters of GameId are compared as strings to find the earliest and latest value.
game_months = train_df["GameId"].astype(str).str[:6]
print("From ", game_months.min(), "To ", game_months.max())

The first 8 digits of GameId are probably the date of the game (YYYYMMDD); taking the first 6 digits (year and month), these games run from 201709 to 201812.

In [None]:
# Count distinct games per six-character GameId prefix and plot them in sorted order.
unique_game_ids = pd.Series(train_df["GameId"].unique())
games_per_month = unique_game_ids.astype(str).str[:6].value_counts().sort_index()
games_per_month.plot.barh()

But not every month has a game

**2----- PlayId - a unique play identifier**

In [None]:
# Each play is stored once per player on the field (22 rows), so the number of
# distinct PlayIds should match the row count divided by 22.
# The row count is taken from the frame itself instead of a hard-coded literal,
# so the check stays valid if the dataset is refreshed.
print(train_df["PlayId"].nunique())
print(len(train_df) // 22)

PlayId which also could be called "DownID" has 23,171 unique values.

In this dataset, a row represents a player (not a down, not a team). Since every down has 22 players ⇨ 23,171 × 22 = 509,762 (the length of the dataset).

**Attention:**  In this competition we need to predict "How many yards will an NFL player gain after receiving a handoff?". Since every 22 rows describe one down, those 22 rows always share the same result (Yards). It may be better to group the rows by play when modeling.

**3 -----Team - home or away**

In [None]:
# Keep only the rows that describe the ball carrier of each play:
# inner-join every player row against the (PlayId, NflIdRusher) pair of its play.
rusher_keys = train_df[['PlayId', 'NflIdRusher']].drop_duplicates()
player_rows = train_df[['PlayId', 'Team', 'NflId', 'Yards']]
team_df = player_rows.merge(
    rusher_keys,
    how='inner',
    left_on=['PlayId', 'NflId'],
    right_on=['PlayId', 'NflIdRusher'],
)

In [None]:
# Number of rusher rows for each value of Team.
team_df['Team'].value_counts().plot.barh(title='Offense times', figsize=(10, 2))

In [None]:
# Split the rusher rows once by side, then report the mean and median of Yards.
home_yards = team_df.loc[team_df["Team"] == 'home', 'Yards']
away_yards = team_df.loc[team_df["Team"] == 'away', 'Yards']
print('Average yards gained by home team : ', home_yards.mean())
print('Average yards gained by away team : ', away_yards.mean())
print('Median yards gained by home team :  ', home_yards.median())
print('Median yards gained by away team :  ', away_yards.median())

In [None]:
# Side-by-side histograms of Yards for the home and away rows of team_df.
f, axes = plt.subplots(1, 2, figsize=(10,4))
for ax, side, label in zip(axes, ('home', 'away'), ("Home", "Away")):
    side_yards = team_df.loc[team_df["Team"] == side, 'Yards']
    sns.distplot(side_yards, ax=ax, kde=False).set_title(label)
print("Gained Yards Distplot:")

**4 -----X - player position along the long axis of the field. See figure below.**

**5 -----Y - player position along the short axis of the field. See figure below.**

**6 -----S - speed in yards/second**

**7 -----A - acceleration in yards/second^2**

**8 -----Dis - distance traveled from prior time point, in yards**

**9 -----Orientation - orientation of player (deg)**

**10 -----Dir - angle of player motion (deg)**

In [None]:
# Summary statistics for the player-tracking columns.
tracking_cols = ['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir']
train_df[tracking_cols].describe()

Orientation and Dir have missing data (below)

In [None]:
# For each column, list the distinct players that have at least one missing value in it.
for angle_col in ("Orientation", "Dir"):
    is_missing = train_df[angle_col].isnull()
    affected_players = train_df.loc[is_missing, "DisplayName"].drop_duplicates().values
    print(angle_col + ": ", affected_players)

Dir seems a very important feature:

https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-nfl

The difference between Orientation and Dir:

Orientation is the direction in which the player is facing.

Dir is the direction in which the player is moving.

https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/111918#latest-649659

**Distplot:**

In [None]:
# One distribution plot per tracking column on a 2x4 grid (the last panel stays empty).
f, axes = plt.subplots(2, 4, figsize=(18,9))
panels = axes.ravel()

# These columns are plotted without any filtering.
for panel, col in zip(panels, ["X", "Y", "S", "A", "Dis"]):
    sns.distplot(train_df[col], ax=panel)

# Rows with a missing value are removed before plotting these two columns.
for panel, col in zip(panels[5:], ["Orientation", "Dir"]):
    sns.distplot(train_df.loc[train_df[col].notnull(), col], ax=panel)

Dis is the distance covered by each player in roughly the last 0.1 seconds of play

https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112246#latest-647281

**11----- NflId - a unique identifier of the player**

**12----- DisplayName - player's name**

NflId and name are almost a one-to-one match, except for Michael Thomas:

In [None]:
# Distinct (DisplayName, NflId) pairs for the name "Michael Thomas".
is_michael_thomas = train_df["DisplayName"] == "Michael Thomas"
train_df.loc[is_michael_thomas, ["DisplayName", "NflId"]].drop_duplicates()

**13----- JerseyNumber - jersey number**

In [None]:
# Histogram of jersey numbers (no density curve).
jersey_numbers = train_df["JerseyNumber"]
sns.distplot(jersey_numbers, kde=False)

**14----- Season - year of the season**

In [None]:
# Row count per season.
train_df["Season"].value_counts().plot.barh()

**15----- YardLine - the yard line of the line of scrimmage**

In [None]:
# Histogram of the line-of-scrimmage yard line (no density curve).
yard_lines = train_df["YardLine"]
sns.distplot(yard_lines, kde=False)

In [None]:
# Largest row count observed for any single YardLine value.
yardline_counts = train_df["YardLine"].value_counts()
print("The peak: ", yardline_counts.max())

In [None]:
# Collapse to one row per play (max within the play), then correlate YardLine with Yards.
yardline_per_play = train_df.groupby("PlayId")[["YardLine", "Yards"]].max()
yardline_per_play.corr()

corr between YardLine and Yards is 0.064551

**16----- Quarter - game quarter (1-5, 5 == overtime)**

In [None]:
# Row count per quarter.
train_df["Quarter"].value_counts().plot.barh()

In [None]:
# Collapse to one row per play (max within the play), then correlate Quarter with Yards.
quarter_per_play = train_df.groupby("PlayId")[["Quarter", "Yards"]].max()
quarter_per_play.corr()

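corr between Quarter and Yards is 0.064551 (TODO: this is the same figure quoted for YardLine above — verify it against the cell output); Quarter does not seem to be important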

**17----- GameClock - time on the game clock**

In [None]:
# Five most frequent GameClock values.
train_df["GameClock"].value_counts().head(5)

Why is 15:00:00 the most frequent value?

**18----- PossessionTeam - team with possession**

In [None]:
# Row count per team in possession.
train_df["PossessionTeam"].value_counts().plot.bar(figsize=(20, 5))

In [None]:
# De-duplicate to one row per play, then plot the mean Yards per possession team in ascending order.
(
    train_df[["PlayId", "PossessionTeam", "Yards"]]
    .drop_duplicates()
    .groupby(["PossessionTeam"])["Yards"]
    .mean()
    .sort_values()
    .plot.bar(figsize=(20, 5))
)

Mean yards of each Team

In [None]:
# De-duplicate to one row per play, then plot the median Yards per possession team in ascending order.
(
    train_df[["PlayId", "PossessionTeam", "Yards"]]
    .drop_duplicates()
    .groupby(["PossessionTeam"])["Yards"]
    .median()
    .sort_values()
    .plot.bar(figsize=(20, 5))
)

Median yards of each Team (mean looks better)

**19----- Down - the down (1-4)**

In [None]:
# Row count per down.
train_df["Down"].value_counts().plot.barh()

In [None]:
# De-duplicate to one row per play, then plot the mean Yards per down in ascending order.
(
    train_df[["PlayId", "Down", "Yards"]]
    .drop_duplicates()
    .groupby(["Down"])["Yards"]
    .mean()
    .sort_values()
    .plot.barh()
)

Mean yards for each down

**20----- Distance - yards needed for a first down**

In [None]:
# Row count per Distance value, ordered by distance.
train_df["Distance"].value_counts().sort_index().plot.bar(figsize=(20, 5))

In [None]:
# Collapse to one row per play (max within the play), then correlate Distance with Yards.
distance_per_play = train_df.groupby("PlayId")[["Distance", "Yards"]].max()
distance_per_play.corr()

corr : 0.071936

In [None]:
# Share of rows whose Distance exceeds 10 yards.
# The mean of a boolean Series is the fraction of True values; this stays
# vectorised instead of iterating the Series with the builtin sum().
print("Distance > 10 : ", (train_df["Distance"] > 10).mean())

**21----- FieldPosition - which side of the field the play is happening on**

In [None]:
# Row count per FieldPosition value.
train_df["FieldPosition"].value_counts().plot.bar(figsize=(20, 5))

**22----- HomeScoreBeforePlay - home team score before play started**

**23----- VisitorScoreBeforePlay - visitor team score before play started**

In [None]:
# Side-by-side histograms of the home and visitor scores before the play.
f, axes = plt.subplots(1, 2, figsize=(10,4))
for ax, score_col in zip(axes, ("HomeScoreBeforePlay", "VisitorScoreBeforePlay")):
    sns.distplot(train_df[score_col], ax=ax, kde=False)

**24----- NflIdRusher - the NflId of the rushing player (Ball carrier)**

In [None]:
# Number of distinct (non-null) rusher ids.
print(train_df["NflIdRusher"].nunique(), "unique players")

**25----- OffenseFormation - offense formation**

In [None]:
# Row count per offense formation.
train_df["OffenseFormation"].value_counts().plot.barh()

**26----- OffensePersonnel - offensive team positional grouping**

In [None]:
# Row count per offensive personnel grouping.
train_df["OffensePersonnel"].value_counts().plot.bar(figsize=(20, 5))

**27----- DefendersInTheBox - number of defenders lined up near the line of scrimmage, spanning the width of the offensive line**

In [None]:
# Row count per DefendersInTheBox value, with the index sorted in descending order.
defenders_counts = train_df["DefendersInTheBox"].value_counts()
defenders_counts.sort_index(ascending=False).plot.barh()

**28----- DefensePersonnel - defensive team positional grouping**


In [None]:
# Row count per defensive personnel grouping.
train_df["DefensePersonnel"].value_counts().plot.bar(figsize=(20, 5))

**29----- PlayDirection - direction the play is headed**


In [None]:
# Row count per play direction.
train_df["PlayDirection"].value_counts().plot.barh()

**30----- TimeHandoff - UTC time of the handoff**

**31----- TimeSnap - UTC time of the snap**


In [None]:
# First five rows of the handoff and snap timestamps.
train_df[["TimeHandoff", "TimeSnap"]].head(5)

**32----- Yards - the yardage gained on the play (you are predicting this)**

In [None]:
# Histogram of the target variable Yards (no density curve).
yards = train_df["Yards"]
sns.distplot(yards, kde=False)

There are a few outliers

**33----- PlayerHeight - player height (ft-in)**

**34----- PlayerWeight - player weight (lbs)**

In [None]:
# Side-by-side histograms of player height (converted to inches) and weight.
f, axes = plt.subplots(1, 2, figsize=(10,4))

# PlayerHeight is a "feet-inches" string. The inches part can have two digits
# (e.g. "5-11"), so split on "-" rather than taking single characters;
# reading only the last character would turn "5-11" into 5*12 + 1.
height_parts = train_df["PlayerHeight"].str.split("-", expand=True).astype(int)
height_inches = (height_parts[0] * 12 + height_parts[1]).rename("PlayerHeight")

sns.distplot(height_inches, kde=False, ax=axes[0])
sns.distplot(train_df["PlayerWeight"], kde=False, ax=axes[1])

Both PlayerHeight and PlayerWeight show two peaks, which probably correspond to different position groups (e.g. linemen vs. the lighter skill positions)

**35----- PlayerBirthDate - birth date (mm/dd/yyyy)**

Check it later

**36----- PlayerCollegeName - where the player attended college**

In [None]:
# Row count per college.
train_df["PlayerCollegeName"].value_counts().plot.bar(figsize=(20, 5))

**37----- Position**


In [None]:
# Row count per player position.
train_df["Position"].value_counts().plot.bar(figsize=(20, 5))

**38----- HomeTeamAbbr - home team abbreviation**


In [None]:
# Row count per home team abbreviation.
train_df["HomeTeamAbbr"].value_counts().plot.bar(figsize=(20, 5))

**39----- VisitorTeamAbbr - visitor team abbreviation**


In [None]:
# Row count per visitor team abbreviation.
train_df["VisitorTeamAbbr"].value_counts().plot.bar(figsize=(20, 5))

**40----- Week - week into the season**

In [None]:
# Histogram of the week of the season (no density curve).
weeks = train_df["Week"]
sns.distplot(weeks, kde=False)

**41----- Stadium - stadium where the game is being played**


In [None]:
# Row count per stadium.
train_df["Stadium"].value_counts().plot.bar(figsize=(20, 5))

**42----- Location - city where the game is being played**


In [None]:
# Row count per location.
train_df["Location"].value_counts().plot.bar(figsize=(20, 5))

**43----- StadiumType - description of the stadium environment**


In [None]:
# Row count per stadium type.
train_df["StadiumType"].value_counts().plot.bar(figsize=(20, 5))

**44----- Turf - description of the field surface**


In [None]:
# Row count per field surface.
train_df["Turf"].value_counts().plot.bar(figsize=(20, 5))

**45----- GameWeather - description of the game weather**


In [None]:
# Row count per raw GameWeather description.
train_df["GameWeather"].value_counts().plot.bar(figsize=(20, 5))

A little dirty; it needs to be cleaned

**46----- Temperature - temperature (deg F)**

**47----- Humidity - humidity**


In [None]:
# Side-by-side histograms of Temperature and Humidity.
# Missing readings are dropped rather than filled with 0: a fill value of 0
# would show up as a spike that cannot be told apart from genuine 0 readings.
# The number of missing rows is printed instead so the gaps stay visible.
f, axes = plt.subplots(1, 2, figsize=(10,4))
for ax, weather_col in zip(axes, ("Temperature", "Humidity")):
    readings = train_df[weather_col]
    print(weather_col, "missing rows:", readings.isnull().sum())
    sns.distplot(readings.dropna(), kde=False, ax=ax)

Both Temperature and Humidity have missing data


**48----- WindSpeed - wind speed in miles/hour**


In [None]:
# Row count per raw WindSpeed value.
train_df["WindSpeed"].value_counts().plot.bar(figsize=(20, 5))

Very dirty; it needs to be cleaned

**49----- WindDirection - wind direction**

In [None]:
# Row count per raw WindDirection value.
train_df["WindDirection"].value_counts().plot.bar(figsize=(20, 5))

Very dirty; it needs to be cleaned