#                                            NFL BIG DATA BOWL 2022


                                                     TEAM : Data Dictators
                                                     
                                                            Members
                                                      
                                                   1. Soumya Sourav Behera (Leader)
                                                   2. J. Venkateswar Achary

# Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.patches as patches
import seaborn as sns
from ipywidgets import interact
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# PFF Scouting data

In [None]:
#loading pffscouting data
pffs = pd.read_csv('../input/nfl-big-data-bowl-2022/PFFScoutingData.csv')
#checking its shape
pffs.shape

In [None]:
#checking 1st 5 rows
pffs.head()

In [None]:
#checking summary
pffs.info()

In [None]:
#checking for null values
pffs.isnull().sum()

In [None]:
# unique games played can be determined by the game ID.
pffs.gameId.nunique()

In [None]:
#Play identifier, not unique across games (numeric)
pffs.playId.nunique()

In [None]:
# Frequency table of snap details, most common first.
print("Unique NFL Snap Details values and their counts :")
# Group on the snap detail and count the rows in each group; rows whose
# snapDetail is missing are left out of the grouping.
snap_detail_sizes = pffs.groupby(['snapDetail']).size()
pffs_sd = snap_detail_sizes.reset_index()
pffs_sd.columns = ["Snap Details", "Counts"]
pffs_sd = pffs_sd.sort_values(by="Counts", ascending=False)
print(pffs_sd)

In [None]:
px.bar(pffs_sd, x="Snap Details", y="Counts", title="Distinct Count Of Snap Details", color="Snap Details")

*Observation*
- Most snaps are accurate (snap detail `OK`).

In [None]:
fig = px.scatter(pffs, x='hangTime', y='operationTime')

fig.update_traces(marker=dict(size=6,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.show()

*Observation:*
1. Operation time lies between 1.5 and 2.5 seconds.
2. Hang time varies from 2 to 5 seconds.

In [None]:
# Counting different types of kick played.
# The axis and the count column are named explicitly so the table always has
# the columns 'Type_of_kick' and 'Count'. Relying on the default labels of
# value_counts().reset_index() is fragile because those labels differ between
# pandas versions ('index'/'kickType' vs 'kickType'/'count'); with explicit
# names the rename in the following cell simply finds nothing left to rename.
Kick_type = (
    pffs['kickType']
    .value_counts()
    .rename_axis('Type_of_kick')
    .reset_index(name='Count')
)

In [None]:
Kick_type.rename(columns = {'index' : 'Type_of_kick', 'kickType' : 'Count'}, inplace=True)

In [None]:
Kick_type

In [None]:
# Creating and printing of a pie chart of different types of kick count played in the game
fig= px.pie(Kick_type, values='Count', names='Type_of_kick', title='count of Kick Type',width=800,height=600)
fig.show()

*Observation:*
1. More than half of the kicks are normal deep kicks with decent hang time (the most common kick type), denoted by D.
2. Surprise kicks are the rarest, as the name suggests.

# Players Data

In [None]:
#loading the players dataset
players = pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')

#checking the shape of players dataset
players.shape

In [None]:
#checking the first 5 rows of the dataset
players.head()

In [None]:
#checking the summary of the dataset
players.info()

*Observation*
1. Null values are present in 'birthDate', 'collegeName' columns
2. Height column is in object Dtype
3. birthDate is in object Dtype

In [None]:
#dropping the null values present in the dataset
players = players.dropna()

#rechecking the shape of the dataset after dropping the null values
players.shape

In [None]:
#checking the counts of unique values
players['height'].value_counts()

*Observation*

1. Some of the values are in feet-inch format and some are in inch format

In [None]:
#Parsing object as string and splitting height values into feet and inch
height_new = players["height"].str.split("-", n = 1, expand = True)
height_new.columns = ['Feet', 'Inch']
players["height_feet"] = pd.to_numeric(height_new["Feet"])
players["height_inch"] = pd.to_numeric(height_new["Inch"])

In [None]:
# Normalise player heights to decimal feet.
# After the split on "-", height_feet holds either the feet part of a
# "feet-inches" value or, for heights written as a single number, the total
# inches; height_inch holds the inch part, or NaN when there was none.
INCHES_PER_FOOT = 12

# Column - height_feet
# A value above 7 is treated as a height in inches. mask() converts all such
# rows in one vectorised step instead of calling replace(inplace=True) once
# per row while iterating over the very column being modified.
feet = players["height_feet"].astype(float)
players["height_feet"] = feet.mask(feet > 7, feet / INCHES_PER_FOOT)

# Column - height_inch
# Convert the inch part to feet (exact division by 12 rather than the
# truncated factor 0.0833333); rows without an inch part contribute 0.
players["height_inch"] = (players["height_inch"] / INCHES_PER_FOOT).fillna(0)

# combining two height columns
players["height_feet"] = players["height_feet"] + players["height_inch"]




In [None]:
#dropping height and height_inch column
players = players.drop(['height','height_inch'], axis=1)

In [None]:
#rechecking top 5 rows
players.head()

In [None]:
#rounding up players height upto 1 decimal
players['height_feet'] = players['height_feet'].round(decimals = 1)

In [None]:
#convering dtype of birthdate object 
players["birthDate"] = pd.to_datetime(players["birthDate"])

In [None]:
#creating a 'birthYear' column
players['birthYear'] = pd.DatetimeIndex(players['birthDate']).year
players["birthYear"].value_counts()

In [None]:
#rechecking the dataset
players.head()

In [None]:
#exploring statistical informations
players.describe()

In [None]:
#checking for outliers
# Box plots are only meaningful for numeric measurements, so restrict the loop
# to numeric columns and leave out the nflId identifier rather than drawing a
# plot for every column (names, colleges, dates, ...).
numeric_columns = players.select_dtypes(include="number").columns.drop("nflId", errors="ignore")
for column in numeric_columns:
    fig = px.box(players, x=column, width=800, height=400)
    fig.show()

In [None]:
#Vizualizing player's height by histogram
px.histogram(players, x="height_feet", title="Player's Height In Feet", nbins=10, color_discrete_sequence=["coral"])

*Observation*
1. Most players are between 6.2 and 6.3 ft tall
2. Only 6 players are below 5.5 ft

In [None]:
# Vizualizing birthyear by histogram plot
px.histogram(players, x="birthYear", title="Birth Year Of Players", color_discrete_sequence=["darkred"])

*Observation*
1. Only 7 players were born between 1972 and 1980
2. Players born in 1999 are the youngest

In [None]:
# Creating Histogram plot to vizualize weight of the players
px.histogram(players, x="weight", nbins=10,title="Player's Weight In lbs (pounds)", color_discrete_sequence=["darkblue"])


*Observation*
1. Most players weigh between 200 and 249 lbs

In [None]:
print("Unique college names and their counts : ")
college_names = players.pivot_table(index = ['collegeName'], aggfunc = 'size') 
college_names = college_names.reset_index()
college_names.columns= ["College Names", "Counts"]
college_names = college_names.sort_values("Counts", ascending = False)
print(college_names)

In [None]:
# Creating Tree map plot for vizualizing colleges
px.treemap(college_names, path=['College Names'], values='Counts', color='Counts', title="Arrangement Of Colleges From Highest To Lowest Number Of Players")


In [None]:
#creating dataframe for top10 colleges
top_colleges = college_names[0:10]

In [None]:
#vizualizing top10 collleges using piechart
px.pie(top_colleges, values='Counts', names='College Names', title='Top 10 Colleges Having Higher Number Of Players')


*Observation*:
1. 13.5% (most) players are from Alabama college

In [None]:
#replacing abbreviated position codes with their full names
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on players["Position"] acts on an intermediate object and is not guaranteed
# to update `players` itself (it does not under pandas Copy-on-Write).
position_names = {
    "WR": "Wide Receiver", "CB": "Corner Back", "DE": "Defensive End", "OLB": "Outside Linebacker",
    "TE": "Tight End", "T": "Tackle", "RB": "Running Back", "G": "Guard", "DT": "Defensive Tackle",
    "ILB": "Inside Linebacker", "FS": "Free Safety", "SS": "Strong Safety", "C": "Center", "K": "Kicker",
    "P": "Punter", "NT": "Nose Tackle", "LS": "Long Snapper", "LB": "Linebacker", "DB": "Defensive Back",
    "MLB" : "Middle Linebacker", "FB": "Full Back", "OT": "Offensive Tackle", "QB": "Quarter Back",
    "OG": "Offensive guard", "S": "Safety", "HB": "Half Back",
}
players["Position"] = players["Position"].replace(position_names)
players["Position"]

In [None]:
print("Unique position values and their counts :")
pos_val = players.pivot_table(index = ['Position'], aggfunc = 'size') 
pos_val = pos_val.reset_index()
pos_val.columns= ["Positions", "Counts"]
pos_val = pos_val.sort_values("Counts", ascending = False)
print(pos_val)

In [None]:
#  Creating Bar plot to vizualize position
px.bar(pos_val, x="Positions", y="Counts", title="Arrangement Of Positions From Frequently Used To Least Used", color=pos_val["Counts"])


*Observation*
1. Most players are available for the wide receiver and corner back positions
2. Very few players are available for the half back, safety and offensive guard positions

In [None]:
@interact
def summary(Positions = list(players['Position'].value_counts().index)):
    """Print the minimum, average and maximum of each remaining column for one position.

    Positions: the position to summarise; the default list of all positions
    found in ``players`` is handed to ``interact`` together with the function.
    """
    # Rows of the chosen position only.
    chosen = players[players['Position'] == Positions]
    # Columns left once the identifier / text / date columns are dropped.
    remaining = players.drop(
        columns=['nflId','collegeName','displayName','Position','birthDate','birthYear']
    ).columns
    for column in remaining:
        values = chosen[column]
        print('Minimum', column, values.min())
        print('Average', column, values.mean())
        print('Maximum', column, values.max())
        print('--------------------------------------------------')

In [None]:
print('Positions which requires tall players', players[players['height_feet']>6.5]['Position'].unique())
print('Position which requires players having high in weight:', players[players['weight']>350]['Position'].unique())

# Games Data

In [None]:
#loading the dataset
games = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')

#checking the shape of the dataset
games.shape

In [None]:
#checking first five rows
games.head()

In [None]:
#Concating elements of columns gameDate and gameTimeEastern
games['date']=games.gameDate.str.cat(games.gameTimeEastern)
games.head()

In [None]:
#checking the summary
games.info()

*Observation*
1. "gameDate" and "date" are of object dtype

In [None]:
# converting the object to datetime data type 
games['date'] = pd.to_datetime(games['date'], format="%m/%d/%Y%H:%M:%S")

In [None]:
# creating a table for number of games played in particular season
Season = games.pivot_table(index =['season'],aggfunc ='size')
Season = Season.reset_index()
Season.columns =['Seasons','Total games']
Season = Season.sort_values('Total games',ascending = False)
print(Season)

In [None]:
#creating a pie chart of the total game count per season using plotly
fig = px.pie(Season,names= 'Seasons',values ='Total games',title = 'SEASONS',width=500, height=500)
fig.show()

*Observation*
1. Highest games are played in 2020.
2. Lowest in 2018.
3. Although they are almost equal to each other.

In [None]:
# Creating a table for Unique NFL weeks and their counts
print("Unique NFL weeks and their counts :")
g_week = games.pivot_table(index = ['week'], aggfunc = 'size') 
g_week = g_week.reset_index()
g_week.columns= ["Weeks", "Counts"]
g_week = g_week.sort_values("Counts", ascending = False)
print(g_week)

In [None]:
# Creating Bar plot of Weeks vs Counts
px.bar(g_week, x="Weeks", y="Counts", title="NFL Weeks", color="Weeks")

*Observation*
1. The number of games per week starts high and then drops significantly until week 11 (41 games).
2. After week 11 the count rises again, reaching the highest value (48 games).

In [None]:
# Creating a table for Unique NFL dates and their Counts
print("Unique NFL dates and their counts :")
g_date = games.pivot_table(index = ['gameDate'], aggfunc = 'size') 
g_date = g_date.reset_index()
g_date.columns= ["Date", "Counts"]
g_date = g_date.sort_values("Counts", ascending = False)
print(g_date)

In [None]:
# Creating Bar plot
px.bar(g_date, x="Date", y="Counts", title='Dates On Which NFL Events Occurred', color="Counts")


*Observation*
1. The most games were played on just three days: 01/03/2021, 12/29/2018 and 12/30/2018.
2. Most days (about half of all days on which games were held) have only a single game.

In [None]:
print("Data type of gameDate column before parsing : ", games["gameDate"].dtypes)
games["gameDate"] = pd.to_datetime(games["gameDate"])
print("Data type of gameDate column after parsing : ", games["gameDate"].dtypes)
print(games["gameDate"].head())

In [None]:
games['gameYear'] = pd.DatetimeIndex(games['gameDate']).year
print(games["gameYear"])

In [None]:
# Creating a table for different years vs total games played on those years to check the correctness of data given for seasons
print("Unique NFL years and their counts :")
g_year = games.pivot_table(index = ['gameYear'], aggfunc = 'size') 
g_year = g_year.reset_index()
g_year.columns= ["Year", "Counts"]
g_year = g_year.sort_values("Counts", ascending = False)
print(g_year)

In [None]:
# Creating Bar plot
px.bar(g_year, x="Year", y="Counts", title="Distinct Count Of NFL Event Years", color="Year",width=800, height=600)

*Observation*
1. In some rows the season and the game date do not match: games were played in 2021, but there is no season information given for those dates.
2. We can conclude that *some games of the 2020 season were played in 2021*, due to some problem or crisis, most probably COVID-19.

In [None]:
games["gameMonthName"] = games["gameDate"].dt.month_name()
print(games["gameMonthName"])

In [None]:
print("Unique NFL months and their counts :")
g_month = games.pivot_table(index = ['gameMonthName'], aggfunc = 'size') 
g_month = g_month.reset_index()
g_month.columns= ["Month", "Counts"]
g_month = g_month.sort_values("Counts", ascending = False)
print(g_month)

In [None]:
# Creating Bar plot
px.bar(g_month, x="Month", y="Counts", title="Distinct Count Of NFL Event Months", color="Month",width=900, height=600)

*Observation:*
1. Most games are played in December.
2. The fewest games are played in January (these could be games that, due to some problem, were played in the year after their season).

In [None]:
print("Unique NFL timings and their counts :")
g_time = games.pivot_table(index = ['gameTimeEastern'], aggfunc = 'size') 
g_time = g_time.reset_index()
g_time.columns= ["Time", "Counts"]
g_time = g_time.sort_values("Counts", ascending = False)
print(g_time)

In [None]:
# Creating Bar plot
px.bar(g_time, x="Time", y="Counts", title="Distinct Count Of NFL Event Timings", color="Counts",width=800, height=600)


*Observation*
1. Most of the games are played at 1 pm.
2. From this we can conclude that games are mostly played and enjoyed in the afternoon (no games are played in the morning).

# Plays Data

In [None]:
#loading dataset
# Show every column when a wide frame is displayed. set_option is called
# through the public pd.set_option entry point; pd.pandas is not part of the
# documented API.
pd.set_option('display.max_columns', None)
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')

In [None]:
#checking shape
plays.shape

In [None]:
#checking top 5 rows
plays.head()

In [None]:
#checking summary of the dataset
plays.info()

In [None]:
#checking for null values
plays.isnull().sum()

In [None]:
#checking statistical distibutions
plays.describe()

In [None]:
#plotting a correlation map
# Pearson correlation is only defined for numeric columns. They are selected
# explicitly because recent pandas versions raise on text columns instead of
# silently dropping them.
fig = plt.figure(figsize=(14, 10))
corr = plays.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(corr, cmap='RdBu_r', annot=True)
plt.show()

*Observation*
1. PlayResult -vely correlated with kickReturnYardage and +vely correlated with kicklength, penaltyYards, yardlineNumber.
2. kickReturnYardage is -vely correlated with down, yardsToGo and +vely correlated with kickLength
3. kickLength is -vely correlated with down, yardsToGo
4. preSnapVisitorScore is +vely correlated with preSnapHomeScore and quarter

In [None]:
print("Unique NFL game quarter values and their counts :")
play_qrtr = plays.pivot_table(index = ['quarter'], aggfunc = 'size') 
play_qrtr = play_qrtr.reset_index()
play_qrtr.columns= ["Quarter", "Counts"]
play_qrtr = play_qrtr.sort_values("Counts", ascending = False)
print(play_qrtr)

In [None]:
# Creating Bar plot to vizualize quarter
px.bar(play_qrtr, x="Quarter", y="Counts", title="Distinct Count Of NFL Game Quarters", color="Quarter")


*Observation*
1. Quarter 5 has far fewer plays than the other quarters

In [None]:
print("Unique NFL game downs and their counts :")
g_down = plays.pivot_table(index = ['down'], aggfunc = 'size') 
g_down = g_down.reset_index()
g_down.columns= ["Down", "Counts"]
g_down = g_down.sort_values("Counts", ascending = False)
print(g_down)

In [None]:
# Creating Bar plot to vizualize down
px.bar(g_down, x="Down", y="Counts", title="Distinct Count Of NFL Game Downs", color="Down")


In [None]:
print("Unique NFL yards to go and their counts :")
g_yards = plays.pivot_table(index = ['yardsToGo'], aggfunc = 'size') 
g_yards = g_yards.reset_index()
g_yards.columns= ["Yards To Go", "Counts"]
g_yards = g_yards.sort_values("Counts", ascending = False)
print(g_yards)

In [None]:
# Creating Bar plot
px.bar(g_yards, x="Yards To Go", y="Counts", title="Distinct Count Of Yards To Go In NFL Game", color="Yards To Go")


In [None]:
# Replace team abbreviations in possessionTeam with full team names.
# The result is assigned back to the column: replace(..., inplace=True) on
# plays["possessionTeam"] acts on an intermediate object and is not guaranteed
# to update `plays` itself (it does not under pandas Copy-on-Write).
possession_team_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}
plays["possessionTeam"] = plays["possessionTeam"].replace(possession_team_names)

In [None]:
print("Unique NFL possession teams and their counts :")
p_team = plays.pivot_table(index = ['possessionTeam'], aggfunc = 'size') 
p_team = p_team.reset_index()
p_team.columns= ["Possession Team", "Counts"]
p_team = p_team.sort_values("Counts", ascending = False)
print(p_team)

In [None]:
# Creating Bar plot
px.bar(p_team, x="Possession Team", y="Counts", title="Distinct Count Of NFL Possession Teams", color="Counts")

*Observation*
1. Las Vegas Raiders and Oakland Raiders have very low Possession team counts

In [None]:
print("Unique NFL special teams play type and their counts :")
gp_type = plays.pivot_table(index = ['specialTeamsPlayType'], aggfunc = 'size') 
gp_type = gp_type.reset_index()
gp_type.columns= ["Special Teams Play Type", "Counts"]
gp_type = gp_type.sort_values("Counts", ascending = False)
print(gp_type)

In [None]:
# Creating Bar plot
px.bar(gp_type, x="Special Teams Play Type", y="Counts", title="Distinct Count Of NFL Special Teams Play Type", color="Counts")


*Observation*
1. Kickoff playtype has highest counts (nearly 39%)
2. Field goal has very low on count as compared to others

In [None]:
print("Unique NFL special teams result and their counts :")
gp_res = plays.pivot_table(index = ['specialTeamsResult'], aggfunc = 'size') 
gp_res = gp_res.reset_index()
gp_res.columns= ["Special Teams Result", "Counts"]
gp_res = gp_res.sort_values("Counts", ascending = False)
print(gp_res)

In [None]:
# Creating Bar plot
px.bar(gp_res, x="Special Teams Result", y="Counts", title="Distinct Count Of NFL Special Teams Result", color="Counts")


*Observation*
1. "Kickoff Team Recovery", "Blocked Punt", "Blocked Kick Attempt" have worst results
2. "Downed", "Out of Bounds", "Kick Attempt No Good" results are not pretty good

In [None]:
# Replace team abbreviations in yardlineSide with full team names.
# The result is assigned back to the column: replace(..., inplace=True) on
# plays["yardlineSide"] acts on an intermediate object and is not guaranteed
# to update `plays` itself (it does not under pandas Copy-on-Write).
yardline_side_names = {
    "ARI": "Arizona Cardinals", "KC": "Kansas City Chiefs", "TEN": "Tennessee Titans", "TB": "Tampa Bay Buccaneers",
    "SF": "San Francisco 49ers", "SEA": "Seattle Seahawks", "PIT": "Pittsburgh Steelers", "PHI": "Philadelphia Eagles",
    "NYJ": "New York Jets", "NYG": "New York Giants", "NO": "New Orleans Saints", "NE": "New England Patriots",
    "MIA": "Miami Dolphins", "ATL": "Atlanta Falcons", "LA": "Los Angeles Rams", "JAX": "Jacksonville Jaguars",
    "IND": "Indianapolis Colts", "BAL": "Baltimore Ravens", "BUF": "Buffalo Bills", "CAR": "Carolina Panthers",
    "CIN": "Cincinnati Bengals", "CLE": "Cleveland Browns", "DAL": "Dallas Cowboys", "DET": "Detroit Lions",
    "GB": "Green Bay Packers", "HOU": "Houston Texans", "WAS": "Washington Football Team", "DEN": "Denver Broncos",
    "MIN": "Minnesota Vikings", "CHI": "Chicago Bears", "LAC": "Los Angeles Chargers", "OAK": "Oakland Raiders",
    "LV": "Las Vegas Raiders",
}
plays["yardlineSide"] = plays["yardlineSide"].replace(yardline_side_names)

In [None]:
print("Unique NFL yardline sides and their counts :")
g_yard = plays.pivot_table(index = ['yardlineSide'], aggfunc = 'size') 
g_yard = g_yard.reset_index()
g_yard.columns= ["Yard Line Side", "Counts"]
g_yard = g_yard.sort_values("Counts", ascending = False)
print(g_yard)

In [None]:
# Creating Bar plot
px.bar(g_yard, x="Yard Line Side", y="Counts", title="Distinct Count Of NFL Yardline Sides", color="Counts")

In [None]:
print("Unique NFL yardline numbers and their counts :")
g_yard = plays.pivot_table(index = ['yardlineNumber'], aggfunc = 'size') 
g_yard = g_yard.reset_index()
g_yard.columns= ["Yard Line Number", "Counts"]
g_yard = g_yard.sort_values("Counts", ascending = False)
print(g_yard)

In [None]:
# Creating Bar plot
px.bar(g_yard, x="Yard Line Number", y="Counts", title="Distinct Count Of NFL Yard Line Numbers", color="Yard Line Number")

In [None]:
# Parse gameClock with an HH:MM:SS format and keep the leading ("hour") field.
# NOTE(review): gameClock presumably holds the time left in the quarter as
# MM:SS, in which case this "hour" is really the minutes remaining — confirm
# against the data dictionary before interpreting the counts below.
plays["gameClockhour"] = pd.to_datetime(plays["gameClock"], format="%H:%M:%S").dt.hour
print(plays["gameClockhour"])

In [None]:
print("Unique NFL game hours and their counts :")
g_hour = plays.pivot_table(index = ['gameClockhour'], aggfunc = 'size') 
g_hour = g_hour.reset_index()
g_hour.columns= ["Game Hour", "Counts"]
g_hour = g_hour.sort_values("Counts", ascending = False)
print(g_hour)

In [None]:
# Creating Bar plot
px.bar(g_hour, x="Game Hour", y="Counts", title="Distinct Count Of NFL Game Hours", color="Counts")


In [None]:
print("Unique NFL penalty codes and their counts :")
g_code = plays.pivot_table(index = ['penaltyCodes'], aggfunc = 'size') 
g_code = g_code.reset_index()
g_code.columns= ["Penalty Codes", "Counts"]
g_code = g_code.sort_values("Counts", ascending = False)
print(g_code)

In [None]:
# Creating Bar plot
px.bar(g_code, x="Penalty Codes", y="Counts", title="Distinct Count Of NFL Penalty Codes", color="Counts")

In [None]:
print("Unique NFL penalty yards and their counts :")
gp_yard = plays.pivot_table(index = ['penaltyYards'], aggfunc = 'size') 
gp_yard = gp_yard.reset_index()
gp_yard.columns= ["Penalty Yards", "Counts"]
gp_yard = gp_yard.sort_values("Counts", ascending = False)
print(gp_yard)

In [None]:
# Creating Histogram plot of the penalty yards
# The title now names the plotted column; it previously read "Visitor Teams".
px.histogram(plays, x="penaltyYards", nbins=15 , title="Distinct Count Of NFL Penalty Yards", color_discrete_sequence=["royalblue"])


In [None]:
print("Unique NFL pre-snap home scores and their counts :")
g_home = plays.pivot_table(index = ['preSnapHomeScore'], aggfunc = 'size') 
g_home = g_home.reset_index()
g_home.columns= ["Pre-Snap Home Score", "Counts"]
g_home = g_home.sort_values("Counts", ascending = False)
print(g_home)

In [None]:
# Creating Histogram plot
px.histogram(plays, x="preSnapHomeScore", nbins=25, title="Distinct Count Of NFL Pre-Snap Home Scores", color_discrete_sequence=["red"])


In [None]:
print("Unique NFL pre-snap visitor scores and their counts :")
g_visitor = plays.pivot_table(index = ['preSnapVisitorScore'], aggfunc = 'size') 
g_visitor = g_visitor.reset_index()
g_visitor.columns= ["Pre-Snap Visitor Score", "Counts"]
g_visitor = g_visitor.sort_values("Counts", ascending = False)
print(g_visitor)

In [None]:
 #Creating Histogram plot
px.histogram(plays, x="preSnapVisitorScore", nbins=25, title="Distinct Count Of NFL Pre-Snap Visitor Scores", color_discrete_sequence=["blueviolet"])


In [None]:
print("Unique NFL pass results and their counts :")
g_res = plays.pivot_table(index = ['passResult'], aggfunc = 'size') 
g_res = g_res.reset_index()
g_res.columns= ["Pass Results", "Counts"]
g_res = g_res.sort_values("Counts", ascending = False)
print(g_res)

In [None]:
# Creating Bar plot
px.bar(g_res, x="Pass Results", y="Counts", title="Distinct Count Of NFL Pass Results", color="Counts")


In [None]:
print("Unique NFL kick lengths and their counts :")
g_kick = plays.pivot_table(index = ['kickLength'], aggfunc = 'size') 
g_kick = g_kick.reset_index()
g_kick.columns= ["Kick Length", "Counts"]
g_kick = g_kick.sort_values("Counts", ascending = False)
print(g_kick)

In [None]:
# Creating Bar plot
px.bar(g_kick, x="Kick Length", y="Counts", title="Distinct Count Of NFL Kick lengths", color="Kick Length")


In [None]:
print("Unique NFL kick return yardages and their counts :")
g_yard = plays.pivot_table(index = ['kickReturnYardage'], aggfunc = 'size') 
g_yard = g_yard.reset_index()
g_yard.columns= ["Kick Return Yardage", "Counts"]
g_yard = g_yard.sort_values("Counts", ascending = False)
print(g_yard)

In [None]:
# Creating Histogram plot
px.histogram(plays, x="kickReturnYardage", nbins=25 , title="Distinct Count Of NFL Kick Return Yardages",  color_discrete_sequence=["mediumvioletred"])


In [None]:
print("Unique NFL play results and their counts :")
gp_res = plays.pivot_table(index = ['playResult'], aggfunc = 'size') 
gp_res = gp_res.reset_index()
gp_res.columns= ["Play Result", "Counts"]
gp_res = gp_res.sort_values("Counts", ascending = False)
print(gp_res)

In [None]:
# Creating Histogram plot
px.histogram(plays, x="playResult", nbins=10, title="Distinct Count Of NFL Play Results", color_discrete_sequence=["blue"])


In [None]:
print("Unique NFL absolute yardline numbers and their counts :")
g_abyl = plays.pivot_table(index = ['absoluteYardlineNumber'], aggfunc = 'size') 
g_abyl = g_abyl.reset_index()
g_abyl.columns= ["Absolute YardLine Number", "Counts"]
g_abyl = g_abyl.sort_values("Counts", ascending = False)
print(g_abyl)

In [None]:
# Creating Bar plot
px.bar(g_abyl, x="Absolute YardLine Number", y="Counts", title="Distinct Count Of NFL Absolute Yard Line Number", color="Absolute YardLine Number")


# Tracking2018 Data

In [None]:
# loading the tracking2018 dataset
tracking_2018 = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2018.csv")

In [None]:
#checking its shape
tracking_2018.shape

In [None]:
#cheking 1st 5 rows
tracking_2018.head()

In [None]:
# Fetching the information of each column of tracking_2018 data frame
tracking_2018.info()

*Observation*
1. Dtype of 'time' column is object

In [None]:
# Counting the Null Values
tracking_2018.isnull().sum()

*Observation*
1. so many null values are present in 'o', 'dir', 'nflId', 'jerseyNumber' and 'position' column

In [None]:
# splitting date from datetime
tracking_2018['date'] = pd.DatetimeIndex(tracking_2018['time']).date
print(tracking_2018["date"])

In [None]:
#Listing out unique values from date column
print("Unique NFL dates and their counts :")
tr18_date = tracking_2018.pivot_table(index = ['date'], aggfunc = 'size') 
tr18_date = tr18_date.reset_index()
tr18_date.columns= ["Date", "Counts"]
tr18_date = tr18_date.sort_values("Counts", ascending = False)
print(tr18_date)

In [None]:
# Visualizing NFL dates by Bar plot
px.bar(tr18_date, x="Date", y="Counts", title="Distinct Count Of NFL Dates", color="Counts")

*Observation*
1. Sunday is the favoured day for holding games.
2. Most games were played on 30th Dec 2018.

In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of playId == 36 and gameId == 2018123000
data_18= tracking_2018.query('playId == 36 and gameId == 2018123000')
print(data_18[["x", "y", "team"]])

In [None]:
# players location throughout the game
px.scatter(data_18, x="x", y="y", color="team",width=1000,height=800)

In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of playId == 36 and gameId == 2018102107
data_18 = tracking_2018.query('playId == 36 and gameId == 2018102107')
print(data_18[["x", "y", "team"]])

In [None]:
# players location throughout the game
px.scatter(data_18, x="x", y="y", color="team",width=1000,height=800)


In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of position == "CB" and gameId == 2018111900
data_18 = tracking_2018.query('position == "CB" and gameId == 2018111900')
print(data_18[["x", "y", "team"]])

In [None]:
# players location throughout the game
px.scatter(data_18, x="x", y="y", color="team",width=1000,height=800)

In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of position == "RB" and gameId == 2018093009
data_18 = tracking_2018.query('position == "RB" and gameId == 2018093009')
print(data_18[["x", "y", "team"]])

In [None]:
# players location throughout the game
px.scatter(data_18, x="x", y="y", color="team",width=1000,height=800)

In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of position == "LB" and gameId == 2018111900
data_18 = tracking_2018.query('position == "LB" and gameId == 2018111900')
print(data_18[["x", "y", "team"]])

In [None]:
# players location throughout the game
px.scatter(data_18, x="x", y="y", color="team",width=1000,height=800)

In [None]:
tracking_2018['date'] = pd.DatetimeIndex(tracking_2018['date'])
rslt_df = tracking_2018[tracking_2018['date'] == '2018-10-09']
print(rslt_df.head())

In [None]:
# Animation of a game done using coordinates and time 
# this animation shows the game played in 2018-10-09
fig = px.scatter(rslt_df, x="x", y="y", animation_frame="time", color="team")
fig.show()

In [None]:
rslt_df = tracking_2018[tracking_2018['date'] == '2018-11-30']
print(rslt_df.head())

In [None]:
#this animation shows the game played in 2018-11-30
fig = px.scatter(rslt_df, x="x", y="y", animation_frame="time", color="team")
fig.show()

# Tracking2019 Data

In [None]:
#loading the dataset
tracking_2019 = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2019.csv")

In [None]:
#checking shape of the dataset
tracking_2019.shape

In [None]:
#checking summary of the dataset
tracking_2019.info()

*Observation*
1. Dtype of 'time' column is object

In [None]:
tracking_2019['date'] = pd.DatetimeIndex(tracking_2019['time']).date
print(tracking_2019["date"])

In [None]:
print("Unique NFL dates and their counts :")
tr19_date = tracking_2019.pivot_table(index = ['date'], aggfunc = 'size') 
tr19_date = tr19_date.reset_index()
tr19_date.columns= ["Date", "Counts"]
tr19_date = tr19_date.sort_values("Counts", ascending = False)
print(tr19_date)

In [None]:
px.bar(tr19_date, x="Date", y="Counts", title="Distinct Count Of NFL Dates", color="Counts")

*Observation*
- *SUNDAY* is a favorable day for conducting of event.
- Most games played on *29th dec 2019*.

In [None]:
#checking for null value
tracking_2019.isnull().sum()

*Observation*
1. so many null values are present in 'o', 'dir', 'nflId', 'jerseyNumber' and 'position' column

In [None]:
#Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of playId == 36 and gameId == 2019122900
data_19 = tracking_2019.query('playId == 36 and gameId == 2019122900')
print(data_19[["x", "y", "team"]])

In [None]:
 px.scatter(data_19, x="x", y="y", color="team",width=1000,height=800)

In [None]:
#Creating a table of coordinates of players position of home team opponents team and foot ball through out the play of playId == 3554 and gameId == 2019090901
data_19 = tracking_2019.query('playId == 3554 and gameId == 2019090901')
print(data_19[["x", "y", "team"]])

In [None]:
fig = px.scatter(data_19, x="x", y="y", color="team",width=1000,height=800)
fig.show()
#  players location throughout the game

In [None]:
tracking_2019['date'] = pd.DatetimeIndex(tracking_2019['date'])
rslt_df = tracking_2019[tracking_2019['date'] == '2019-09-10']
print(rslt_df.head())

In [None]:
#This animation shows the game played in 2019-09-10
fig = px.scatter(rslt_df, x="x", y="y", animation_frame="time", color="team")
fig.show()

# Tracking2020 Data

In [None]:
#loading the dataset
tracking_2020 = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2020.csv")

In [None]:
#checking shape of the dataset
tracking_2020.shape

In [None]:
#checking first five rows
tracking_2020.head()

In [None]:
#checking the summary of the dataset
tracking_2020.info()

*Observation*
1. Dtype of 'time' column is object

In [None]:
#creating another column "date" and storing date from 'time' column in it
tracking_2020['date'] = pd.DatetimeIndex(tracking_2020['time']).date

In [None]:
tracking_2020.head()

In [None]:
#checking for null values
tracking_2020.isnull().sum()

*Observation*
1. so many null values are present in 'o', 'dir', 'nflId', 'jerseyNumber' and 'position' column

In [None]:
#Listing out unique values from date column
print("Unique NFL dates and their counts :")
tr20_date = tracking_2020.pivot_table(index = ['date'], aggfunc = 'size') 
tr20_date = tr20_date.reset_index()
tr20_date.columns= ["Date", "Counts"]
tr20_date = tr20_date.sort_values("Counts", ascending = False)
print(tr20_date)

In [None]:
# Visualizing NFL dates by Bar plot
px.bar(tr20_date, x="Date", y="Counts", title="Distinct Count Of NFL Dates", color="Counts")

*Observation*
1. SUNDAY is a favorable day for conducting of event.
2. Most games played on 3rd Jan 2021.

In [None]:
# Creating a table of coordinates of players position of home team opponents team and foot ball through out the play 
#of playId = 40 and gameId == 2021010300
data_20 = tracking_2020.query('playId == 40 and gameId == 2021010300')
print(data_20[["x", "y", "team"]])

In [None]:
# players location throughout the game
fig = px.scatter(data_20, x="x", y="y", color="team",width=1000,height=600)
fig.show()


In [None]:
tracking_2020['date'] = pd.DatetimeIndex(tracking_2020['date'])
rslt_df20 = tracking_2020[tracking_2020['date'] == '2020-11-27']
print(rslt_df20.head())

In [None]:
#This animation shows the game played in 2020-11-27
fig = px.scatter(rslt_df20, x="x", y="y", animation_frame="time", color="team")
fig.show()