In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as patches

import plotly.express as px
from plotly.subplots import make_subplots

import statsmodels.api as sm
import plotly.graph_objs as go

#sns.set(rc = {'figure.figsize':(12,7)})

In [None]:
# Directory template for the competition data; '{}' is replaced by a file name.
path_shared = '/kaggle/input/nfl-big-data-bowl-2021/{}'

# Build every path from the shared template instead of repeating the directory.
df_players = pd.read_csv(path_shared.format('players.csv'))
df_games = pd.read_csv(path_shared.format('games.csv'))
df_plays = pd.read_csv(path_shared.format('plays.csv'))

In [None]:
# Show the last three player rows.
df_players.tail(n=3)

### <div class="alert-danger">It can be seen that the rows in the height and birthDate columns do not all share the same format, so we need to fix this</div>

In [None]:
# Convert the height column to a numeric number of inches.
def _height_to_inches(value):
    """Return a 'feet-inches' string (e.g. '5-11') as total inches.

    The inches part may have one or two digits. Any value that is not a
    'feet-inches' string is returned unchanged so that ``pd.to_numeric``
    below can convert it; this also keeps the cell safe to re-run once the
    column is already numeric.
    """
    if isinstance(value, str) and '-' in value:
        feet, inches = value.split('-', 1)
        return int(feet) * 12 + int(inches)
    return value


df_players['height'] = pd.to_numeric(df_players['height'].map(_height_to_inches))

In [None]:
# Convert the birthDate column to datetime64 with a single parse.
# normalize() resets any time-of-day to midnight, so every value is a plain
# date; a strftime/re-parse round trip through 'MM/DD/YYYY' strings is not needed.
# NOTE(review): pandas >= 2.0 may need format='mixed' when the column mixes
# several date formats — confirm against the installed pandas version.
df_players['birthDate'] = pd.to_datetime(df_players['birthDate']).dt.normalize()


In [None]:
# Ta-da: the last three rows now show a consistent dataframe.
df_players.tail(n=3)

#### <div class="alert-warning">Now, based on each player's position, we can tell whether that player plays on Offense, Defense or Special teams</div> 

In [None]:
# Positions grouped by the unit (team role) they are assigned to.
cat_item = {
    'Offense': ['QB', 'RB', 'FB', 'WR', 'TE', 'HB'],
    'Defense': ['OLB', 'MLB', 'LB', 'ILB', 'CB', 'DE', 'DT', 'NT', 'DB', 'S', 'SS', 'FS'],
    'Special': ['K', 'P', 'LS'],
}

# Invert the grouping into a position -> team role lookup.
item_cat = dict(
    (position, role)
    for role, positions in cat_item.items()
    for position in positions
)

# Positions that are not in the lookup are mapped to NaN.
df_players['team_role'] = df_players['position'].map(item_cat)

df_players.tail(5)

# <div class="alert-info">Visualization of the players data</div>


#### <div class="alert-success">Let's see how many players each college has (split by Offense, Defense and Special)</div>

In [None]:
# Count players for every (college, team role) pair.
df_college = (
    df_players
    .groupby(['collegeName', 'team_role'], as_index=False)[['nflId']]
    .count()
)

# Stacked bars: one bar per college, coloured by team role.
barchart_college = px.bar(
    data_frame=df_college,
    x='collegeName',
    y='nflId',
    color='team_role',
    opacity=0.9,
    orientation='v',
    barmode='relative',
    labels={
        'nflId': 'number of players',
        'collegeName': 'College Name',
        'team_role': 'Which team are they playing in?',
    },
    color_discrete_map={'Offense': 'gray', 'Defense': 'red', 'Special': 'blue'},
)

# Sort colleges by total bar height; the returned figure is the cell's output.
barchart_college.update_layout(
    xaxis={'categoryorder': 'total descending'},
    autosize=False,
    width=1000,
    height=500,
)

#### <div class="alert-success">Let's see the age distribution of our players</div>

In [None]:
# Histogram of the players' birth dates.
birthyear_fig = px.histogram(
    data_frame=df_players,
    x='birthDate',
    labels={'birthDate': 'Year of Birth'},
)
birthyear_fig.update_layout(autosize=False, width=1000, height=500)
birthyear_fig.show()


#### <div class="alert-success">Let's see the distribution of the height and weight of our players</div>

* <div class="alert-success">based on the team</div>
* <div class="alert-success">based on the position</div>



In [None]:
# Height vs. weight, coloured and marked by team role, with a weight
# histogram along the x margin and a height box plot along the y margin.
height_weight_distribution = px.scatter(
    data_frame=df_players,
    x='weight',
    y='height',
    color='team_role',
    symbol='team_role',
    color_discrete_map={'Defense': 'red', 'Offense': 'gray', 'Special': 'blue'},
    marginal_x='histogram',
    marginal_y='box',
    labels={'team_role': 'team they are playing in:'},
)

# The returned figure is the cell's output.
height_weight_distribution.update_layout(autosize=False, width=1200, height=600)

In [None]:
# Height vs. weight with one facet column per team role and one marker
# symbol per position.
height_weight_scatter = px.scatter(
    data_frame=df_players,
    x='weight',
    y='height',
    color='team_role',
    symbol='position',
    facet_col='team_role',
    symbol_sequence=[3, 100, 102, 104, 208, 14, 114, 19, 125, 0, 101, 4, 125],
    opacity=0.7,
    color_discrete_map={'Defense': 'red', 'Offense': 'gray', 'Special': 'blue'},
    labels={'team_role': 'team'},
)
# Only a cell's last expression is rendered, so the sized figure returned by
# update_layout is the single output (a bare mid-cell expression shows nothing).
height_weight_scatter.update_layout(autosize=False, width=1000, height=500)

# <div class="alert-info">Visualization of the plays data</div>


#### <div class="alert-success">Let's figure out which team gained the most yards during the whole season</div>


In [None]:
# Per possession team: total offensive yards and number of distinct games.
yards_total = (
    df_plays
    .groupby('possessionTeam')
    .agg({'offensePlayResult': 'sum', 'gameId': 'nunique'})
    .reset_index()
    .rename(columns={'gameId': 'number of matches', 'offensePlayResult': 'yardsGained'})
)
# Average yards per game, rounded to a whole number.
yards_total['averageYards'] = np.round(
    yards_total['yardsGained'] / yards_total['number of matches']
)
yards_total.head()

In [None]:
# Two stacked bar charts on a shared x axis: total yards on top, average below.
fig_yards = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    shared_yaxes=False,
    subplot_titles=(
        "total yards gained by each team during the whole season",
        "average yards gained by each team during the whole season",
    ),
)

fig_yards.add_trace(
    go.Bar(x=yards_total['possessionTeam'], y=yards_total['yardsGained']),
    row=1,
    col=1,
)
fig_yards.add_trace(
    go.Bar(x=yards_total['possessionTeam'], y=yards_total['averageYards']),
    row=2,
    col=1,
)
fig_yards.update_layout(
    showlegend=False,
    xaxis={'categoryorder': 'total descending'},
    autosize=False,
    width=1000,
    height=700,
)
fig_yards.show()

#### <div class="alert-success">Now it would be interesting to see which dropback type results in the most yards. To check that, we consider several factors:</div>
* sum all the yards gained with each dropback type during the season.
* count how many plays of each dropback type ended in a successful pass.
* measure the popularity of each dropback type during the whole season.
    This helps us find the average yards gained by each dropback type
    and the success rate of each dropback type.



In [None]:
# Per dropback type: summed yards and number of plays with a recorded result.
dropbacks_total = (
    df_plays
    .groupby('typeDropback')['offensePlayResult']
    .agg(yardsGained='sum', popularity='count')
)

# Number of plays for every (dropback type, pass result) pair.
dropback_passRes = (
    df_plays
    .groupby(['typeDropback', 'passResult'])[['passResult']]
    .count()
    .add_suffix('_Count')
    .reset_index()
)
# Keep only the rows whose pass result is 'C'; these are counted as successes.
success_dropback = dropback_passRes.loc[dropback_passRes['passResult'] == 'C']

# dropback type -> number of 'C' results, attached via the groupby index.
mapping = dict(success_dropback[['typeDropback', 'passResult_Count']].values)
dropbacks_total['succesful'] = dropbacks_total.index.map(mapping)
dropbacks_total['averageYard'] = np.round(
    dropbacks_total['yardsGained'] / dropbacks_total['popularity']
)
dropbacks_total['success_normal'] = (
    dropbacks_total['succesful'] / dropbacks_total['popularity']
)
dropbacks_total = dropbacks_total.reset_index()
dropbacks_total

In [None]:
# Plot average yards, popularity and success rate for each dropback type.

color_discrete_sequence = px.colors.qualitative.Dark2
colors = color_discrete_sequence[0:7]  # at most the first seven palette colours

# Layout: a bar chart spanning both rows on the left, two pie charts stacked
# on the right.
fig = make_subplots(
    subplot_titles=[
        "average of yards gained by each type of dropback in whole season",
        "popularity of dropbacks in whole season",
        'success rate of dropback types in whole season',
    ],
    rows=2,
    cols=2,
    column_widths=[0.6, 0.4],
    row_heights=[0.5, 0.5],
    specs=[
        [{"type": "bar", "rowspan": 2}, {"type": "pie"}],
        [None, {"type": "pie"}],
    ],
)

# Left: average yards gained per dropback type.
fig.add_trace(
    go.Bar(
        x=dropbacks_total['typeDropback'],
        y=dropbacks_total['averageYard'],
        marker_color=colors,
    ),
    row=1,
    col=1,
)

# Top right: how often each dropback type was used.
fig.add_trace(
    go.Pie(
        values=dropbacks_total['popularity'],
        labels=dropbacks_total['typeDropback'],
        marker_colors=colors,
    ),
    row=1,
    col=2,
)

# Bottom right: success rate of each dropback type.
fig.add_trace(
    go.Pie(
        values=dropbacks_total['success_normal'],
        labels=dropbacks_total['typeDropback'],
        marker_colors=colors,
    ),
    row=2,
    col=2,
)

# Theme, margins, size and bar ordering.
fig.update_layout(
    template="plotly_dark",
    margin=dict(r=10, t=25, b=40, l=60),
    showlegend=True,
    xaxis={'categoryorder': 'total descending'},
    autosize=False,
    width=1000,
    height=700,
)

# Reposition the three subplot titles (stored as layout annotations).
fig.layout.annotations[0].update(x=0.3, y=1)
fig.layout.annotations[1].update(x=0.9, y=0.5)
fig.layout.annotations[2].update(x=0.9, y=-0.1)
fig.show()

#### <div class="alert-success">Let's see which matches took place in which weeks</div>
* First we need to add some columns from the games dataset to this one so that we can draw some insights


In [None]:
# Add week, home team and visiting team to every play, looked up by gameId.
games_by_id = df_games.set_index('gameId')
df_plays['week'] = df_plays['gameId'].map(games_by_id['week'])
df_plays['homeTeam'] = df_plays['gameId'].map(games_by_id['homeTeamAbbr'])
df_plays['guestTeam'] = df_plays['gameId'].map(games_by_id['visitorTeamAbbr'])

In [None]:
# Prepare the three table columns: week label, number of matches, and the
# matches played between teams.
def _weekly_match_rows(plays, weeks):
    """Return one ``[label, game_count, pairings]`` row per week.

    Parameters
    ----------
    plays : DataFrame with 'week', 'gameId', 'homeTeam' and 'guestTeam' columns.
    weeks : iterable of week numbers to report, in output order.

    Returns
    -------
    list of [str, str, numpy.ndarray]
        'week<N>', the number of distinct gameIds as a string, and the
        distinct '[home,guest]' strings found for that week.
    """
    rows = []
    for week in weeks:
        in_week = plays.loc[plays['week'] == week, ['gameId', 'homeTeam', 'guestTeam']]
        # Build the pairing labels as a standalone Series instead of adding a
        # column to the filtered slice (avoids SettingWithCopyWarning).
        pairings = '[' + in_week['homeTeam'] + ',' + in_week['guestTeam'] + ']'
        rows.append([
            'week' + str(week),
            str(len(in_week['gameId'].unique())),
            np.array(pairings.unique().tolist()),
        ])
    return rows


# Weeks 1 through 17. The range is passed directly rather than stored in a
# variable called `list`, which would shadow the built-in for later cells.
grand_list = _weekly_match_rows(df_plays, range(1, 18))

list_week = [row[0] for row in grand_list]
list_games = [row[1] for row in grand_list]
list_matches = [row[2] for row in grand_list]

In [None]:
# Render the weekly schedule as a three-column table.
table_matches = go.Figure(
    data=go.Table(
        columnorder=[1, 2, 3],
        columnwidth=[90, 100, 1300],
        header={
            'values': ['Weeks', 'Number of Matches', 'Matches between teams'],
            'align': 'center',
            'height': 40,
        },
        cells={
            'values': [list_week, list_games, list_matches],
            'align': 'center',
            'height': 40,
        },
    )
)

table_matches.update_layout(width=1300)
table_matches.show()

#### <div class="alert-success">How are play types distributed overall, and per game in each week?</div>

(I am not sure whether this will help with understanding the defense, but let's see; I can change my approach later. I am very new to football anyway.)


In [None]:
# Copy the play columns that might be useful for later analysis into a new frame.
df_playes_new = df_plays[[
    'week', 'gameId', 'possessionTeam', 'playType', 'offenseFormation',
    'personnelO', 'defendersInTheBox', 'numberOfPassRushers', 'personnelD',
    'typeDropback', 'passResult', 'offensePlayResult', 'homeTeam', 'guestTeam',
]].copy()

# Label each play with its "[home , guest]" pairing.
df_playes_new['matches'] = (
    '[' + df_playes_new['homeTeam'] + ' , ' + df_playes_new['guestTeam'] + ']'
)

# Count plays per (week, match, play type); the count lands in a regular
# 'playType_Count' column after the index is reset.
df = (
    df_playes_new
    .groupby(['week', 'matches', 'playType'])[['playType']]
    .count()
    .add_suffix('_Count')
    .reset_index()
)

df.head(4)

In [None]:
# Share of each play type over the whole season, drawn as a donut chart.
playtype_pichart = px.pie(
    data_frame=df,
    values='playType_Count',
    names='playType',
    color='playType',
    color_discrete_sequence=['mediumvioletred ', 'darkorange', 'blue'],
    hole=0.3,
)
playtype_pichart

In [None]:
# Stacked bars of play type counts per week, one facet row per play type;
# hovering over a segment shows the match it belongs to.
barchart_playtype = px.bar(
    data_frame=df,
    x='week',
    y='playType_Count',
    color='playType',
    opacity=0.9,
    orientation='v',
    facet_row='playType',
    color_discrete_sequence=['mediumvioletred ', 'darkorange', 'blue'],
    hover_name='matches',
    labels={
        'playType_Count': 'number of times play-type repeated in week',
        'playType': 'play-type chosen by teams',
    },
    title='stacked bar chart for playtype based on each game, each week',
)
barchart_playtype.update_layout(height=1200)
# Let every facet row use its own y range; the returned figure is the cell output.
barchart_playtype.update_yaxes(matches=None)

To do:
more plays analysis (I need to think about it and read more online about football matches).
I will probably focus on defense and offense styles, but if you have any suggestions for me, please share them with me.