# In [None]:
# Downloading data
from pathlib import Path
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
import numpy as np

# Location of the NFL Big Data Bowl 2021 input files.
path = Path('../input/nfl-big-data-bowl-2021')

# Read the four tables used below; only week1.csv is loaded for the weekly data.
df_games, df_players, df_plays, df_weeks = (
    pd.read_csv(path / csv_name)
    for csv_name in ('games.csv', 'players.csv', 'plays.csv', 'week1.csv')
)

# Share of missing values per column, largest first.
# (Outside a notebook these bare expressions are evaluated but not shown.)

# Checking games data
df_games.isnull().sum().sort_values(ascending=False).div(len(df_games))

# Checking players data
df_players.isnull().sum().sort_values(ascending=False).div(len(df_players))

# Checking plays data
df_plays.isnull().sum().sort_values(ascending=False).div(len(df_plays))

# Checking week data
df_weeks.isnull().sum().sort_values(ascending=False).div(len(df_weeks))

# The listed columns are removed first; dropna() then discards every row
# that still contains at least one missing value.

# Delete the columns (week1.csv)
df_weeks = df_weeks.drop(columns='route').dropna()

# Delete the columns (plays.csv)
df_plays = df_plays.drop(columns=['penaltyJerseyNumbers', 'penaltyCodes']).dropna()

# data analysis of the games.csv
# Number of games per calendar date, ordered from the fewest to the most games.
games_by_date = (
    df_games['gameDate']
    .value_counts()
    .rename_axis('date')
    .reset_index(name='games')
    .sort_values('games')
)

fig = px.bar(
    data_frame=games_by_date,
    x='games',
    y='date',
    orientation='h',
    labels={'date': 'Game date', 'games': 'Games played'},
    title='Number of games by date',
)
fig.show()

# Same breakdown, grouped by the 'week' column instead of the date.
games_per_week = (
    df_games['week']
    .value_counts()
    .rename_axis('week')
    .reset_index(name='games')
    .sort_values('games')
)

fig = px.bar(
    data_frame=games_per_week,
    x='games',
    y='week',
    orientation='h',
    labels={'week': 'Week', 'games': 'Games played'},
    title='Number of games per week',
)
fig.show()

# data analysis of the players.csv
# Histogram of the 'height' column as stored in players.csv (20 bins requested).
fig = px.histogram(
    data_frame=df_players,
    x='height',
    nbins=20,
    labels={'height': 'Height'},
    title='Height distribution',
)
fig.show()

# Histogram of the 'weight' column (20 bins requested).
fig = px.histogram(
    data_frame=df_players,
    x='weight',
    nbins=20,
    labels={'weight': 'Weight'},
    title='Weight distribution',
)
fig.show()

# data analysis of the plays.csv
# Share of plays per play type (computed on the cleaned plays table).
plays_by_type = (
    df_plays['playType']
    .value_counts()
    .rename_axis('type')
    .reset_index(name='plays')
    .sort_values('plays')
)

fig = px.pie(
    data_frame=plays_by_type,
    names='type',
    values='plays',
    title='Number of plays of every type',
)
fig.show()

# Share of plays per quarter.
plays_per_quarter = (
    df_plays['quarter']
    .value_counts()
    .rename_axis('quarter')
    .reset_index(name='plays')
    .sort_values('plays')
)

fig = px.pie(
    data_frame=plays_per_quarter,
    names='quarter',
    values='plays',
    title='Number of plays per quarter',
)
fig.show()

# Prediction
# Start from the weekly rows without the two columns that are not used below.
data = df_weeks.drop(columns = ['time', 'jerseyNumber'])

# Attach both team abbreviations of each game so that the 'home' marker in
# 'team' can be replaced by the matching abbreviation.
data = pd.merge(data, df_games[['gameId', 'homeTeamAbbr', 'visitorTeamAbbr']], how = 'inner', on = 'gameId')
# Vectorised replacement for the former row-wise apply: rows marked 'home' get
# the home abbreviation, every other row gets the visitor abbreviation.
data['team'] = np.where(data['team'] == 'home', data['homeTeamAbbr'], data['visitorTeamAbbr'])
data = data.drop(columns = ['homeTeamAbbr', 'visitorTeamAbbr'])

# Replace the per-row 's', 'a' and 'dis' values with each player's mean
# (grouped by nflId), so every row of a player carries the same averages.
avg = data[['nflId', 's', 'a', 'dis']].groupby('nflId').mean()
data = data.drop(columns = ['s', 'a', 'dis'])
data = pd.merge(data, avg, how = 'inner', on = 'nflId')

def xmod(row):
    """Return the row's x offset from its absoluteYardlineNumber, signed by play direction.

    For 'left' the result is ``absoluteYardlineNumber - x``; for 'right' it is
    ``x - absoluteYardlineNumber``. Any other playDirection yields None.
    """
    direction = row.playDirection
    if direction == 'left':
        offset = row.absoluteYardlineNumber - row.x
    elif direction == 'right':
        offset = row.x - row.absoluteYardlineNumber
    else:
        offset = None
    return offset

# Add each play's absoluteYardlineNumber, then overwrite x with the
# direction-aware offset computed by xmod.
data = data.merge(
    df_plays[['gameId', 'playId', 'absoluteYardlineNumber']],
    on=['gameId', 'playId'],
    how='inner',
)
data['x'] = data.apply(xmod, axis=1)
data = data.drop(columns=['playDirection', 'absoluteYardlineNumber'])

# Influence in players speed or acceleration
data = data.merge(
    df_players[['nflId', 'height', 'weight']],
    on='nflId',
    how='inner',
)

# Play-level columns that are joined onto every remaining row.
plays_info = df_plays[[
    'gameId', 'playId', 'possessionTeam', 'yardsToGo', 'playType',
    'offenseFormation', 'personnelO', 'defendersInTheBox',
    'numberOfPassRushers', 'personnelD', 'typeDropback', 'passResult',
    'offensePlayResult', 'playResult', 'epa', 'isDefensivePI',
]]

# Merge our processed data
# Keep only the selected columns, join the play information and finally
# drop the two join keys, which are not used afterwards.
data = (
    data[[
        'gameId', 'playId', 'x', 'y', 'team', 'displayName', 'position',
        'height', 'weight', 's', 'a', 'dis', 'o', 'dir', 'event',
    ]]
    .merge(plays_info, on=['gameId', 'playId'], how='inner')
    .drop(columns=['gameId', 'playId'])
)

#### Boolean columns
columns_type_bool = data.dtypes[data.dtypes == bool]
cols_to_transform = columns_type_bool.index.tolist()
cols_to_transform

#### Cast the boolean columns to integers
data[cols_to_transform] = data[cols_to_transform].astype(int)

#### Object-typed columns
columns_type_object = data.dtypes[data.dtypes == object]
cols_to_transform = columns_type_object.index.tolist()
cols_to_transform


# One LabelEncoder instance is re-fitted on each object column in turn, so
# every column receives its own integer codes.
le = preprocessing.LabelEncoder()
data[cols_to_transform] = data[cols_to_transform].apply(
    le.fit_transform,
    axis=0,
    result_type='expand',
)

# Target is playResult; every other column is used as a feature.
X = data.drop(columns='playResult')
Y = data['playResult']

# Shuffled 10-fold split with a fixed seed, shared by all evaluations below.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Linear model
model = LinearRegression()
scores = cross_val_score(model, X, Y, cv=cv, scoring='r2', n_jobs=-1)
print('R^2 (coefficient of determination): %.3f (%.3f)' % (scores.mean(), scores.std()))

# RandomForest
model = RandomForestClassifier()
scores = cross_val_score(model, X, Y, cv=cv, scoring='f1_micro', n_jobs=-1)
print('F1-Score (micro): %.3f (%.3f)' % (scores.mean(), scores.std()))

# Reduce a play result to its sign
def process_play_result(value):
    """Map a play result to 1 (positive), -1 (negative) or 0 (exactly zero).

    Values for which none of the three comparisons holds yield None.
    """
    if value > 0:
        return 1
    if value < 0:
        return -1
    if value == 0:
        return 0
    return None
    
# Collapse the target to its sign (-1, 0 or 1) and rebuild the feature/target split.
data['playResult'] = data['playResult'].apply(process_play_result)
Y = data['playResult']
X = data.drop(columns = 'playResult')

# Each score is computed with the estimator named in its message. Previously
# `model` still held the RandomForestClassifier here, so the value reported
# for "Linear Regression Model" came from the random forest.
model = LinearRegression()
scores = cross_val_score(model, X, Y, scoring = 'r2', cv = cv, n_jobs = -1)
print('R^2 (coefficient of determination) for Linear Regression Model: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

model = RandomForestClassifier()
scores = cross_val_score(model, X, Y, scoring = 'f1_micro', cv = cv, n_jobs = -1)
print('F1-Score (micro) for Random Forest Model: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# In [None]:
# Downloading data
import pandas as pd
import numpy as np
import os
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Use the fully qualified option name: the short pattern 'max_columns' matches
# more than one option on newer pandas releases and is then rejected.
pd.set_option('display.max_columns', 100)

train = pd.read_csv('../input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
# NOTE: train2021 is reassigned to week11.csv further down before it is used.
train2021 = pd.read_csv('../input/nfl-big-data-bowl-2021/plays.csv')

# Function for creating football field
def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12, 6.33)):
    """
    Draw a football field on a new matplotlib figure for viewing plays.

    The field is a 120 x 53.3 rectangle with its lower-left corner at (0, 0);
    the strips x in [0, 10] and x in [110, 120] are the end zones.

    Parameters
    ----------
    linenumbers : bool
        Write the yard numbers at every tenth line from x=20 to x=100,
        once near the bottom and once (rotated by 180 degrees) near the top.
    endzones : bool
        Shade both end zones. When True the hash marks are drawn for
        x = 11..109, otherwise for x = 1..119.
    highlight_line : bool
        Draw a yellow vertical line with a text label.
    highlight_line_number : number
        Position of the highlighted line; 10 is added to obtain the x
        coordinate on the field.
    highlighted_name : str
        Text written next to the highlighted line.
    fifty_is_los : bool
        Draw a gold line at x=60 labelled '<- Player Yardline at Snap'.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax)
        The created figure and axes, so callers can add their own artists.
    """
    fig, ax = plt.subplots(1, figsize=figsize)

    # Green playing surface covering the whole field including the end zones.
    rect = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                             edgecolor='r', facecolor='darkgreen', zorder=0)
    ax.add_patch(rect)

    # One continuous white path that traces every tenth line and the border.
    ax.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
             80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
            [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
             53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
            color='white')
    if fifty_is_los:
        ax.plot([60, 60], [0, 53.3], color='gold')
        ax.text(62, 50, '<- Player Yardline at Snap', color='gold')
    # Endzones
    if endzones:
        ez1 = patches.Rectangle((0, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        # Rectangle takes (xy, width, height): the right end zone starts at
        # x=110 and is 10 wide (it was previously given a width of 120).
        ez2 = patches.Rectangle((110, 0), 10, 53.3,
                                linewidth=0.1,
                                edgecolor='r',
                                facecolor='blue',
                                alpha=0.2,
                                zorder=0)
        ax.add_patch(ez1)
        ax.add_patch(ez2)
    ax.set_xlim(0, 120)
    ax.set_ylim(-5, 58.3)
    ax.axis('off')
    if linenumbers:
        for x in range(20, 110, 10):
            # Numbers count up to the middle of the field and down again.
            numb = x
            if x > 50:
                numb = 120 - x
            ax.text(x, 5, str(numb - 10),
                    horizontalalignment='center',
                    fontsize=20,  # fontname='Arial',
                    color='white')
            ax.text(x - 0.95, 53.3 - 5, str(numb - 10),
                    horizontalalignment='center',
                    fontsize=20,  # fontname='Arial',
                    color='white', rotation=180)
    if endzones:
        hash_range = range(11, 110)
    else:
        hash_range = range(1, 120)

    # Four short ticks per x position: bottom edge, top edge and two inner rows.
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')

    if highlight_line:
        # Shift by the 10-unit end zone to convert to the x coordinate.
        hl = highlight_line_number + 10
        ax.plot([hl, hl], [0, 53.3], color='yellow')
        ax.text(hl + 2, 50, '<- {}'.format(highlighted_name),
                color='yellow')
    return fig, ax

# Empty field.
create_football_field()
plt.show()

# Adding players
# `label=` names each scatter series so that plt.legend() has entries to show;
# `legend=` is only the on/off switch of DataFrame.plot and sets no label.
fig, ax = create_football_field()
train.query("PlayId == 20170907000118 and Team == 'away'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='orange', s=30, label='Away')
train.query("PlayId == 20170907000118 and Team == 'home'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='blue', s=30, label='Home')
plt.title('Adding players')
plt.legend()
plt.show()

# Adding the line of first physical contact
playid = 20181230154157
train.query("PlayId == @playid").head()

# YardLine of the first row of the selected play.
yl = train.query("PlayId == @playid")['YardLine'].tolist()[0]
# NOTE(review): the +54 offset and the default 'Line of Scrimmage' label are
# kept unchanged from the original; confirm both against the intended line.
fig, ax = create_football_field(highlight_line=True,
                                highlight_line_number=yl+54)
# `label=` (not `legend=`) names each series so plt.legend() has entries to show.
train.query("PlayId == @playid and Team == 'away'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='orange', s=30, label='Away')
train.query("PlayId == @playid and Team == 'home'") \
    .plot(x='X', y='Y', kind='scatter', ax=ax, color='blue', s=30, label='Home')
plt.title('line of first physical contact')
plt.legend()
plt.show()

# Starting of a play 
train2021 = pd.read_csv('../input/nfl-big-data-bowl-2021/week11.csv')

# Rows of one play, split by the 'team' marker.
example_play_home = train2021.query('gameId == 2018111900 and playId == 5577 and team == "home"')
example_play_away = train2021.query('gameId == 2018111900 and playId == 5577 and team == "away"')

# Only the rows recorded at the 'ball_snap' event are drawn. Each data set
# now carries its own team name (they were swapped before), in the colours
# used by the plots above: home blue, away orange. `label=` supplies the
# legend entries.
fig, ax = create_football_field()
example_play_home.query('event == "ball_snap"').plot(x='x', y='y', kind='scatter', ax=ax, color='blue', s=30, label='Home')
example_play_away.query('event == "ball_snap"').plot(x='x', y='y', kind='scatter', ax=ax, color='orange', s=30, label='Away')
plt.title('Game at Ball Snap (Starting)')
plt.legend()
plt.show()

# Moving of the players
# All rows of the example play are drawn, not just one event. Each data set
# carries its own team name (they were swapped before), with home in blue and
# away in orange; `label=` supplies the legend entries.
fig, ax = create_football_field()
example_play_home.plot(x='x', y='y', kind='scatter', ax=ax, color='blue', s=30, label='Home')
example_play_away.plot(x='x', y='y', kind='scatter', ax=ax, color='orange', s=30, label='Away')
plt.title('Moving of the players')
plt.legend()
plt.show()

# In [None]:
# Downloading data
import pandas as pd
import numpy as np
import os
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as patches

import plotly.express as px
from plotly.subplots import make_subplots

import statsmodels.api as sm
import plotly.graph_objs as go

# Template for files inside the shared Kaggle input folder.
path_shared = '/kaggle/input/nfl-big-data-bowl-2021/{}'

# Fill the template with each file name; the resulting paths are the same
# absolute paths as before.
df_players = pd.read_csv(path_shared.format('players.csv'))
df_games = pd.read_csv(path_shared.format('games.csv'))
df_plays = pd.read_csv(path_shared.format('plays.csv'))

def _height_to_inches(height):
    """Convert a 'feet-inches' string such as '6-10' to total inches.

    Values without a dash are returned unchanged so that entries that are
    already plain numbers pass through to pd.to_numeric.
    """
    feet, dash, inches = str(height).partition('-')
    if not dash:
        return height
    # Use everything after the dash: reading a single character turned
    # '6-10' into 6*12 + 1 instead of 6*12 + 10.
    return int(feet) * 12 + int(inches)


# To change the height column:
df_players['height'] = pd.to_numeric(df_players['height'].map(_height_to_inches))
# To change the birthDate column
# Parse, format as month/day/year text and parse again; the round trip leaves
# datetime values that carry the calendar date only.
df_players['birthDate'] = pd.to_datetime(df_players['birthDate']).dt.strftime('%m/%d/%Y')
df_players['birthDate']= pd.to_datetime(df_players['birthDate'])

# Position codes grouped by team role.
cat_item = {
    'Offense': ['QB', 'RB', 'FB', 'WR', 'TE', 'HB'],
    'Defense': ['OLB', 'MLB', 'LB', 'ILB', 'CB', 'DE', 'DT', 'NT', 'DB', 'S', 'SS', 'FS'],
    'Special': ['K', 'P', 'LS'],
}
# Inverted lookup: position code -> role name.
item_cat = {
    position: role
    for role, positions in cat_item.items()
    for position in positions
}

# Positions missing from the lookup end up as NaN in 'team_role'.
df_players['team_role'] = df_players['position'].map(item_cat)

df_players.tail()

# Based on position and team (Height and weight)
# One facet per team role; colour encodes the role and the marker symbol the position.
# NOTE(review): 125 appears twice in symbol_sequence — confirm that is intended.
height_weight_scatter = px.scatter(
    data_frame=df_players,
    x='weight',
    y='height',
    color='team_role',
    symbol='position',
    facet_col='team_role',
    symbol_sequence=[3, 100, 102, 104, 208, 14, 114, 19, 125, 0, 101, 4, 125],
    color_discrete_map={'Defense': 'red', 'Offense': 'gray', 'Special': 'blue'},
    opacity=0.7,
    labels={'team_role': 'team'},
)
height_weight_scatter
# Fixed canvas size instead of automatic sizing.
height_weight_scatter.update_layout(width=1000, height=500, autosize=False)