In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import datetime
import tqdm
import re
from string import punctuation
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import time 
from IPython.display import display
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Notebook-wide display settings.
# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Seaborn grid theme, then a large default canvas for every matplotlib figure.
sns.set_style('darkgrid')
mpl.rcParams.update({'figure.figsize': [14, 10]})

In [None]:
#data = dd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/week*.csv')

### Games

In [None]:
# Load the game schedule.
games = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/games.csv')

# Build "<date> <kickoff time>" strings and convert each one to integer epoch
# seconds. time.mktime interprets the parsed value in the machine's local
# timezone, so the absolute numbers depend on where the notebook runs.
kickoff_strings = games['gameDate'] + ' ' + games['gameTimeEastern']
kickoff_epochs = []
for kickoff in kickoff_strings:
    parsed_kickoff = datetime.strptime(kickoff, "%m/%d/%Y %H:%M:%S")
    kickoff_epochs.append(int(time.mktime(parsed_kickoff.timetuple())))
games.loc[:, 'Game_DateTime'] = kickoff_epochs

# The kickoff time is now folded into Game_DateTime; gameDate is kept.
games.drop(columns=['gameTimeEastern'], inplace=True)

### Players

In [None]:
# Load the player roster.
players = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/players.csv')


def _age_in_years(birth_date):
    """Return the rounded age in years for a birth date string.

    Two layouts are handled: year-first (detected by a '-' at position 4,
    parsed as %Y-%m-%d) and month-first (parsed as %m-%d-%Y). Any '/' is
    normalised to '-' before parsing.
    """
    layout = '%Y-%m-%d' if birth_date[4] == '-' else '%m-%d-%Y'
    born = datetime.strptime(birth_date.replace('/', '-'), layout)
    # NOTE(review): age is measured against the moment the cell runs, so the
    # value drifts over time -- confirm whether a fixed reference date is wanted.
    return round((datetime.now() - born).days / 365)


players.loc[:, 'Age'] = [_age_in_years(birth_date) for birth_date in players.birthDate]

# Name and raw birth date are not used as features once Age exists.
players.drop(columns=['displayName', 'birthDate'], inplace=True)

### Plays

In [None]:
# Load play-level data; the free-text description is not used as a feature.
plays = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2021/plays.csv')
plays.drop(columns=['playDescription'], inplace=True)

# Collects the names of the columns generated by the helpers below, so their
# missing values can be filled with 0 afterwards.
impute_columns = list()

def get_personnel_position(plays, impute_columns, offense = True):
    """Expand a personnel string column into one count column per position, in place.

    Reads ``plays['personnelO']`` when ``offense`` is true, otherwise
    ``plays['personnelD']``. Each string cell is parsed as a comma-separated
    list of ``"<count> <position>"`` tokens (e.g. ``"1 RB, 1 TE, 3 WR"``).
    Every position found becomes a column named ``<position>_O`` or
    ``<position>_D`` that holds the count token exactly as parsed (a string).
    Rows whose cell is not a string (e.g. NaN) stay NaN in the new columns.
    The source personnel column is dropped at the end.

    Rows are handled by position, so the result is correct for any index on
    ``plays`` (not only the default ``RangeIndex``), and each new column is
    attached once instead of being grown cell by cell.

    Parameters
    ----------
    plays : pandas.DataFrame
        Frame containing the personnel column; modified in place.
    impute_columns : list
        Extended in place with the name of every column created by this call
        (each name once).
    offense : bool, default True
        Selects the offensive (``True``) or defensive (``False``) column.

    Returns
    -------
    None
    """
    side = 'O' if offense else 'D'
    source_col = 'personnel' + side
    n_rows = len(plays)

    # Column name -> per-row values, in order of first appearance.
    parsed = {}
    for row_pos, cell in enumerate(plays[source_col]):
        if not isinstance(cell, str):
            continue
        for token in cell.split(','):
            fields = token.split()
            if len(fields) < 2:
                # Empty or malformed token (e.g. a trailing comma): nothing to record.
                continue
            name = fields[1] + '_' + side
            if name not in parsed:
                # Start from an existing column of that name, if any, so that
                # only the parsed cells are overwritten.
                if name in plays.columns:
                    parsed[name] = plays[name].tolist()
                else:
                    parsed[name] = [np.nan] * n_rows
            # A position repeated within one cell keeps its last count.
            parsed[name][row_pos] = fields[0]

    # Lists are assigned positionally, so index labels never come into play.
    for name, values in parsed.items():
        plays[name] = values

    impute_columns.extend(parsed)
    plays.drop(source_col, axis=1, inplace=True)

# Expand the defensive personnel column first, then the offensive one; both
# calls mutate `plays` and append the generated column names to `impute_columns`.
get_personnel_position(plays, impute_columns, offense=False)
get_personnel_position(plays, impute_columns, offense=True)

def get_penalty_position(plays,impute_columns):
    """Expand ``plays['penaltyCodes']`` into one indicator column per code, in place.

    Each string cell is split on ``';'``; every resulting code becomes a column
    of that exact name, set to 1 on the rows where the code occurs and left as
    NaN elsewhere. Rows whose cell is not a string (e.g. NaN) contribute
    nothing. The ``penaltyCodes`` column is dropped at the end.

    Rows are handled by position, so the result is correct for any index on
    ``plays`` (not only the default ``RangeIndex``), and each new column is
    attached once instead of being grown cell by cell.

    Parameters
    ----------
    plays : pandas.DataFrame
        Frame containing ``penaltyCodes``; modified in place.
    impute_columns : list
        Extended in place with the name of every indicator column created by
        this call (each name once).

    Returns
    -------
    None
    """
    n_rows = len(plays)

    # Penalty code -> per-row indicator values, in order of first appearance.
    flags = {}
    for row_pos, cell in enumerate(plays['penaltyCodes']):
        if not isinstance(cell, str):
            continue
        for code in cell.split(';'):
            if code not in flags:
                # Start from an existing column of that name, if any, so that
                # only the flagged cells are overwritten.
                if code in plays.columns:
                    flags[code] = plays[code].tolist()
                else:
                    flags[code] = [np.nan] * n_rows
            flags[code][row_pos] = 1

    # Lists are assigned positionally, so index labels never come into play.
    for code, values in flags.items():
        plays[code] = values

    impute_columns.extend(flags)
    plays.drop('penaltyCodes', axis=1, inplace=True)

get_penalty_position(plays, impute_columns)

# De-duplicated names of every column generated by the personnel/penalty helpers.
imput_columns = set(impute_columns)

# pandas rejects a set as an indexer (TypeError in pandas >= 2.0), so index
# with a list; sorting only makes the order deterministic.
generated_columns = sorted(imput_columns)
if generated_columns:
    # A missing value means "position/penalty absent on this play" -> 0.
    # Plain column assignment (rather than .loc[:, cols]) replaces the columns
    # outright, so the integer dtype produced by astype(int) is the one kept.
    plays[generated_columns] = plays[generated_columns].fillna(0).astype(int)

In [None]:
# Attach game-level columns to every play.
data = pd.merge(plays,  games,   on=['gameId'])

# Forward-fill missing clock values from the previous row. Series.ffill()
# replaces the deprecated fillna(method='ffill') with identical results.
# NOTE(review): gameClock is presumably the in-game clock rather than a
# wall-clock time, yet it is parsed here as %H:%M:%S -- confirm this is intended.
play_clock = data['gameClock'].ffill()

# Integer epoch seconds of "<game date> <clock>"; time.mktime interprets the
# parsed value in the machine's local timezone.
data.loc[:,'Play_DateTime'] = [int(time.mktime(datetime.strptime(x, "%m/%d/%Y %H:%M:%S").timetuple())) for x in data['gameDate']+' '+play_clock]

# Both source columns are now folded into Play_DateTime.
data.drop(['gameDate','gameClock'],axis=1,inplace=True)

### Weeks

In [None]:
# Tracking data for week 1 only.
week1_path = '/kaggle/input/nfl-big-data-bowl-2021/week1.csv'
week1 = pd.read_csv(week1_path)

### Join All Data

In [None]:
# Inner-join the play/game frame with the week-1 tracking rows (one row per
# tracked frame), then attach the attributes of the tracked player.
data = (
    data
    .merge(week1, on=['playId', 'gameId'])
    .merge(players, on=['nflId'])
)

### Categorical Data Handling

In [None]:
# Categorical columns that are one-hot encoded individually.
categories = [
    'playType', 'offenseFormation', 'typeDropback', 'passResult',
    'event', 'position_x', 'team', 'playDirection',
]

# Columns holding team abbreviations; they share a single encoder later on.
team_cat = ['homeTeamAbbr', 'visitorTeamAbbr', 'possessionTeam', 'yardlineSide']

# Columns removed before modelling.
drop_columns = [
    'displayName', 'time', 'position_y',
    'route', 'collegeName', 'penaltyJerseyNumbers',
]

# Everything else is treated as numeric; the original column order is kept.
non_numeric = set(team_cat) | set(categories) | set(drop_columns)
numerics = [column for column in data.columns if column not in non_numeric]

In [None]:
# Encode the flag so that 1 means "defensive pass interference" and 0 means
# "none" (the previous mapping was inverted: True -> 0). Values that are
# neither True nor False become NaN.
data["isDefensivePI"] = data["isDefensivePI"].map({True:1, False:0})
data.drop(drop_columns,axis=1,inplace=True)

# Treat a missing category as its own level, named 'nan'.
data.loc[:,categories+team_cat] = data[categories+team_cat].fillna('nan')

for cat in categories:

    # One indicator column per level except the first (drop='first'); the
    # columns are named <cat>_0 .. <cat>_(k-2) by position, not by level.
    ohe       = OneHotEncoder(drop='first')
    ohe_fit   = ohe.fit(data[[cat]])
    cat_count = data[cat].nunique()
    encoded   = pd.DataFrame(ohe_fit.transform(data[[cat]]).toarray(),
                             columns=[cat+'_'+str(x) for x in range(cat_count-1)],
                             # Reuse data's index so concat pairs rows one-to-one
                             # even when data does not carry a default RangeIndex.
                             index=data.index)
    data      = pd.concat([data.drop(cat,axis=1), encoded],axis=1)

In [None]:
# Checkpoint of the frame after per-column encoding (an independent deep copy).
# To roll back to this state, run: data = data_copy.copy()
data_copy = data.copy(deep=True)

In [None]:
# A single encoder is shared by all team-abbreviation columns so that the same
# abbreviation lands in the same indicator position in every column.
ohe = OneHotEncoder(drop='first')
data.loc[:, team_cat] = data[team_cat].fillna('nan')

# Union of every value seen in any of the team columns.
team_set = set().union(*(set(data[column]) for column in team_cat))

team_values = np.asarray(list(team_set)).reshape(-1, 1)
ohe_fit   = ohe.fit(team_values)
cat_count = len(team_set)

for cat in team_cat:
    # Replace the raw column with its indicators, named <cat>_0 .. <cat>_(k-2).
    indicator_names = [cat + '_' + str(position) for position in range(cat_count - 1)]
    remaining = data.drop(cat, axis=1)
    indicators = pd.DataFrame(ohe_fit.transform(data[[cat]]).toarray(),
                              columns=indicator_names)
    data = pd.concat([remaining, indicators], axis=1)

### Numerical Data Handling

In [None]:
# Heights arrive either as a single number or as "<a>-<b>"; the two-part form
# is converted to a*12 + b so both end up on the same scale.
check = data['height'].str.split('-', expand=True)
check.columns = ['first', 'second']

two_part = check['second'].notnull()
major = check.loc[two_part, 'first'].astype(np.int16)
minor = check.loc[two_part, 'second'].astype(np.int16)
check.loc[two_part, 'first'] = major * 12 + minor

# Stored divided by 12, as float32.
data['height'] = check['first'].astype(np.float32) / 12

# BMI with the 703 factor; height is scaled back up by 12 for the formula.
data['bmi'] = ((data['weight']) / ((data['height'] * 12) ** 2)) * 703

In [None]:
# Keep rows where at least one of the two defensive counts is known. The
# explicit copy makes `data` an independent frame, so the assignments below
# do not write into a slice of the previous one.
data = data[(data['defendersInTheBox'].notna()) | (data['numberOfPassRushers'].notna())].copy()

# Within each play, fill gaps from neighbouring rows (forward, then backward).
# transform() always returns a result aligned to data's own index, whereas
# groupby.apply() prepends the group keys to the index in pandas >= 2.0 and
# the assignment would no longer line up.
for count_col in ('defendersInTheBox', 'numberOfPassRushers'):
    data[count_col] = (
        data.groupby(['gameId','playId'])[count_col]
            .transform(lambda s: s.ffill().bfill())
    )

In [None]:
# 'penaltyJerseyNumbers' is part of drop_columns and has normally been removed
# already; errors='ignore' keeps this cell from raising KeyError in that case
# (and on re-runs) while still dropping the column if it is present.
data.drop('penaltyJerseyNumbers',axis=1,inplace=True,errors='ignore')

# Order rows by game and play, renumber them 0..n-1, then fill the remaining
# gaps by interpolating against that new index.
data = data.sort_values(by=['gameId','playId']).reset_index(drop=True).interpolate(method="index")

In [None]:
# Print the name of every numeric column that still contains a missing value.
for num in numerics:
    has_missing = data[num].isna().any()
    if has_missing:
        print(num)

### Min-Max Scaling

In [None]:
# Rescale every numeric column with MinMaxScaler (default settings).
minmax_scaler = MinMaxScaler()
scaled_values = minmax_scaler.fit_transform(data[numerics])
scaled_frame = pd.DataFrame(scaled_values, columns=numerics)
data.loc[:, numerics] = scaled_frame

In [None]:
# Render the fully preprocessed frame as this cell's output.
data