In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from kaggle.competitions import nflrush

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
#datetime
import datetime
#preprocessing
from sklearn import preprocessing 
import tqdm

# Directory holding the competition input files. File names are appended to it
# directly (see the read_csv call), so the trailing slash is required.
PATH = '/kaggle/input/nfl-big-data-bowl-2020/'

In [None]:
# Load the training table from the competition input directory.
train_csv_path = PATH + 'train.csv'
train = pd.read_csv(train_csv_path, low_memory=False)

In [None]:
# Columns that are skipped by the label-encoding step and left out of the model
# feature matrix. 'Yards' is the prediction target and is read separately.
unused_col = ['GameId', 'PlayId', 'Team', 'Yards', 'TimeHandoff', 'TimeSnap']

In [None]:
# Bring the three team-abbreviation columns onto one spelling.
# Four abbreviations are rewritten; every abbreviation that occurs in
# PossessionTeam maps to itself. Anything else becomes NaN through .map().
map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
map_abbr.update({abbr: abbr for abbr in train['PossessionTeam'].unique()})
for team_col in ('PossessionTeam', 'HomeTeamAbbr', 'VisitorTeamAbbr'):
    train[team_col] = train[team_col].map(map_abbr)

In [None]:
# Normalizing Stadium.
# Each rule is (substring to look for, canonical name). Rules are applied one
# after another, in this order, to the whole column.
stadium_rules = [
    ('Met', 'MetLife Stadium'),
    ('Mercedes', 'Mercedes-Benz Stadium'),
    ('M&T', 'M & T Bank Stadium'),
    ('Century', 'CenturyLink Field'),
    ('First', 'First Energy Stadium'),
    ('Paul', 'Paul Brown Stadium'),
    ('NRG', 'NRG Stadium'),
    ('Ever', 'EverBank Field'),
    ('Los Angeles', 'Los Angeles Memorial Coliesum'),
    ('Lambeau', 'Lambeau Field'),
    ('Oakland', 'Oakland Alameda County Coliseum'),
    ('Broncos', 'Broncos Stadium at Mile High'),
    ('Twickenham', 'Twickenham Stadium'),
]
for needle, canonical in stadium_rules:
    matches = train['Stadium'].str.contains(needle)
    train['Stadium'] = np.where(matches, canonical, train['Stadium'])

In [None]:
# Normalizing Location.
# Each rule is (substring to look for, canonical spelling); rules run in order.
location_rules = [
    ('Detroit', 'Detroit, MI'),
    ('Los Angeles', 'Los Angeles, CA'),
    ('Cleveland', 'Cleveland, OH'),
    ('Cincinnati', 'Cincinnati, OH'),
    ('Jacksonville', 'Jacksonville, FL'),
    ('Seattle', 'Seattle, WA'),
    ('Charlotte', 'Charlotte, NC'),
    ('Pittsburgh', 'Pittsburgh, PA'),
    ('Houston', 'Houston, TX'),
    ('Foxborough', 'Foxborough, MA'),
    ('Philadelphia', 'Philadelphia, PA'),
    ('Arlington', 'Arlington, TX'),
    ('Miami Gardens', 'Miami Gardens, FLA'),
    ('London', 'London, England'),
    ('East Rutherford', 'E. Rutherford, NJ'),
    ('New Orleans', 'New Orleans, LA'),
    ('Orchard Park', 'Orchard Park, NY'),
    ('Chicago', 'Chicago, IL'),
    ('Baltimore', 'Baltimore, Md.'),
]
for needle, canonical in location_rules:
    matches = train['Location'].str.contains(needle)
    train['Location'] = np.where(matches, canonical, train['Location'])

In [None]:
# Normalizing StadiumType.
# The lookup is written as canonical label -> raw spellings and then inverted
# into the raw -> canonical dictionary that .map() needs. Raw values that are
# not listed here come out of .map() as NaN.
stadium_type_variants = {
    'Outdoors': ['Outdoors', 'Outdoor', 'Oudoor', 'Outddors', 'Outdor',
                 'Ourdoor', 'Outside', 'Heinz Field'],
    'Indoors': ['Indoors', 'Indoor'],
    'Indoors-1': ['Indoor, Roof Closed'],
    'Indoors-2': ['Indoor, Open Roof'],
    'Dome': ['Dome', 'Domed'],
    'Dome, closed': ['Dome, closed', 'Closed Dome', 'Domed, closed'],
    'Bowl': ['Bowl'],
    'Open': ['Open'],
    'Dome, open': ['Domed, Open', 'Domed, open'],
    'Retr-Roof, closed': ['Retr. Roof-Closed', 'Retr. Roof - Closed', 'Retr. Roof Closed'],
    'Retr-Roof, open': ['Retr. Roof-Open', 'Retr. Roof - Open'],
    'Retractable Roof': ['Retractable Roof'],
    'Outdoor Retr Roof-Open': ['Outdoor Retr Roof-Open'],
    'Cloudy': ['Cloudy'],
}
map_indoor = {
    raw: canonical
    for canonical, raw_spellings in stadium_type_variants.items()
    for raw in raw_spellings
}
train['StadiumType'] = train['StadiumType'].map(map_indoor)

In [None]:
# Normalizing Turf.
# Ordered (substring, replacement) rules. The matching is case-sensitive, which
# is why both 'natural' and 'Natural' are listed and why the later 'grass' rule
# does not touch values already rewritten to 'Natural Grass'.
turf_rules = [
    ('natural', 'Natural Grass'),
    ('Natural', 'Natural Grass'),
    ('grass', 'Grass'),
    ('Field', 'FieldTurf'),
    ('Artificial', 'Artifical'),
    ('UBU', 'UBU Sports Speed S5-M'),
]
for needle, canonical in turf_rules:
    matches = train['Turf'].str.contains(needle)
    train['Turf'] = np.where(matches, canonical, train['Turf'])

In [None]:
# Parse the handoff and snap timestamp strings into datetime values.
timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
for ts_col in ('TimeHandoff', 'TimeSnap'):
    train[ts_col] = train[ts_col].apply(
        lambda raw: datetime.datetime.strptime(raw, timestamp_format)
    )

In [None]:
#Making some features
# One pass over the original columns, in their original order. A few columns
# get hand-built numeric features; every other text column that is not listed
# in `unused_col` is label-encoded. The fitted encoders are stored in
# `lbl_dict` (keyed by column name) so the inference cell can reuse them.
#
# Derived columns are assigned as plain arrays (by position), and the two
# time-difference features are computed with vectorised datetime arithmetic
# instead of a Python-level row-by-row apply over the whole table.
lbl_dict = {}
for c in train.columns:
    if c == 'DefensePersonnel':
        # Only the first character of each comma-separated part is read as the
        # head count, so a two-digit count would be truncated to its first digit.
        arr = [[int(s[0]) for s in value.split(', ')] for value in train['DefensePersonnel']]
        train['DL'] = np.array([a[0] for a in arr])
        train['LB'] = np.array([a[1] for a in arr])
        train['DB'] = np.array([a[2] for a in arr])
    elif c == 'GameClock':
        # First two ':'-separated fields of the clock string.
        arr = [[int(s) for s in value.split(':')] for value in train['GameClock']]
        train['GameClockMin'] = np.array([a[0] for a in arr])
        train['GameClockSec'] = np.array([a[1] for a in arr])
        # NOTE(review): this is minutes + seconds, not minutes * 60 + seconds.
        # It is left as-is on purpose: the inference cell builds TotalSec with
        # the same expression and the model must see identical features at
        # train and test time. Change both places together.
        train['TotalSec'] = np.array([a[0] + a[1] for a in arr])
    elif c == 'PlayerHeight':
        # 'feet-inches' string -> feet (overwrites PlayerHeight), inches, total inches.
        arr = [[int(s) for s in value.split('-')] for value in train['PlayerHeight']]
        train['PlayerHeight'] = np.array([a[0] for a in arr])
        train['PlayerHeight_in'] = np.array([a[1] for a in arr])
        train['TotalInches'] = np.array([a[0] * 12 + a[1] for a in arr])
    elif c == 'PlayerBirthDate':
        # Third '/'-separated field is the birth year.
        arr = [[int(s) for s in value.split('/')] for value in train['PlayerBirthDate']]
        train['BirthY'] = np.array([a[2] for a in arr])
        # Age in years at handoff. Relies on TimeHandoff already being parsed
        # to datetimes by the earlier conversion cell.
        train['PlayerBirthDate'] = train['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))
        seconds_in_year = 60*60*24*365.25
        train['PlayerAge'] = (train['TimeHandoff'] - train['PlayerBirthDate']).dt.total_seconds() / seconds_in_year
    elif c == 'TimeHandoff':
        # Seconds between snap and handoff.
        train['TimeDelta'] = (train['TimeHandoff'] - train['TimeSnap']).dt.total_seconds()
    elif c == 'PlayerWeight':
        #PlayerWeight features.
        train['PlayerWeight_kg'] = train['PlayerWeight'] / 2.205
        train['PlayerWeight_grms'] = train['PlayerWeight'] * 453.592
    elif train[c].dtype=='object' and c not in unused_col:
        # Passing a plain list lets the encoder build its own array from the
        # column values before fitting.
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[c].values))
        lbl_dict[c] = lbl
        train[c] = lbl.transform(list(train[c].values))

In [None]:
# Show the ten columns with the most missing values.
missing_per_column = train.isnull().sum()
missing_per_column.sort_values(ascending=False).head(10)

In [None]:
# Replace every remaining missing value with the sentinel -999.
train = train.fillna(-999)

In [None]:
# Find the player-level columns: those that take more than one distinct value
# within the first 11 rows of the table.
not_candidates = unused_col + ['PlayerBirthDate']
unique_columns = [
    c for c in train.columns
    if c not in not_candidates and len(set(train[c][:11])) != 1
]
for c in unique_columns:
    print(f'{c} is unique')

In [None]:
# Names of the model features, in matrix order: first the columns that are not
# player-level (taken once per play), then every player-level column repeated
# 22 times with the suffixes 0..21.
left_out = unique_columns + unused_col + ["DefensePersonnel", "GameClock", "PlayerBirthDate"]
all_columns = [c for c in train.columns if c not in left_out]
all_columns.extend(c + str(i) for c in unique_columns for i in range(22))
len(all_columns)

In [None]:
# Reshape the long table (22 consecutive rows per play) into one feature row
# per play, laid out in the order of `all_columns`:
#   * play-level columns: value taken from the first row of each play;
#   * player-level columns: the 22 per-player values, side by side.
# The number of plays is derived from the table itself instead of a hard-coded
# row count, and the matrix is built from array slices and reshapes rather than
# one Python-level scalar lookup per cell.
players_per_play = 22
n_plays = len(train) // players_per_play
n_rows_used = n_plays * players_per_play  # ignores a trailing incomplete play, if any

play_level_cols = [c for c in all_columns if c in train]
feature_blocks = []
if play_level_cols:
    # Every 22nd row, starting at row 0, is the first row of a play.
    feature_blocks.append(
        train[play_level_cols].values[:n_rows_used:players_per_play].astype(float)
    )
for c in unique_columns:
    # Consecutive groups of 22 values become the 22 columns c0..c21.
    feature_blocks.append(
        train[c].values[:n_rows_used].astype(float).reshape(n_plays, players_per_play)
    )
train_data = np.hstack(feature_blocks) if feature_blocks else np.zeros((n_plays, 0))

# The layout above must line up with the names in all_columns.
assert train_data.shape == (n_plays, len(all_columns)), train_data.shape

In [None]:
#Creating X and y
# The target is the 'Yards' value of the first row of every 22-row play. The
# number of plays comes from the table length rather than a hard-coded row count.
players_per_play = 22
n_plays = len(train) // players_per_play
y_train_ = train["Yards"].values[:n_plays * players_per_play:players_per_play].copy()
X_train = pd.DataFrame(data=train_data, columns=all_columns)

In [None]:
# Standardise the target. The fitted scaler is kept so that predictions can be
# converted back to yards later.
yards_as_column = y_train_.reshape(-1, 1)
scaler = preprocessing.StandardScaler().fit(yards_as_column)
y_train = scaler.transform(yards_as_column).flatten()

In [None]:
# K-fold training of LightGBM regressors on the standardised target.
# Produces:
#   models       - one fitted regressor per fold (used for inference later)
#   y_valid_pred - out-of-fold prediction for every training play
folds = 10
seed = 222
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
y_valid_pred = np.zeros(X_train.shape[0])
models = []

for tr_idx, val_idx in kf.split(X_train, y_train):
    tr_x, tr_y = X_train.iloc[tr_idx, :], y_train[tr_idx]
    vl_x, vl_y = X_train.iloc[val_idx, :], y_train[val_idx]

    print(len(tr_x), len(vl_x))
    # `missing=-999` was dropped: it is an XGBoost option that LightGBM does not
    # recognise, so the -999 fill value is treated as an ordinary number either
    # way. The unused lgb.Dataset objects were removed as well.
    clf = lgb.LGBMRegressor(max_depth=5, n_estimators=200, learning_rate=0.01,
                            subsample_for_bin=2000)
    clf.fit(tr_x, tr_y,
            eval_set=[(vl_x, vl_y)],
            early_stopping_rounds=20,
            verbose=False)
    # Each validation index occurs in exactly one fold, so plain assignment is enough.
    y_valid_pred[val_idx] = clf.predict(vl_x, num_iteration=clf.best_iteration_)
    models.append(clf)

In [None]:
# Out-of-fold validation score: mean squared difference between the predicted
# and the true cumulative distribution over 199 yardage bins (bin j stands for
# j - 99 yards).
#
# Predicted CDF: a linear ramp from 0 to 1 across the 20 bins centred on the
# rounded predicted yardage, i.e. clip((j + 10 - p) * 0.05, 0, 1).
# True CDF: a step from 0 to 1 at the actual yardage. The step position is
# taken from the integer targets in y_train_ rather than from inverse-transformed
# scaled floats, so a tiny round-trip error cannot move the step by one bin.
yard_bins = np.arange(199)

# inverse_transform is given a 2-D column, matching how the scaler was fitted.
pred_yards = scaler.inverse_transform(y_valid_pred.reshape(-1, 1)).ravel()
pred_center = np.round(pred_yards) + 99
y_pred = np.clip((yard_bins[None, :] + 10 - pred_center[:, None]) * 0.05, 0.0, 1.0)

true_center = np.asarray(y_train_) + 99
y_ans = (yard_bins[None, :] >= true_center[:, None]).astype(float)

print("validation score:", np.sum(np.power(y_pred - y_ans, 2)) / y_pred.size)

In [None]:
# Create the competition environment. Further down it is used to iterate over
# the test batches (iter_test), to submit one prediction per batch (predict)
# and to write the submission file (write_submission_file).
env = nflrush.make_env()

In [None]:
# Inference loop.
# For every batch handed out by the environment: apply the same cleaning and
# feature steps as for the training table, build a single feature row in the
# order of `all_columns`, average the fold models and submit a CDF over the
# 199 yardage bins.
#
# Assumes each batch holds the 22 player rows of one play, in the same row
# order as the training table (the training reshape relies on the same layout).
#
# Points that differ from a literal copy of the training code:
#   * BirthY is written to `test` (it used to be written to `train`, so the
#     feature was always missing at inference time).
#   * Derived columns are assigned by position (plain arrays). A pd.Series with
#     a fresh 0..21 index would be aligned on index labels and silently become
#     NaN whenever the batch index does not start at 0.
#   * Feature values are read by row position inside the batch, so the result
#     does not depend on the index labels of the batch.
#   * A missing column leaves NaN in its own slot instead of shifting every
#     following feature.
#   * Only conversion errors are caught; nothing is swallowed by a bare except.

def _cell_as_float(frame, column, row):
    """Return frame[column] at positional `row` as a float.

    NaN is returned when the column is absent, the row does not exist, or the
    value cannot be converted to a float (e.g. a text value that was not
    label-encoded).
    """
    if column not in frame:
        return np.nan
    values = frame[column].values
    if row >= len(values):
        return np.nan
    try:
        return float(values[row])
    except (TypeError, ValueError):
        return np.nan


players_per_play = 22
timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
seconds_in_year = 60*60*24*365.25
yard_bins = np.arange(199)

# all_columns = play-level names followed by 22 suffixed names per player-level column.
n_play_level = len(all_columns) - players_per_play * len(unique_columns)
play_level_columns = all_columns[:n_play_level]

# Ordered (substring, replacement) rules per text column; same rules and order
# as used for the training table.
test_text_rules = [
    ('Stadium', [
        ('Met', 'MetLife Stadium'),
        ('Mercedes', 'Mercedes-Benz Stadium'),
        ('M&T', 'M & T Bank Stadium'),
        ('Century', 'CenturyLink Field'),
        ('First', 'First Energy Stadium'),
        ('Paul', 'Paul Brown Stadium'),
        ('NRG', 'NRG Stadium'),
        ('Ever', 'EverBank Field'),
        ('Los Angeles', 'Los Angeles Memorial Coliesum'),
        ('Lambeau', 'Lambeau Field'),
        ('Oakland', 'Oakland Alameda County Coliseum'),
        ('Broncos', 'Broncos Stadium at Mile High'),
        ('Twickenham', 'Twickenham Stadium'),
    ]),
    ('Location', [
        ('Detroit', 'Detroit, MI'),
        ('Los Angeles', 'Los Angeles, CA'),
        ('Cleveland', 'Cleveland, OH'),
        ('Cincinnati', 'Cincinnati, OH'),
        ('Jacksonville', 'Jacksonville, FL'),
        ('Seattle', 'Seattle, WA'),
        ('Charlotte', 'Charlotte, NC'),
        ('Pittsburgh', 'Pittsburgh, PA'),
        ('Houston', 'Houston, TX'),
        ('Foxborough', 'Foxborough, MA'),
        ('Philadelphia', 'Philadelphia, PA'),
        ('Arlington', 'Arlington, TX'),
        ('Miami Gardens', 'Miami Gardens, FLA'),
        ('London', 'London, England'),
        ('East Rutherford', 'E. Rutherford, NJ'),
        ('New Orleans', 'New Orleans, LA'),
        ('Orchard Park', 'Orchard Park, NY'),
        ('Chicago', 'Chicago, IL'),
        ('Baltimore', 'Baltimore, Md.'),
    ]),
    ('Turf', [
        ('natural', 'Natural Grass'),
        ('Natural', 'Natural Grass'),
        ('grass', 'Grass'),
        ('Field', 'FieldTurf'),
        ('Artificial', 'Artifical'),
        ('UBU', 'UBU Sports Speed S5-M'),
    ]),
]

for (test, sample_prediction_df) in tqdm.tqdm(env.iter_test(), position=0):

    # --- cleaning, mirroring the training cells -------------------------------
    for team_col in ('PossessionTeam', 'HomeTeamAbbr', 'VisitorTeamAbbr'):
        test[team_col] = test[team_col].map(map_abbr)

    for text_col, rules in test_text_rules:
        for needle, canonical in rules:
            test[text_col] = np.where(test[text_col].str.contains(needle), canonical, test[text_col])

    test['StadiumType'] = test['StadiumType'].map(map_indoor)

    #converting to datetime
    for ts_col in ('TimeHandoff', 'TimeSnap'):
        test[ts_col] = test[ts_col].apply(lambda x: datetime.datetime.strptime(x, timestamp_format))

    # --- feature engineering, mirroring the training cell ---------------------
    for c in test.columns:
        if c == 'DefensePersonnel':
            # First character of each comma-separated part is read as the count.
            arr = [[int(s[0]) for s in value.split(', ')] for value in test['DefensePersonnel']]
            test['DL'] = np.array([a[0] for a in arr])
            test['LB'] = np.array([a[1] for a in arr])
            test['DB'] = np.array([a[2] for a in arr])
        elif c == 'GameClock':
            arr = [[int(s) for s in value.split(':')] for value in test['GameClock']]
            test['GameClockMin'] = np.array([a[0] for a in arr])
            test['GameClockSec'] = np.array([a[1] for a in arr])
            # NOTE(review): minutes + seconds, not minutes * 60 + seconds. Kept
            # identical to the training cell so the model sees the feature it was
            # trained on; change both places together.
            test['TotalSec'] = np.array([a[0] + a[1] for a in arr])
        elif c == 'PlayerHeight':
            arr = [[int(s) for s in value.split('-')] for value in test['PlayerHeight']]
            test['PlayerHeight'] = np.array([a[0] for a in arr])
            test['PlayerHeight_in'] = np.array([a[1] for a in arr])
            test['TotalInches'] = np.array([a[0] * 12 + a[1] for a in arr])
        elif c == 'PlayerBirthDate':
            arr = [[int(s) for s in value.split('/')] for value in test['PlayerBirthDate']]
            # Birth year belongs to the test batch, not to the training table.
            test['BirthY'] = np.array([a[2] for a in arr])
            test['PlayerBirthDate'] = test['PlayerBirthDate'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))
            test['PlayerAge'] = (test['TimeHandoff'] - test['PlayerBirthDate']).dt.total_seconds() / seconds_in_year
        elif c == 'TimeHandoff':
            #Difference in snap and handoff
            test['TimeDelta'] = (test['TimeHandoff'] - test['TimeSnap']).dt.total_seconds()
        elif c == 'PlayerWeight':
            test['PlayerWeight_kg'] = test['PlayerWeight'] / 2.205
            test['PlayerWeight_grms'] = test['PlayerWeight'] * 453.592
        elif (c in lbl_dict and test[c].dtype == 'object' and c not in unused_col
              and not pd.isnull(test[c]).any()):
            # NOTE(review): columns containing a missing value are not encoded
            # here, unlike in the training cell, which encodes every text column
            # unconditionally. Behaviour kept as it was; worth revisiting.
            try:
                test[c] = lbl_dict[c].transform(list(test[c].values))
            except (ValueError, TypeError, KeyError):
                # Typically a label that was not seen during training: mark the
                # whole column as missing (becomes -999 below).
                test[c] = np.nan
    test = test.fillna(-999)

    # --- one feature row in all_columns order ----------------------------------
    test_data = np.full((1, len(all_columns)), np.nan)
    for pos, c in enumerate(play_level_columns):
        test_data[0, pos] = _cell_as_float(test, c, 0)
    pos = n_play_level
    for c in unique_columns:
        for j in range(players_per_play):
            test_data[0, pos] = _cell_as_float(test, c, j)
            pos += 1

    # --- prediction --------------------------------------------------------------
    # Each fold prediction is converted back to yards and rounded, then the
    # rounded values are averaged. inverse_transform gets a 2-D column, matching
    # how the scaler was fitted.
    fold_preds = np.array([model.predict(test_data)[0] for model in models])
    fold_yards = np.round(scaler.inverse_transform(fold_preds.reshape(-1, 1)).ravel())
    y_pred_p = np.sum(fold_yards) / folds + 99

    # Linear ramp from 0 to 1 over the 20 bins centred on the predicted bin.
    y_pred = np.clip((yard_bins + 10 - y_pred_p) * 0.05, 0.0, 1.0)
    env.predict(pd.DataFrame(data=[y_pred], columns=sample_prediction_df.columns))
env.write_submission_file()