In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
import keras
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import math
from string import punctuation

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from kaggle.competitions import nflrush

In [None]:
# You can only call make_env() once, so don't lose it!
# `env` is used further down to iterate over the test batches, submit predictions
# and write the submission file.
env = nflrush.make_env()

In [None]:
# Load the competition training data; low_memory=False makes pandas read the file in one pass
# so that column dtypes are inferred from the whole column rather than chunk by chunk.
train = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
#train.head(23)

In [None]:
# clean windspeed
# Step 1: lower-case the text and strip the "mph" unit; missing values are passed through untouched.
train['WindSpeed'] = train['WindSpeed'].apply(lambda x: x.lower().replace('mph', '').strip() if not pd.isna(x) else x)
# Step 2: a value containing '-' (e.g. "10-20") is replaced by the mean of the integers on either side of the first dash.
train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split('-')[0])+int(x.split('-')[1]))/2 if not pd.isna(x) and '-' in x else x)
# Step 3: a string containing "gusts up to" is replaced by the mean of its first and last whitespace-separated tokens.
# The type check skips the floats already produced by step 2.
train['WindSpeed'] = train['WindSpeed'].apply(lambda x: (int(x.split()[0])+int(x.split()[-1]))/2 if not pd.isna(x) and type(x)!=float and 'gusts up to' in x else x)
    
def str_to_float(txt):
    """Convert ``txt`` to a float, falling back to the sentinel -1.

    Parameters
    ----------
    txt : object
        Anything accepted by ``float()`` (numeric string, int, float, NaN).

    Returns
    -------
    float or int
        ``float(txt)`` when the conversion succeeds, otherwise ``-1``.
    """
    try:
        return float(txt)
    except (TypeError, ValueError):
        # Only conversion failures are treated as "unparseable"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return -1
    
# Final pass: coerce every wind-speed value to float; values float() cannot parse become -1 (see str_to_float).
train['WindSpeed'] = train['WindSpeed'].apply(str_to_float)

In [None]:
# clean wind direction
def clean_WindDirection(txt):
    """Reduce a free-text wind direction to a compact lower-case compass code.

    Missing values come back as ``np.nan``. Otherwise the text is lower-cased,
    punctuation is removed, the word "from" and all spaces are dropped, and the
    spelled-out directions are abbreviated to their initial letter
    (e.g. ``'From S.W.'`` -> ``'sw'``, ``'North-East'`` -> ``'ne'``).
    """
    if pd.isna(txt):
        return np.nan

    cleaned = ''.join(ch for ch in txt.lower() if ch not in punctuation)

    # Applied strictly in this order: filler first, then the four compass words.
    substitutions = (
        ('from', ''),
        (' ', ''),
        ('north', 'n'),
        ('south', 's'),
        ('west', 'w'),
        ('east', 'e'),
    )
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned

# Normalise the raw wind-direction text of every training row to a compact compass code.
train['WindDirection'] = train['WindDirection'].apply(clean_WindDirection)

In [None]:
def transform_WindDirection(txt):
    """Map a cleaned compass code to a number in steps of 1/8.

    The sixteen compass points are numbered clockwise starting at north:
    ``'n'`` -> 0, ``'nne'`` -> 1/8, ``'ne'`` -> 2/8, ... ``'nnw'`` -> 15/8.
    A few transposed spellings (e.g. ``'nen'`` for ``'nne'``) are accepted.
    Missing input and any unrecognised code return ``np.nan``.
    """
    if pd.isna(txt):
        return np.nan

    # Position in this tuple == number of eighths returned for that direction.
    compass_points = (
        ('n',),
        ('nne', 'nen'),
        ('ne',),
        ('ene', 'nee'),
        ('e',),
        ('ese', 'see'),
        ('se',),
        ('ses', 'sse'),
        ('s',),
        ('ssw', 'sws'),
        ('sw',),
        ('sww', 'wsw'),
        ('w',),
        ('wnw', 'nww'),
        ('nw',),
        ('nwn', 'nnw'),
    )
    for eighths, spellings in enumerate(compass_points):
        if txt in spellings:
            # North is returned as the integer 0, every other point as a float fraction.
            return eighths/8 if eighths else 0
    return np.nan

# Replace each compass code by its numeric value (multiples of 1/8); unknown codes become NaN.
train['WindDirection'] = train['WindDirection'].apply(transform_WindDirection)

In [None]:
# clean Turf
# Lookup from every raw surface spelling handled here to 'Natural' or 'Artificial'.
Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 
        'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 
        'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 
        'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 
        'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 

# Spellings missing from the lookup map to NaN, which the comparison below turns into False.
train['Turf'] = train['Turf'].map(Turf)
# Turf ends up as a boolean: True for natural grass, False otherwise.
train['Turf'] = train['Turf'] == 'Natural'

# solve team name encoding problem
# Four teams are translated to a second abbreviation; every abbreviation seen in
# PossessionTeam is then mapped to itself so it survives the .map() calls below.
map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
for abb in train['PossessionTeam'].unique():
    map_abbr[abb] = abb
    
# NOTE(review): an abbreviation absent from map_abbr becomes NaN after .map().
train['PossessionTeam'] = train['PossessionTeam'].map(map_abbr)
train['HomeTeamAbbr'] = train['HomeTeamAbbr'].map(map_abbr)
train['VisitorTeamAbbr'] = train['VisitorTeamAbbr'].map(map_abbr)

# Before pivot:
# Create: IsBallCarrier, ToLeft, std_x, std_y, offense, age, bmi, player number

#train['IsBallCarrier'] = train['NflId'] == train['NflIdRusher']
# Boolean flag: True when PlayDirection is 'left'.
train['ToLeft'] = train['PlayDirection'] == 'left'

In [None]:
def define_offense(df):
    """Label every row as belonging to the offense or the defense.

    A row is ``'offense'`` when its ``Team`` side (``'home'`` / ``'away'``)
    is the side that has possession: the home side when ``HomeTeamAbbr``
    equals ``PossessionTeam``, the away side otherwise. All other rows are
    ``'defense'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``HomeTeamAbbr``, ``PossessionTeam`` and ``Team``.

    Returns
    -------
    pandas.DataFrame
        The same object, with a new string column ``Offense`` added in place.
    """
    # Column-wise comparison: works for any index (the previous per-row
    # ``df[col][i]`` lookup was label based and failed on a non 0..n-1 index).
    home_has_ball = df['HomeTeamAbbr'] == df['PossessionTeam']
    on_offense = (home_has_ball & (df['Team'] == 'home')) | (~home_has_ball & (df['Team'] == 'away'))
    df['Offense'] = np.where(on_offense, 'offense', 'defense')
    return df

# Add the 'Offense' column (offense/defense label) to the training rows.
train = define_offense(train)


In [None]:
def bye_XY(df):
    """Add team-centroid and quarterback distance features to every row.

    Columns appended, in this order:

    * ``x_cen`` / ``y_cen`` - mean ``X`` / ``Y`` of the row's (PlayId, Team) group
    * ``distances`` - Euclidean distance from the row to that centroid
    * ``avg_distance`` - group mean of ``distances``
    * ``qb_x`` / ``qb_y`` - position of the first row of the play whose
      ``Position`` is ``'QB'`` (NaN when the play has no such row)
    * ``distances_to_qb`` - Euclidean distance from the row to that position
    * ``avg_distance_to_qb`` - group mean of ``distances_to_qb``

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayId``, ``Team``, ``X``, ``Y`` and ``Position``.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of the merges); the input is not modified.
    """
    team_keys = ['PlayId', 'Team']

    # Coordinates of each team's centroid on each play.
    x = df.groupby(team_keys, as_index=False)['X'].mean()
    x.columns = ['PlayId', 'Team', 'x_cen']
    df = pd.merge(df, x, how="inner", on=team_keys)

    y = df.groupby(team_keys, as_index=False)['Y'].mean()
    y.columns = ['PlayId', 'Team', 'y_cen']
    df = pd.merge(df, y, how="inner", on=team_keys)

    # Distance between each player and the centroid, computed column-wise
    # instead of one df.loc lookup per row.
    df['distances'] = np.sqrt((df['x_cen'] - df['X'])**2 + (df['y_cen'] - df['Y'])**2)

    # Average distance to the centroid per team and play.
    avg_distance = df.groupby(team_keys, as_index=False)['distances'].mean()
    avg_distance.columns = ['PlayId', 'Team', 'avg_distance']
    df = pd.merge(df, avg_distance, how="inner", on=team_keys)

    # Quarterback position: the first 'QB' row of each play. Built as a new frame
    # rather than de-duplicating a slice in place.
    qb_pos = df.loc[df["Position"] == 'QB', ['PlayId', 'X', 'Y']]
    qb_pos = qb_pos.drop_duplicates(subset="PlayId", keep='first')
    qb_pos.columns = ['PlayId', 'qb_x', 'qb_y']
    df = pd.merge(df, qb_pos, how="left", on=['PlayId'])

    # Distance between each player and the quarterback (NaN when the play has no QB row).
    df['distances_to_qb'] = np.sqrt((df['qb_x'] - df['X'])**2 + (df['qb_y'] - df['Y'])**2)

    # Average distance to the quarterback per team and play.
    avg_distance_to_qb = df.groupby(team_keys, as_index=False)['distances_to_qb'].mean()
    avg_distance_to_qb.columns = ['PlayId', 'Team', 'avg_distance_to_qb']
    df = pd.merge(df, avg_distance_to_qb, how="inner", on=team_keys)
    return df


# Add the centroid / quarterback distance features to the training rows.
train = bye_XY(train)

In [None]:
def Clean_Op(df):
    """Expand the ``OffensePersonnel`` text of each play into per-position counts.

    One row per ``PlayId`` is built from the first ``OffensePersonnel`` value of
    the play. The text is split on commas into five "<count> <position>" pieces,
    each piece is pivoted into one column per position, and the pieces are merged
    back together on ``PlayId``. Positions that appear in more than one piece
    (RB, TE, WR) are summed.

    NOTE(review): the function assumes every play splits into exactly five
    comma-separated pieces and that RB/TE/WR each show up in exactly two of the
    merged pieces - confirm against the data before relying on it. It is not
    called anywhere in the visible notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayId`` and ``OffensePersonnel``.

    Returns
    -------
    pandas.DataFrame
        One row per play with numeric per-position columns and ``PlayId``.
    """
    Op=df.groupby('PlayId',as_index=False).agg({'OffensePersonnel':'first'})
    # Keep the play ids: ``Op`` is overwritten by the pivot below. The original code
    # read them from ``Offense_personal``, a name that is never defined (NameError).
    play_ids=Op['PlayId']
    Op_split=Op.OffensePersonnel.str.split(',',expand=True)
    Op_split.columns=["s1","s2","s3","s4","s5"]
    #pivot s1
    p_s1=Op_split.s1.str.split(' ',expand=True)
    p_s1.columns=['number','position']
    Op=p_s1.pivot(columns='position',values='number')
    Op['PlayId']=play_ids
    #pivot s2-s4
    columns=list(Op_split)
    columns=columns[1:4]
    for i in columns:
        new=Op_split[i].str.split(' ',expand=True)
        # these pieces start with a space, so the first split column is empty
        new=new.drop(new.columns[0], axis=1)
        new.columns=['number','position']
        temp=new.pivot(columns='position',values='number')
        temp['PlayId']=play_ids
        Op=Op.merge(temp,on='PlayId',suffixes=('_left', '_right'))
    #pivot s5
    s5=Op_split.s5.str.split(' ',expand=True)
    s5.columns=['number','position']
    temp=s5.pivot(columns='position',values='number')
    temp['PlayId']=play_ids
    temp=temp.drop(temp.columns[0],axis=1)
    Op=Op.merge(temp,on='PlayId',suffixes=('_left', '_right'))
    #Cleaning the data frame
    Op=Op.replace({np.nan: 0})
    Op=Op.drop([np.nan],axis=1)
    Op=Op.apply(pd.to_numeric)
    Op['RB']=Op['RB_left']+Op['RB_right']
    Op['TE']=Op['TE_left']+Op['TE_right']
    Op['WR']=Op['WR_left']+Op['WR_right']
    Op=Op.drop(['RB_left','RB_right'],axis=1)
    Op=Op.drop(['TE_left','TE_right'],axis=1)
    Op=Op.drop(['WR_left','WR_right'],axis=1)
    
    return Op


In [None]:
# Debugging scratch cell
#import copy
#df = copy.deepcopy(train)
#df = Clean_Op(df)
#df = dis_max(df)
#df.head(30)

In [None]:
def dis_max(df):
    """Add the X and Y extent of each team on each play.

    For every (PlayId, Team) group the range ``max - min`` of ``X`` and of
    ``Y`` is computed and attached to every row of the group as
    ``max_X_distance`` and ``max_Y_distance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayId``, ``Team``, ``X`` and ``Y``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two extent columns appended.
    """
    team_extent=df.groupby(['PlayId','Team'],as_index=False).agg({'X':['max','min'],'Y':['max','min']})
    # Flatten the two-level column index produced by the multi-function agg.
    team_extent.columns=['PlayId','Team','X_max','X_min','Y_max','Y_min']
    team_extent['max_X_distance']=team_extent['X_max']-team_extent['X_min']
    team_extent['max_Y_distance']=team_extent['Y_max']-team_extent['Y_min']
    # Only the two ranges are merged back; the raw max/min columns are discarded.
    extent_features=team_extent[['PlayId','Team','max_X_distance','max_Y_distance']]
    df=pd.merge(df,extent_features,how="inner",on=['PlayId','Team'])
    return df

# Add the per-team X/Y extent features to the training rows.
train = dis_max(train)

In [None]:
def define_team_average_age(df):
    """Add the average player age of each team on each play.

    Age is the calendar-year difference between ``TimeHandoff`` and
    ``PlayerBirthDate`` (months and days are ignored). Its mean per
    (PlayId, Team) is attached to every row as ``team_avg_age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayId``, ``Team``, ``PlayerBirthDate`` and
        ``TimeHandoff``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``team_avg_age`` appended. The input frame is left
        untouched (the previous version left a temporary ``age`` column on it).
    """
    birth_year = pd.to_datetime(df['PlayerBirthDate']).dt.year
    handoff_year = pd.to_datetime(df['TimeHandoff']).dt.year

    # Build the per-player ages in a scratch frame so nothing is written to ``df``.
    # ``.values`` keeps the assignment positional, as in the original np.array() version.
    ages = pd.DataFrame({
        'PlayId': df['PlayId'].values,
        'Team': df['Team'].values,
        'team_avg_age': (handoff_year - birth_year).values,
    })
    team_average_age = ages.groupby(['PlayId','Team'],as_index=False)['team_avg_age'].mean()
    return pd.merge(df,team_average_age,how="inner",on=['PlayId','Team'])

# Add the per-team average age feature to the training rows.
train = define_team_average_age(train)

In [None]:
def define_bmi(df):
    """Replace height and weight by a single body-mass-index column.

    ``PlayerHeight`` is text of the form ``'<feet>-<inches>'``; it is converted
    to inches and combined with ``PlayerWeight`` as
    ``703 * weight / height_in_inches ** 2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayerHeight`` and ``PlayerWeight``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``PlayerHeight`` / ``PlayerWeight`` and with
        ``PlayerBMI`` appended as the last column. The input frame is left
        untouched, so the call can be repeated on the same frame (the previous
        version overwrote ``PlayerHeight`` on the caller's frame).
    """
    height_in = df['PlayerHeight'].apply(lambda h: 12*int(h.split('-')[0])+int(h.split('-')[1]))
    bmi = 703*(df['PlayerWeight']/(height_in)**2)
    return df.drop(['PlayerHeight','PlayerWeight'],axis=1).assign(PlayerBMI=bmi)

# Number the rows 1..11 repeatedly; the later pivot uses this as its column key.
def append_player_number(df):
    """Add a ``player_num`` column that cycles 1, 2, ..., 11 over the rows.

    The number depends only on the row position (``position % 11 + 1``).
    NOTE(review): this presumes the rows arrive in consecutive groups of
    eleven players per team - confirm against the caller's row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place.

    Returns
    -------
    None
    """
    # ``i % 11 + 1`` already equals ``i + 1`` for the first eleven rows, so the
    # former special case and the Python-level loop are not needed.
    df['player_num'] = np.arange(len(df)) % 11 + 1

# Swap height/weight for PlayerBMI, then add the cycling 1..11 player number
# (append_player_number modifies `train` in place and returns nothing).
train = define_bmi(train)
append_player_number(train)

In [None]:
def define_Top10UniversityAlumni(df):
    """Count, per team and play, the players who attended a listed college.

    Every row receives ``SumTop10UniversityAlumni``: the number of rows in its
    (PlayId, Team) group whose ``PlayerCollegeName`` is in the list below.

    NOTE(review): despite the name, the list holds nine colleges.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``PlayId``, ``Team`` and ``PlayerCollegeName``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``SumTop10UniversityAlumni`` appended. The input frame
        is left untouched (the previous version left an ``Alumni`` column on it).
    """
    #According to Pro-football-reference 
    Top10University = ["Notre Dame","USC","Ohio State","Penn State","Michigan","Nebraska","Oklahoma","Alabama","Miami"]

    # 1 for a listed college, 0 otherwise (missing names count as 0); kept in a
    # scratch frame so nothing is written to ``df``.
    alumni = pd.DataFrame({
        'PlayId': df['PlayId'].values,
        'Team': df['Team'].values,
        'SumTop10UniversityAlumni': df['PlayerCollegeName'].isin(Top10University).astype(int).values,
    })
    GroupTop10U = alumni.groupby(['PlayId','Team'],as_index=False)['SumTop10UniversityAlumni'].sum()
    return pd.merge(df,GroupTop10U,how="inner",on=['PlayId','Team'])

# Add the per-team count of players from the listed colleges to the training rows.
train = define_Top10UniversityAlumni(train)


In [None]:
def remove_cat_features(df):
    """Return ``df`` without its object-dtype columns and without ``NflId``.

    Every column whose dtype compares equal to ``'object'`` is dropped, and the
    ``NflId`` column is always dropped as well (it must be present).
    """
    #cat_features = [x for x in cat_features if x not in ('fieldPosition','StadiumType','GameWeather')]
    unwanted = [name for name in df.columns if df[name].dtype == 'object'] + ['NflId']
    return df.drop(unwanted, axis=1)

# Find the columns whose value varies from row to row (the per-player columns).
def find_uni_col(df):
    """List the columns that are not constant over the first eleven rows.

    A column is selected when its first eleven rows do not hold exactly one
    distinct value. ``'PlayId'`` is always appended at the end of the list.
    """
    varying = [name for name in df.columns if len(df[name].iloc[:11].unique()) != 1]
    return varying + ['PlayId']

In [None]:
def clean_data(df):
    """Turn the per-player rows into one wide row per play.

    Steps, in order:

    1. Fill every missing value with -999 (in place, so the caller's frame is
       modified too).
    2. Add the per-(PlayId, Team) standard deviation of X and Y.
    3. Split the rows into offense and defense and drop the object-dtype columns
       and NflId from each half (remove_cat_features).
    4. Use find_uni_col on the offense half to decide which columns vary between
       players; those are pivoted to one column per player_num, the others are
       kept once per play.
    5. Merge the offense pivot, the offense play-level columns and the defense
       pivot on PlayId.

    NOTE(review): the per-player column list is derived from the first eleven
    offense rows only and is then applied to the defense half as well, and
    'JerseyNumber' must be among those columns or ``list.remove`` raises
    ValueError.

    Returns the merged frame.
    """
    df.fillna(-999, inplace=True)
    # inplace: boolean, default False. If True, fill in place.
    # Note: this will modify any other views on this object (e.g. a no-copy slice for a column in a DataFrame).
    
    # Add the X/Y spread (standard deviation per team and play):
    team_spread=df.groupby(['PlayId','Team'],as_index=False).agg({'X':['std'],'Y':['std']})
    team_spread.columns=['PlayId','Team','team_Xspread','team_Yspread']
    df=pd.merge(df,team_spread,how="inner",on=['PlayId','Team'])
    
    # Split the rows into offense and defense
    df_offense = df[(df['Offense']=='offense')]
    df_defense = df[(df['Offense']=='defense')]
    
    # LightGBM can consume categorical features directly, so maybe they do not need to be dropped?
    df_offense = remove_cat_features(df_offense)
    df_defense = remove_cat_features(df_defense)
    
    # Columns that vary between players (plus PlayId), minus the jersey number and
    # the play-level columns listed below.
    uni_a = find_uni_col(df_offense)
    uni_a.remove('JerseyNumber')
    uni_a = [x for x in uni_a if x not in ('YardLine','Down','Distance','Yards')]
    
    df_unique_offense = df_offense[uni_a]
    df_unique_defense = df_defense[uni_a]
    
    # Everything that is not per-player stays as play-level information (PlayId is kept as the key).
    uni_a.remove('PlayId')
    df_no_unique_offense = df_offense.drop(uni_a+['JerseyNumber','GameId'], axis=1)
    df_no_unique_defense = df_defense.drop(uni_a+['JerseyNumber','GameId'], axis=1)
    
    #example_unique=example_unique.drop(['PlayId'],axis=1)
    
    # Note: player_num is actually required here (it is the pivot's column key)
    df_uni_piv_offense = df_unique_offense.pivot(index='PlayId', columns='player_num')
    df_uni_piv_defense = df_unique_defense.pivot(index='PlayId', columns='player_num')
    
    # One play-level row per play.
    df_no_unique_offense = df_no_unique_offense.drop_duplicates(subset='PlayId')

    df_no_unique_defense = df_no_unique_defense.drop_duplicates(subset='PlayId')
    
    df_clean_offense = pd.merge(df_uni_piv_offense,df_no_unique_offense,how='inner',on='PlayId')
    # No need for df_clean_defense=pd.merge(df_uni_piv_defense,df_no_unique_defense,how='inner',on'PlayId) because
    # doing so would list the play-level (field) information twice when df_clean is built below.
    
    df_clean = pd.merge(df_clean_offense,df_uni_piv_defense,how='inner',on='PlayId')
    
    return df_clean

In [None]:
# Replace the per-player training rows by the merged frame returned by clean_data, then display it.
train = clean_data(train)
train

In [None]:
# Snapshot of every column label of the prepared training frame, as a plain Python list.
all_columns=train.columns.values.tolist() 

In [None]:
# Feature columns = every column of the prepared frame except the play key and the target.
# The list is rebuilt from `train` instead of calling list.remove() on the existing list,
# so re-running this cell no longer raises ValueError on the second run.
all_columns = [col for col in train.columns.values.tolist() if col not in ('PlayId', 'Yards')]

In [None]:
# Separate the features (X) from the target (y) first.
X_train=pd.DataFrame(data=train,columns=all_columns)
# Take the target straight from the frame; the former hard-coded range(0, 23171)
# only worked for one exact number of plays.
y_train = train['Yards'].values

# `train` holds one row per play at this point, so every row is one target value.
# The former stride of 22 was a leftover from the 22-rows-per-play layout and kept
# only one play in 22 when estimating the spread.
y = y_train
target = y_train
standard_deviation = np.std(target)

scaler = StandardScaler()
# One reason the categorical features are dropped: scaling would fail on them otherwise.
X_train = scaler.fit_transform(X_train)

In [None]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm


# Number of boosting rounds. The original cell set this three times through aliases of the
# same LightGBM setting ('num_iterations': 500 and 'n_estimators': 300 in params, plus
# num_boost_round=50 in the call). 500 is the value given under the primary name
# `num_iterations`; it is now stated once so the effective value is unambiguous.
# NOTE(review): confirm 500 is the intended budget.
NUM_BOOST_ROUND = 500
# Stop a fold when the validation metric has not improved for this many rounds.
EARLY_STOPPING_ROUNDS = 200

params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'mae',
            'learning_rate': 0.1,
            'verbosity': -1, 
            "boost_from_average" : False,
            'num_leaves': 44,
            'bagging_fraction': 0.8,
            'bagging_freq': 3,
            'min_child_samples': 43,
            'feature_fraction': 0.9,
            'lambda_l1': 0.13413394854686794,  
            'lambda_l2': 0.0009122197743451751,
            'random_state': 42
            }

folds = 5
seed = 999

# NOTE(review): StratifiedKFold stratifies on the raw yardage values; KFold is the usual
# choice for a regression target. Left unchanged so the fold assignment stays the same.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# One booster per fold; the boosters are averaged at prediction time.
models = []

for train_index, val_index in kf.split(X_train, y_train):
    train_X = X_train[train_index]
    val_X = X_train[val_index]
    train_y = y_train[train_index]
    val_y = y_train[val_index]
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y)
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=NUM_BOOST_ROUND,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose_eval = 50)
    models.append(gbm)

print(models)

# 2019-12-02: investigating why the result does not change however the parameters are tuned.
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded.
import scipy.stats
# First build a stand-in for one test batch: the first training row.
fake_test_df = X_train[:1]

# Average the prediction of the fold models.
fake_y_pred = np.mean([model.predict(fake_test_df, num_iteration=model.best_iteration) for model in models],axis=0)
#fake_y_pred = np.clip(np.cumsum(fake_y_pred, axis=1), 0, 1).tolist()[0]

# Column A of the 199-column output holds the normal CDF evaluated at A - 99,
# centred on the predicted value with the target's standard deviation as scale.
yard_grid = np.arange(199) - 99
current_cdf = scipy.stats.norm(loc = float(np.ravel(fake_y_pred)[0]), scale = standard_deviation).cdf(yard_grid)
fake_pred_df = current_cdf.reshape(1, 199)  # 1 row, 199 columns

fake_final_pred_df = pd.DataFrame(data=fake_pred_df)
fake_final_pred_df

num iteration: 1000
num of fold = 5
early stop = 300
training's l1: 3.4802	valid_1's l1: 3.51941

num iteration: 500
fold = 5
early stop = 200
training's l1: 3.4802	valid_1's l1: 3.51941

num iteration: 500
fold = 10
early stop = 200
training's l1: 3.47776	valid_1's l1: 3.50066
training's l1: 3.48447	valid_1's l1: 3.4561
training's l1: 3.47314	valid_1's l1: 3.49199


最早的参数：
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'l2_root'},
            'subsample': 0.25,
            'subsample_freq': 1,
            'learning_rate': 0.1,
            'num_iterations': 500,
            'num_leaves': 31,
            'feature_fraction': 0.8,
            'lambda_l1': 1,  
            'lambda_l2': 1
            }

11/25/2019 3:20pm 改动之前的参数：
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'mae',
            'learning_rate': 0.005,
            'num_iterations': 500,
            'verbosity': -1, 
            "boost_from_average" : False,
            'num_leaves': 44,
            'bagging_fraction': 0.9999128827046064,
            'bagging_freq': 3,
            'min_child_samples': 43,
            'n_estimators': 300,
            'feature_fraction': 0.4271070738920401,
            'lambda_l1': 0.13413394854686794,  
            'lambda_l2': 0.0009122197743451751,
            'random_state': 42
            }
改动过的参数结果较好

11/25/2019 3:51pm 
尝试用multiclass这个objective function; 同时设置number of class:199，改变metric为multi logloss;
出现错误：Label must be in [0, 199), but found -4 in label

In [None]:
import tqdm
import scipy
import scipy.stats as st
    
def _parse_wind_speed(value):
    """Turn one raw WindSpeed entry into a float, using the same rules as the training cell.

    Text is lower-cased and stripped of "mph"; "a-b" becomes the mean of a and b;
    "x gusts up to y" becomes the mean of x and y; everything else goes through
    str_to_float. Non-string input (a test batch can deliver the column as numbers)
    is passed to str_to_float directly instead of crashing on ``.lower()``.
    Returns -1, the str_to_float sentinel, when a range cannot be parsed.
    """
    if not isinstance(value, str):
        return str_to_float(value)
    text = value.lower().replace('mph', '').strip()
    try:
        if '-' in text:
            low, high = text.split('-')[:2]
            return (int(low) + int(high)) / 2
        if 'gusts up to' in text:
            tokens = text.split()
            return (int(tokens[0]) + int(tokens[-1])) / 2
    except ValueError:
        return -1
    return str_to_float(text)


# Same surface lookup as used for the training data; built once instead of once per batch.
Turf = {'Field Turf':'Artificial', 'A-Turf Titan':'Artificial', 'Grass':'Natural', 'UBU Sports Speed S5-M':'Artificial', 
        'Artificial':'Artificial', 'DD GrassMaster':'Artificial', 'Natural Grass':'Natural', 
        'UBU Speed Series-S5-M':'Artificial', 'FieldTurf':'Artificial', 'FieldTurf 360':'Artificial', 'Natural grass':'Natural', 'grass':'Natural', 
        'Natural':'Natural', 'Artifical':'Artificial', 'FieldTurf360':'Artificial', 'Naturall Grass':'Natural', 'Field turf':'Artificial', 
        'SISGrass':'Artificial', 'Twenty-Four/Seven Turf':'Artificial', 'natural grass':'Natural'} 

# Output column A corresponds to A - 99 yards.
yard_grid = np.arange(199) - 99

batch_no = 0
for (test_df, sample_prediction_df) in tqdm.tqdm(env.iter_test()):
    try:
        # The former first statement here read a column 'own_field' that nothing creates;
        # it raised on every batch, so the fallback below was always used and the models
        # were never consulted. Its result was not used inside the loop, so it is removed.
        test_df['WindSpeed'] = test_df['WindSpeed'].apply(_parse_wind_speed)

        test_df['WindDirection'] = test_df['WindDirection'].apply(clean_WindDirection)
        test_df['WindDirection'] = test_df['WindDirection'].apply(transform_WindDirection)

        test_df['Turf'] = test_df['Turf'].map(Turf)
        test_df['Turf'] = test_df['Turf'] == 'Natural'

        # solve team name encoding problem
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        for abb in test_df['PossessionTeam'].unique():
            map_abbr[abb] = abb

        test_df['PossessionTeam'] = test_df['PossessionTeam'].map(map_abbr)
        test_df['HomeTeamAbbr'] = test_df['HomeTeamAbbr'].map(map_abbr)
        test_df['VisitorTeamAbbr'] = test_df['VisitorTeamAbbr'].map(map_abbr)

        # Before pivot:
        # Create: IsBallCarrier, ToLeft, std_x, std_y, offense, age, bmi, player number
        #train['IsBallCarrier'] = train['NflId'] == train['NflIdRusher']
        test_df['ToLeft'] = test_df['PlayDirection'] == 'left'

        test_df = define_offense(test_df)
        test_df = bye_XY(test_df)
        test_df = dis_max(test_df)
        test_df = define_team_average_age(test_df)

        test_df = define_bmi(test_df)
        append_player_number(test_df)
        test_df = define_Top10UniversityAlumni(test_df)

        ## final test data:
        test_df = clean_data(test_df)
        # Select exactly the training feature columns, in training order (mirrors how
        # X_train is built); a column missing from this batch becomes NaN.
        X_test = pd.DataFrame(data=test_df, columns=all_columns)
        # Re-use the statistics fitted on the training data. fit_transform would refit the
        # scaler on this single play and turn every feature into 0.
        X_test = scaler.transform(X_test)
        y_pred = np.mean([model.predict(X_test, num_iteration=model.best_iteration) for model in models],axis=0)
        #y_pred = np.clip(np.cumsum(y_pred, axis=1), 0, 1).tolist()[0] # this line seems to be wrong

        #y_pred_p = models.predict(test_df)
        #y_pred_first = y_pred_p[0]

    except Exception as exc:
        # Deliberate best effort: a prediction must be submitted for every batch, so fall
        # back to a constant - but report the failure instead of hiding it.
        print('batch {}: feature pipeline failed ({!r}); using fallback prediction'.format(batch_no, exc))
        #y_pred_first = 1
        y_pred = 1

    # Normal CDF centred on the predicted yardage, evaluated on the whole yard grid at once.
    predicted_yards = float(np.ravel(y_pred)[0])
    pred_df = scipy.stats.norm(loc = predicted_yards, scale = standard_deviation).cdf(yard_grid).reshape(1, 199)

    #pred_df[0][:80] = 0

    final_pred_df = pd.DataFrame(data=pred_df, columns=sample_prediction_df.columns)
    env.predict(final_pred_df)
    batch_no += 1

env.write_submission_file()

def get_score(y_pred,cdf,w,dist_to_end):
    """Shift a 199-point CDF so that it is centred on ``y_pred`` instead of ``w``.

    ``y_pred`` is truncated to an integer. The curve is moved right by
    ``int(y_pred) - w`` positions (padding the left with zeros) or left by the
    opposite amount (padding the right with ones); with no shift it is copied.
    The last entry and every entry from index ``dist_to_end + 99`` onwards are
    then set to 1. The input ``cdf`` is not modified.
    """
    shift = int(y_pred) - w
    if shift > 0:
        shifted = np.zeros(199)
        shifted[shift:] = cdf[:-shift]
    elif shift < 0:
        shifted = np.ones(199)
        shifted[:shift] = cdf[-shift:]
    else:
        shifted = cdf.copy()

    # Probability must reach 1 at the end of the grid and beyond the distance to the end zone.
    shifted[-1] = 1
    shifted[(dist_to_end+99):] = 1
    return shifted

# Scratch code left after the submission has been written.
# NOTE(review): this section cannot run as it stands in the visible notebook:
#   - the column 'own_field' is not created by any visible code,
#   - `X_test` and `cdf` are not defined anywhere in the visible source.
# Per row: 100 - YardLine when own_field == 1, otherwise YardLine.
dist_to_end_test = test_df.apply(lambda x:(100 - x.loc['YardLine']) if x.loc['own_field']==1 else x.loc['YardLine'],axis=1)

# Average of the first prediction of each model; the divisor 5 is hard-coded
# (presumably the number of folds - confirm).
pred_value = 0
for model in models:
    pred_value += model.predict(X_test)[0]/5
# Shift the reference CDF (w = 4) onto the predicted value and reshape it to one row of 199 columns.
pred_data = list(get_score(pred_value,cdf,4,dist_to_end_test.values[0]))
pred_data = np.array(pred_data).reshape(1,199)
# Wrap the row in a frame with the same index and columns as the sample prediction.
pred_target = pd.DataFrame(index = sample_prediction_df.index, \
                               columns = sample_prediction_df.columns, \
                               #data = np.array(pred_data))
                               data = pred_data)
