In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
from datetime import datetime as dt

### 読み込み

In [None]:
df_input = pd.read_csv("/kaggle/input/nfl-big-data-bowl-2020/train.csv")

In [None]:
df_all = df_input.copy()

In [None]:
df_all.head()

In [None]:
# Columns that hold per-player information.
# (Per the original note, every other column is identical for all rows of one PlayId.)
personal_columms = ['X', 'Y', 'S', 'A', 'Dis', 'Orientation', 'Dir', 
                    'NflId', 'DisplayName', 'JerseyNumber', 
                   'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate','PlayerCollegeName', 'Position']

In [None]:
df_all.shape

In [None]:
pd.set_option("display.max_columns", 80)

In [None]:
df_all.describe()

In [None]:
df_all["Yards"].hist(range=(-20,50),bins=70)

# 前処理

### 表記ゆれ確認

In [None]:
# 列によりチーム名の表記揺れ
print(sorted(df_all['PossessionTeam'].unique()))
print(sorted(df_all['HomeTeamAbbr'].unique()))
print(sorted(df_all['VisitorTeamAbbr'].unique()))


In [None]:
# 風速の表記いろいろ
df_all["WindSpeed"].unique()

In [None]:
import re

In [None]:
def is_float(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Parameters
    ----------
    s : object
        Usually a string token; any object is accepted.

    Returns
    -------
    bool
        False for text that is not a number (``float`` raises ValueError) and
        for objects that cannot be converted at all, such as ``None``
        (``float`` raises TypeError).
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
# 芝の種類の表記いろいろ
df_all["Turf"].unique()

In [None]:
df_all["GameWeather"].unique()

In [None]:
df_all["WindDirection"].unique()

In [None]:
re.split('[-/ ]', 'From SSW north'.replace("From ", ""))[0]

In [None]:
def preprocess(df):
    """Clean the raw tracking frame and derive play-level features.

    The input is copied first, so the caller's frame is never left partially
    modified; use the returned frame.

    Columns added:
      * ``stadiumtype``      - 1 outdoor, 0 indoor, 2 unclassifiable (int)
      * ``weather``          - 0..5 keyword class, 6 when no keyword matches
      * ``natural_turf``     - True when ``Turf`` is a natural-grass spelling
      * ``yardsToTouchdown`` - derived from ``YardLine`` / ``FieldPosition``

    Columns rewritten: ``PlayerHeight`` (inches), ``GameClock`` (seconds as
    parsed by ``pd.Timedelta``), ``TimeSnap`` (datetime), ``WindSpeed``
    (float), and every cell equal to one of the alternative team
    abbreviations ARZ/BLT/CLV/HST.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows with at least StadiumType, GameWeather, Turf, PlayerHeight,
        GameClock, TimeSnap, WindSpeed, YardLine, PossessionTeam and
        FieldPosition.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    df = df.copy()

    # StadiumType: collapse the free-text spellings into outdoor / indoor.
    outdoor=['Outdoor', 'Outdoors','Open','Indoor, Open Roof','Outdoor Retr Roof-Open', 'Oudoor',
             'Ourdoor','Retr. Roof-Open','Outdor',
               'Retr. Roof - Open', 'Domed, Open', 'Domed, open', 'Outside','Heinz Field']
    indoor=['Indoors', 'RetractableRoof', 'Indoor','Retr. Roof-Closed','Dome', 'Domed, closed',
            'Indoor, Roof Closed', 'Retr. Roof Closed','Closed Dome','Dome, closed','Domed']
    is_indoor = df['StadiumType'].isin(indoor)
    # Start from the "unclassifiable" code so the column is integer from the
    # outset; filling NaN and casting through ``df.loc[:, col] = ...`` can
    # leave a float column on recent pandas versions.
    df['stadiumtype'] = 2
    df.loc[df['StadiumType'].isin(outdoor), 'stadiumtype'] = 1
    df.loc[is_indoor, 'stadiumtype'] = 0

    # Weather: the first keyword group (lowest index) found in the text wins.
    weather_keywords = [['indoor', 'controlled'],  # 0
                        ['snow'],                  # 1
                        ['clear', 'sun', 'fair'],  # 2
                        ['cloudy', 'overcast'],    # 3
                        ['hazy'],                  # 4
                        ['rain', 'shower'],        # 5
                        ]
    unknown_weather = len(weather_keywords)        # 6: nothing matched

    def weather_class(description):
        text = str(description).lower()
        for class_id, keywords in enumerate(weather_keywords):
            if any(word in text for word in keywords):
                return class_id
        return unknown_weather

    df["weather"] = df["GameWeather"].apply(weather_class)
    # Indoor stadiums are always class 0, whatever the weather text says.
    df.loc[is_indoor, "weather"] = 0

    # Turf: natural grass or not.
    natural_turf = ['Grass', 'Natural Grass', 'Natural grass', 'grass', 'Natural', 'Naturall Grass', 'natural grass']
    df["natural_turf"] = df["Turf"].isin(natural_turf)

    # Height "feet-inches" -> total inches.
    def height_in_inches(text):
        parts = text.split('-')
        return 12 * int(parts[0]) + int(parts[1])

    df['PlayerHeight'] = df['PlayerHeight'].apply(height_in_inches)

    # NOTE(review): pd.Timedelta reads "a:b:c" as hours:minutes:seconds. If
    # GameClock is actually minutes:seconds:00 the result is 60x too large -
    # confirm against the data description before using this column.
    df['GameClock'] = [pd.Timedelta(val).total_seconds() for val in df['GameClock']]

    # Snap time -> datetime.
    df["TimeSnap"] = pd.to_datetime(df["TimeSnap"], format="%Y-%m-%dT%H:%M:%S.000Z")

    # Unify team abbreviations (applied to every cell of the frame).
    df = df.replace({'ARZ':'ARI', 'BLT':'BAL', 'CLV':'CLE', 'HST':'HOU'})

    # Wind speed: mean of all numbers found in the text ("11-17" -> 14.0);
    # NaN when the text contains no number. The empty case is handled
    # explicitly because np.mean([]) returns NaN only after a RuntimeWarning.
    def wind_speed(value):
        numbers = [float(token) for token in re.split('[^0-9.]+', str(value)) if is_float(token)]
        return np.mean(numbers) if numbers else np.nan

    df["WindSpeed"] = df["WindSpeed"].apply(wind_speed)
    # Indoor stadiums are treated as having no wind.
    df.loc[df['StadiumType'].isin(indoor), "WindSpeed"] = 0

    # Yards to the goal line: YardLine as is, or 100 - YardLine when the
    # possession team is also the FieldPosition team.
    df["yardsToTouchdown"] = df["YardLine"]
    df.loc[df["PossessionTeam"] == df["FieldPosition"], "yardsToTouchdown"] = 100 - df["YardLine"]

    return df

In [None]:
df_all = preprocess(df_all)

### 座標データの整理

In [None]:
def preprocess_coord(df):
    """Normalise tracking coordinates in place and return the same frame.

    Steps, in order:
      1. Shift ``Orientation`` by -90 degrees for Season 2017 rows
         (see https://www.kaggle.com/ben519/understanding-x-y-dir-and-orientation).
      2. For plays with ``PlayDirection == "left"``, mirror X (120 - X) and
         Y (53.3 - Y) and turn ``Dir`` / ``Orientation`` by 180 degrees.
      3. Subtract ``110 - yardsToTouchdown`` from X.
      4. Replace ``S`` and ``Dir`` by the components ``Sy = cos(Dir) * S`` and
         ``Sx = sin(Dir) * S``, and replace ``Orientation`` by its cosine.

    The frame is modified in place (``S`` and ``Dir`` are dropped), so the
    function cannot be applied twice to the same frame.
    """
    season_2017 = df["Season"] == 2017
    df.loc[season_2017, "Orientation"] = (df.loc[season_2017, "Orientation"] - 90) % 360

    moving_left = df["PlayDirection"] == "left"
    df.loc[moving_left, "X"] = 120 - df.loc[moving_left, "X"]
    df.loc[moving_left, "Y"] = 53.3 - df.loc[moving_left, "Y"]  # field width: 53.3 yards
    for angle_column in ("Dir", "Orientation"):
        df.loc[moving_left, angle_column] = (df.loc[moving_left, angle_column] + 180) % 360

    x_shift = 110 - df["yardsToTouchdown"]
    df["X"] = df["X"] - x_shift

    # Degrees -> radians, then speed components.
    heading = df["Dir"] * np.pi / 180
    df["Sy"] = heading.map(np.cos) * df["S"]
    df["Sx"] = heading.map(np.sin) * df["S"]
    df.drop(["S", "Dir"], axis=1, inplace=True)

    facing = df["Orientation"] * np.pi / 180
    df["Orientation"] = facing.map(np.cos)

    return df

In [None]:
df_all = preprocess_coord(df_all)

### ボールを持っている選手の行のみを選択

In [None]:
df_play = df_all[df_all["NflId"]==df_all["NflIdRusher"]].copy()

In [None]:
df_play["Position"].value_counts()

# 追加の特徴量

### home/away → 攻撃/守備

In [None]:
def get_team_score(df_play):
    """Add rusher-side / defence-side scores and their difference.

    ``Team`` says whether the row's player is on the home or the away team.
    The function writes three columns into ``df_play`` and returns it:
    ``rusherTeamScore``, ``defenceTeamScore`` and
    ``diffScore = rusherTeamScore - defenceTeamScore``. Rows whose ``Team`` is
    neither "home" nor "away" stay NaN.
    """
    home_score = df_play["HomeScoreBeforePlay"]
    visitor_score = df_play["VisitorScoreBeforePlay"]

    # (new column, value for home rows, value for away rows)
    assignments = (
        ("rusherTeamScore", home_score, visitor_score),
        ("defenceTeamScore", visitor_score, home_score),
    )
    for column, when_home, when_away in assignments:
        df_play.loc[df_play["Team"] == "home", column] = when_home
        df_play.loc[df_play["Team"] == "away", column] = when_away

    score_gap = df_play["rusherTeamScore"] - df_play["defenceTeamScore"]
    df_play.loc[:, "diffScore"] = score_gap

    return df_play

In [None]:
df_play = get_team_score(df_play)

In [None]:
def player_team_name(df_play):
    """Add ``PlayerTeamAbbr``: the abbreviation of the team each row's player is on.

    Rows with ``Team == "away"`` get ``VisitorTeamAbbr``; all other rows get
    ``HomeTeamAbbr``. The column is written into ``df_play``, which is returned.
    """
    on_away_team = df_play["Team"] == "away"
    own_team = df_play["HomeTeamAbbr"].mask(on_away_team, df_play["VisitorTeamAbbr"])
    df_play.loc[:, "PlayerTeamAbbr"] = own_team
    return df_play

def team_name(df_play):
    """Add the rusher's team and the opposing team abbreviations.

    ``PlayerTeamAbbr`` is ``VisitorTeamAbbr`` for rows with ``Team == "away"``
    and ``HomeTeamAbbr`` otherwise. ``DefenceTeamAbbr`` is ``VisitorTeamAbbr``
    for rows with ``Team == "home"`` and ``HomeTeamAbbr`` otherwise. Both
    columns are written into ``df_play``, which is returned.
    """
    home_abbr = df_play["HomeTeamAbbr"]
    visitor_abbr = df_play["VisitorTeamAbbr"]

    rusher_is_away = df_play["Team"] == "away"
    rusher_is_home = df_play["Team"] == "home"

    df_play.loc[:, "PlayerTeamAbbr"] = home_abbr.mask(rusher_is_away, visitor_abbr)
    df_play.loc[:, "DefenceTeamAbbr"] = home_abbr.mask(rusher_is_home, visitor_abbr)

    return df_play

In [None]:
df_all = player_team_name(df_all)
df_play = team_name(df_play)

In [None]:
# offence_position = ['WR', 'TE', 'T', 'QB', 'RB', 'G', 'C', 'FB', 'HB',  'OT', 'OG', ]
# defence_position = ['SS', 'DE', 'ILB', 'FS', 'CB', 'DT', 'OLB', 'NT', 'MLB', 'LB', 'S', 'DL', 'DB', 'SAF']

In [None]:
# df_all.loc[:, "offence"] = 0
# df_all.loc[df_all["Position"].isin(offence_position), "offence"] = 1

In [None]:
# Offensive-player flag: 1 when the player's own team (PlayerTeamAbbr) equals
# PossessionTeam, 0 otherwise.
df_all.loc[:, "offense"] = (df_all["PlayerTeamAbbr"]==df_all["PossessionTeam"])*1

### ポジション毎の人数

In [None]:
df_all["Position"].unique()

In [None]:
# これは使わない
def count_position(df_play):
    """Parse the personnel strings and add head-count columns per position group.

    (The notebook marks this helper as currently unused.)

    ``OffensePersonnel`` / ``DefensePersonnel`` hold text such as
    ``"1 RB, 1 TE, 3 WR"``. Both columns are replaced by dictionaries
    ``{"RB": 1, "TE": 1, "WR": 3}`` and integer columns ``LB``, ``DB`` (from
    the defence) and ``RB``, ``TE``, ``WR`` (from the offence) are added.
    A group that is not mentioned in a personnel string counts as 0 instead
    of raising KeyError.

    Parameters
    ----------
    df_play : pandas.DataFrame
        One row per play with string columns ``OffensePersonnel`` and
        ``DefensePersonnel``; modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df_play`` itself.
    """
    def parse_personnel(text):
        # "4 DL, 3 LB, 4 DB" -> {"DL": 4, "LB": 3, "DB": 4}
        counts = {}
        for item in text.split(","):
            tokens = item.split()
            counts[tokens[-1]] = int(tokens[-2])
        return counts

    groups_by_side = (
        ("DefensePersonnel", ["LB", "DB"]),        # "DL" is not extracted
        ("OffensePersonnel", ["RB", "TE", "WR"]),  # "OL" / "QB" are not extracted
    )

    for personnel_column, _ in groups_by_side:
        df_play[personnel_column] = df_play[personnel_column].apply(parse_personnel)

    for personnel_column, position_groups in groups_by_side:
        for position in position_groups:
            df_play[position] = [counts.get(position, 0) for counts in df_play[personnel_column]]

    return df_play

In [None]:
# df_play = count_position(df_play)

In [None]:
# 守備選手がrusherになる事例アリ
df_play[df_play["Position"].isin(["CB", "DT", "DE"])][[ "PlayId", "DisplayName", "Position", 
    "Team", "PossessionTeam", "HomeTeamAbbr", "VisitorTeamAbbr"]]
# PossessionTeamは守備選手の側

In [None]:
# 守備選手が攻撃に参加している
df_all[df_all["PlayId"]==20181007081701][[ "PlayId", "DisplayName", 
    "Team", "PossessionTeam", "HomeTeamAbbr", "VisitorTeamAbbr", "Position"]]

## 選手・チームの平均獲得ヤード

In [None]:
def average_yards(df_play):
    """Summarise ``Yards`` per team, per defending team and per rusher, by season.

    Parameters
    ----------
    df_play : pandas.DataFrame
        One row per play with Season, Team, PlayerTeamAbbr, DefenceTeamAbbr,
        NflId and Yards.

    Returns
    -------
    team_yards_df : pandas.DataFrame
        Index (Team, PlayerTeamAbbr); columns (Season, "mean" | "std").
    defence_yards_df : pandas.DataFrame
        Index (Team, DefenceTeamAbbr); columns (Season, "mean" | "std").
        ``Team`` is still the home/away side of the rusher.
    personal_yards : pandas.DataFrame
        Index NflId; columns (Season, "mean" | "std" | "count").
        A player-season with a single carry (std is NaN) is discarded. A
        player who then has data for only some seasons gets, for each missing
        season, that season's average "mean" and "std" over all players and a
        "count" of 0.
    """
    def season_stats(team_column):
        # Select Yards before aggregating so non-numeric columns are never
        # passed to mean/std.
        stats = df_play.groupby(["Season", "Team", team_column])["Yards"].agg(["mean", "std"])
        return stats.unstack(level=0).swaplevel(0, 1, axis=1)

    team_yards_df = season_stats("PlayerTeamAbbr")
    defence_yards_df = season_stats("DefenceTeamAbbr")

    personal_yards = df_play.groupby(["Season", "NflId"])["Yards"].agg(["mean", "std", "count"])
    # Drop single-carry player-seasons while the rows are still one per
    # (Season, NflId). Dropping after the unstack would remove every player
    # who is missing from any season, leaving nothing for the fill below.
    personal_yards = personal_yards.dropna().unstack(level=0)

    # Columns are (statistic, Season) here; fill the seasons a player missed.
    for column in list(personal_yards.columns):
        fill_value = 0 if column[0] == "count" else personal_yards[column].mean()
        personal_yards[column] = personal_yards[column].fillna(fill_value)

    personal_yards = personal_yards.swaplevel(0, 1, axis=1)

    return team_yards_df, defence_yards_df, personal_yards

In [None]:
team_yards_df, defence_yards_df, personal_yards = average_yards(df_play)

## Rusher以外の選手の情報

In [None]:
# Map the raw Position labels onto a smaller canonical set (the original note
# says 14 positions, following Wikipedia; the values below contain 15 distinct labels).
position_rename = {
 "SAF":"S", "FS":"S", "SS":"S", "S":"S", "DB":"S", # "DB" could be "CB" or "S"; treated as "S"
    "CB":"CB", 
    "ILB":"MLB", "MLB":"MLB", 
    "OLB":"OLB", "LB":"OLB", # "LB" could be "ILB" or "OLB"; treated as "OLB"
    "NT":"DT", "DT":"DT", 
    "DE":"DE", "DL":"DE", # "DL" could be "DE" or "DT"; treated as "DE"

    "C":"C",
    "G":"OG", "OG":"OG", 
    "T":"OT",  "OT":"OT",
    "WR":"WR", 
    "TE":"TE",
    "QB":"QB", 
    "FB":"FB",
    "HB":"HB",
    "RB":"RB", #
}
# Ambiguous coarse labels in the raw data: DL, DB, LB

In [None]:
df_all["position"] = df_all["Position"].map(position_rename)
df_play["position"] = df_play["Position"].map(position_rename)

In [None]:
# ポジション名の細かい分類と粗い分類が混ざっているので統一
def group_position_offense(position):
    """Map a canonical position label to an offensive group: "OL", "RB" or "WR".

    Defensive labels are mapped too, because defenders occasionally line up
    on offence. An unrecognised label prints a message and returns NaN.
    """
    offense_groups = (
        # OL (normally 5); DT/DE lining up on offence count as linemen
        ("OL", ('C', 'OT', 'OG', 'DT', 'DE')),
        # backfield (normally 1 RB + 1 QB); MLB/S on offence count as backs
        ("RB", ('RB', 'FB', 'HB', "QB", "MLB", 'S')),
        # receivers (WR 2-3, TE 1-2); OLB/CB on offence count as receivers
        ("WR", ("WR", "TE", "OLB", 'CB')),
    )

    for group_label, members in offense_groups:
        if position in members:
            return group_label

    print("unknown offense position name!{}".format(position))
    return np.nan

def group_position_defense(position):
    """Map a canonical position label to a defensive group: "DL", "LB", "S" or "CB".

    Offensive labels are mapped too, for offensive players who appear on the
    defending side. An unrecognised label prints a message and returns NaN.
    """
    defense_groups = (
        ("DL", ('DT', 'DE', 'C', 'OT', 'OG')),       # line (4-)
        ("LB", ('MLB', 'OLB')),                      # linebackers (3-)
        ("S", ('S', 'RB', 'FB', 'HB', "QB")),        # safeties (2)
        ("CB", ('CB', "WR", "TE")),                  # cornerbacks (2+)
    )

    for group_label, members in defense_groups:
        if position in members:
            return group_label

    print("unknown defense position name!{}".format(position))
    return np.nan
    

In [None]:
df_all.loc[df_all["offense"]==1, "position"] = df_all.loc[df_all["offense"]==1, "position"].apply(group_position_offense)
df_all.loc[df_all["offense"]==0, "position"] = df_all.loc[df_all["offense"]==0, "position"].apply(group_position_defense)


In [None]:
position_stats_columms=['X', 'Y', 'A', 'Dis','PlayerHeight', 'PlayerWeight', 'Sx', 'Sy', 'Orientation']
position_corrY_columms=['X', 'A', 'Dis','PlayerHeight', 'PlayerWeight', 'Sx', 'Sy', 'Orientation']

In [None]:
# 以下の３関数のうち、１つを選んで使う

In [None]:
# 各ボジション毎の、特徴量の平均・分散等
def position_stats(df): 
    """Per-play summary statistics of every non-rusher player, by position group.

    Rows where ``NflId == NflIdRusher`` are excluded. The remaining rows are
    grouped by (PlayId, position) over the columns listed in the module-level
    ``position_stats_columms``.

    Returns a frame indexed by PlayId with columns named
    ``<position>_<feature>_<stat>`` (count / mean / std / skew, filtered as
    described below) and ``<position>_<feature>_corrY`` (correlation of the
    feature with Y inside the group). Missing values are replaced by 0.
    """
    groups = df[(df["NflId"]!=df["NflIdRusher"])][
        ["PlayId", "position", *position_stats_columms]].groupby(["PlayId", "position"])
    # head count, mean, standard deviation, skewness
    position_mean = groups.agg(["count", "mean", "std", "skew"]).unstack(level=1)
    # flatten the (feature, stat, position) MultiIndex into "position_feature_stat"
    position_mean.columns = [ col[2]+"_"+col[0]+"_"+col[1] for col in position_mean.columns]
    
    cols = [col for col in position_mean.columns 
        if ("X_count" in col) # count is the same for every feature, so keep only the one for X
        or ("mean" in col) # keep the mean of every feature
        or ("std" in col and re.findall("OL|WR|DL|LB|S_|CB", col)) # RB is normally a single player, so its std is not used
        or ("skew" in col and re.findall("OL|DL", col))] # skew is not used for groups that normally have three players or fewer
    
    # correlation with Y (whether the feature is larger towards one sideline or the other)
    position_corr = groups.corr()["Y"].unstack().drop("Y", axis=1).reset_index()
    # only groups that normally have three or more players
    position_corr = position_corr.loc[position_corr["position"].isin(["OL", "DL", "LB", "WR"])]
    position_corr = position_corr.pivot(index="PlayId", columns="position", values=position_corrY_columms) 
    # flatten the (feature, position) MultiIndex into "position_feature_corrY"
    position_corr.columns = [ col[1]+"_"+col[0]+"_corrY" for col in position_corr.columns]
    
    # note: this local name shadows the function name inside the body only
    position_stats = position_mean[cols].join(position_corr)
    position_stats.fillna(0, inplace=True)
    
    return position_stats    

In [None]:
def average_personal_data(df_play, df_all):
    """Attach per-play team averages of height, weight, speed and acceleration.

    For every PlayId the offensive players (flag == 1) and the defensive
    players (flag == 0) of ``df_all`` are averaged separately and merged onto
    ``df_play`` as ``<feature>_offence`` / ``<feature>_defence`` columns.

    The side flag is read from the ``offense`` column, which is the name the
    notebook creates; the older spelling ``offence`` is accepted as a
    fallback. When the speed column ``S`` has already been replaced by its
    components ``Sx`` / ``Sy``, the speed is rebuilt as ``hypot(Sx, Sy)``.

    Parameters
    ----------
    df_play : pandas.DataFrame
        One row per play, with a ``PlayId`` column.
    df_all : pandas.DataFrame
        One row per player and play; not modified.

    Returns
    -------
    pandas.DataFrame
        ``df_play`` left-merged with the averaged columns.
    """
    side_flag = "offense" if "offense" in df_all.columns else "offence"

    players = df_all
    if "S" not in players.columns and {"Sx", "Sy"}.issubset(players.columns):
        players = players.assign(S=np.hypot(players["Sx"], players["Sy"]))

    feature_columns = ["PlayerHeight", "PlayerWeight", "S", "A"]

    # Offence first, then defence, so the merged columns keep that order.
    for flag_value, suffix in ((1, "offence"), (0, "defence")):
        side_mean = (players.loc[players[side_flag] == flag_value, feature_columns + ["PlayId"]]
                     .groupby("PlayId").mean())
        side_mean.columns = ["{}_{}".format(name, suffix) for name in feature_columns]
        df_play = df_play.merge(side_mean, on="PlayId", how="left")

    return df_play

In [None]:
# 全選手のデータを入れる場合
def map_position(df):
    """Order the non-rusher players of ONE play into fixed positional slots.

    Intended to be called per play (``groupby("PlayId").apply``). Players are
    split into four groups - offensive back row (OB), offensive front row
    (OL), defensive back row (DB), defensive front row (DL) - and within each
    group sorted by Y. The groups are concatenated in the order OB, OL, DB,
    DL with a fresh 0..n-1 index.

    ``offset`` (squared distance from the point X = 0, Y = mean Y of the
    players) decides who is "inside": surplus players of an over-full group
    are moved to the neighbouring group.

    NOTE(review): the group tests compare the raw ``Position`` column with
    "OL", "DL", "LB", "DB", "QB", "RB", "TE" and "WR". Confirm that these are
    the labels actually present in ``Position``; labels outside this set are
    silently left out of the result.
    """
    # Copy the slice so that adding the helper column never writes into a
    # view of the caller's frame.
    df = df.loc[df["NflId"]!=df["NflIdRusher"], ["Position", *position_stats_columms]].copy()
    df["offset"] = (df["Y"]-df["Y"].mean())**2 + df["X"]**2

    # Offence: OL (front row) and everybody else (back row).
    OL = df[df["Position"]=="OL"].sort_values("offset")
    OB = df[df["Position"].isin(["QB", "RB", "TE", "WR"])].sort_values("offset")

    # Defence: box (front row) and DB (back row).
    DL = df[df["Position"].isin(["DL", "LB"])].sort_values("offset")
    DB = df[df["Position"]=="DB"].sort_values("offset")

    if len(OL)>5:  # more than 5 linemen: the outermost ones count as back row
        OB = pd.concat([OB, OL.iloc[5:]])
        OL = OL.iloc[:5]
    elif len(OB)>5:  # more than 5 backs: the innermost ones count as front row
        OL = pd.concat([OL, OB.iloc[:-5]])
        # Keep the five outermost backs. This previously sliced OL, which
        # discarded the real back row; the defensive branch below shows the
        # intended pattern.
        OB = OB.iloc[-5:]

    if len(DL)>7:
        DB = pd.concat([DB, DL.iloc[7:]])
        DL = DL.iloc[:7]
    elif len(DB)>4:
        DL = pd.concat([DL, DB.iloc[:-4]])
        DB = DB.iloc[-4:]

    # Within each group, order by increasing Y.
    OB, OL, DB, DL = OB.sort_values("Y"), OL.sort_values("Y"), DB.sort_values("Y"), DL.sort_values("Y")

    return pd.concat([OB, OL, DB, DL]).drop("offset", axis=1).reset_index(drop=True)

def process_positiondata(df_all):
    """Build one wide row per play from the slot-ordered players of ``map_position``.

    ``map_position`` is applied to each PlayId group; the ``Position`` column
    is dropped, the slot number is moved into the columns, gaps are filled
    with 0 and the columns are renamed ``<feature>_<slot>``.
    """
    slotted_players = df_all.groupby("PlayId").apply(map_position)

    wide = slotted_players.drop("Position", axis=1)
    wide = wide.unstack(level=1)
    wide = wide.fillna(0)

    flat_names = []
    for feature, slot in wide.columns:
        flat_names.append(feature + "_" + str(slot))
    wide.columns = flat_names

    return wide

In [None]:
#from datetime import datetime as dt

In [None]:
#print(dt.now())
#map_position_df = df_all.groupby("PlayId").apply(map_position)
#print(dt.now())

#map_position_df.to_pickle("map_position_df.pickle")

In [None]:
# 下記３行のどれか一つのみ使う

# df_play = average_personal_data(df_play, df_all)
# df_play = df_play.merge(process_positiondata(df_all), on="PlayId", how="left")
df_play = df_play.merge(position_stats(df_all), on="PlayId", how="left")


In [None]:
# 以下の３関数のうち、１つを選んで使う

In [None]:
# average_yards関数で作った３つのうち、前年度のデータを、元のデータとmerge
def merge_lastYearAv(df_play, team_yards_df, defence_yards_df, personal_yards, year):
    """Merge the previous season's averages onto the plays of ``year``.

    Only rows with ``Season == year`` are returned. Statistics of season
    ``year - 1`` are taken from the three frames built by ``average_yards``
    and left-merged as ``team_yards_av/std``, ``def_yards_av/std`` and
    ``player_yards_av/std/count``.
    """
    previous_season = year - 1

    offence_stats = team_yards_df[[previous_season]]
    offence_stats.columns = ["team_yards_av", "team_yards_std"]

    defence_stats = defence_yards_df[[previous_season]]
    defence_stats.columns = ["def_yards_av", "def_yards_std"]

    player_stats = personal_yards[previous_season].rename(
        columns={'mean': 'player_yards_av', 'std': 'player_yards_std',
                 'count': 'player_yards_count'})

    season_plays = df_play[df_play["Season"]==year]
    with_offence = season_plays.merge(offence_stats, on=["Team","PlayerTeamAbbr"], how="left")
    with_defence = with_offence.merge(defence_stats, on=["Team","DefenceTeamAbbr"], how="left")

    return with_defence.merge(player_stats, on="NflId", how="left")

In [None]:
# average_yards関数で作った３つを、２年分平均して元のデータとmerge
def merge_twoYearAv(df_play, team_yards_df, defence_yards_df, personal_yards):
    """Merge the 2017/2018 two-season averages onto every play.

    For each of the three frames built by ``average_yards`` the 2017 and 2018
    blocks are averaged element-wise (including the player "count") and
    left-merged as ``team_yards_av/std``, ``def_yards_av/std`` and
    ``player_yards_av/std/count``.
    """
    def two_season_mean(stats):
        return (stats[2017]+stats[2018])/2

    offence_stats = two_season_mean(team_yards_df)
    offence_stats.columns = ["team_yards_av", "team_yards_std"]

    defence_stats = two_season_mean(defence_yards_df)
    defence_stats.columns = ["def_yards_av", "def_yards_std"]

    player_stats = two_season_mean(personal_yards).rename(
        columns={'mean': 'player_yards_av', 'std': 'player_yards_std',
                 'count': 'player_yards_count'})

    merged = df_play.merge(offence_stats, on=["Team","PlayerTeamAbbr"], how="left")
    merged = merged.merge(defence_stats, on=["Team","DefenceTeamAbbr"], how="left")
    merged = merged.merge(player_stats, on="NflId", how="left")

    return merged

In [None]:
# average_yards関数で作った３つのうち、2017,2018を互いに別年度のデータとmerge
def merge_anotherYearAv(df_play, team_yards_df, defence_yards_df, personal_yards):
    """Merge each season's plays with the averages of the OTHER season.

    2017 plays receive the 2018 statistics and 2018 plays receive the 2017
    statistics, taken from the three frames built by ``average_yards``. The
    result is the 2017 block followed by the 2018 block; each block keeps its
    own 0-based index, so index values repeat.
    """
    player_column_names = {'mean': 'player_yards_av', 'std': 'player_yards_std',
                           'count': 'player_yards_count'}

    merged_blocks = []
    for play_season, stats_season in ((2017, 2018), (2018, 2017)):
        offence_stats = team_yards_df[[stats_season]]
        offence_stats.columns = ["team_yards_av", "team_yards_std"]

        defence_stats = defence_yards_df[[stats_season]]
        defence_stats.columns = ["def_yards_av", "def_yards_std"]

        player_stats = personal_yards[stats_season].rename(columns=player_column_names)

        block = df_play[df_play["Season"]==play_season]
        block = block.merge(offence_stats, on=["Team","PlayerTeamAbbr"], how="left")
        block = block.merge(defence_stats, on=["Team","DefenceTeamAbbr"], how="left")
        block = block.merge(player_stats, on="NflId", how="left")
        merged_blocks.append(block)

    return pd.concat(merged_blocks)

In [None]:
# 選手・チーム毎の平均獲得ヤード数を結合
# merge_anotherYearAv(別の年の平均), merge_twoYearAv(２年分の平均) のどちらか一方を使う

# df_year = merge_anotherYearAv(df_play, team_yards_df, defence_yards_df, personal_yards)
#df_year = merge_twoYearAv(df_play, team_yards_df, defence_yards_df, personal_yards)
# チーム名、NflIdを入れるなら外すべきか
df_year = df_play

In [None]:
df_year.shape

In [None]:
df_year.columns

In [None]:
import pickle

In [None]:
df_year.to_pickle("df_year.pickle")

In [None]:
#df_year = pd.read_pickle("df_year.pickle")

# 使うデータを選ぶ

In [None]:
# 欠測確認
df_year.isnull().sum().sort_values().tail(15)

In [None]:
def select_columns(df):
    """Drop the columns the model does not use and fill numeric gaps.

    Steps:
      1. Drop columns with missing values that are not used as features.
      2. Fill the remaining NaNs of numeric columns with the column mean.
         ``numeric_only=True`` is passed explicitly because taking the mean of
         a frame that still holds string columns raises TypeError on current
         pandas.
      3. Drop free-text columns, raw time columns, keys and columns that
         duplicate ``yardsToTouchdown`` / ``rusherTeamScore`` / ``diffScore``.

    Deliberately kept: Team, position, PlayerTeamAbbr, DefenceTeamAbbr,
    TimeSnap, NflId and the target ``Yards``.

    Parameters
    ----------
    df : pandas.DataFrame
        Play-level frame containing every column named below; not modified.

    Returns
    -------
    pandas.DataFrame
        Feature frame, still including ``Yards``.
    """
    # Columns with missing values that are not used.
    unused_with_gaps = ["FieldPosition", "StadiumType", "GameWeather", "WindDirection", "DefendersInTheBox"]
    df_u = df.drop(unused_with_gaps, axis=1)

    # Remaining numeric gaps (e.g. Humidity, Temperature, WindSpeed) -> column mean.
    df_u = df_u.fillna(df_u.mean(numeric_only=True))

    unused_columns = [
        # free text
        "DisplayName", "PossessionTeam",
        "OffensePersonnel", "DefensePersonnel", "PlayDirection",
        "PlayerBirthDate", "PlayerCollegeName",
        "HomeTeamAbbr", "VisitorTeamAbbr",
        "Location", "Turf", "Position",
        "OffenseFormation",
        # clock / timestamps (TimeSnap is kept)
        "GameClock", "TimeHandoff",
        # keys and numbers (NflId is kept)
        "GameId", "PlayId", "JerseyNumber", "NflIdRusher",
        # duplicated by yardsToTouchdown, rusherTeamScore and diffScore
        "YardLine", "VisitorScoreBeforePlay", "HomeScoreBeforePlay", "defenceTeamScore",
    ]
    df_X = df_u.drop(unused_columns, axis=1)

    return df_X

In [None]:
# pd.set_option("display.max_rows", 100)

In [None]:
df_X = select_columns(df_year)

df_y = df_X["Yards"]
df_X = df_X.drop(["Yards"], axis=1)

In [None]:
# Columns treated as categorical, grouped by how they are stored.
# ('Quarter' and 'Down' are currently not in any of the lists.)
# 'Quarter', 'Down', 
cat_int = ['stadiumtype', 'weather', 'natural_turf',]
cat_num = ['NflId', 'Season', ]
cat_str = ['Team', 'position', "Stadium", "PlayerTeamAbbr", "DefenceTeamAbbr"] # 'OffenseFormation', 

# Positions of those columns inside df_X, sorted ascending.
# Raises ValueError if any listed name is missing from df_X.columns.
categorical_features_indices = sorted([list(df_X.columns).index(n) for n in cat_int+cat_num+cat_str])

# Yard数の分類モデルとして学習

In [None]:
df_ycopy = df_y.copy()

In [None]:
# 分類で使うヤード数の最大最小
max_yards = 20
min_yards = -5

In [None]:
print( len(df_y.loc[df_y>=max_yards]) )
print( len(df_y.loc[df_y<=min_yards]) )

## 最大最小の外

In [None]:
df_count = df_ycopy.value_counts().sort_index()

In [None]:
from scipy.optimize import curve_fit

In [None]:
# 20yards以上の確率分布は指数関数と仮定
# 20~37yardsの観測分布をfitして係数を決める
# logを取ってから線形fitした方が、直接expでfitするより上手くいった
def linear_fit(x, a, b):
    """Straight line ``a*x + b``; the model function handed to ``curve_fit``."""
    scaled = a * x
    return scaled + b

# Fit a straight line to log(count) versus yards, using only the observed
# counts between max_yards and 37 yards.
array_x= df_count.loc[max_yards:37].index
array_y= np.log(df_count.loc[max_yards:37].values)
param, cov = curve_fit(linear_fit, array_x, array_y)

# Tail weights exp(slope * yards) for max_yards..99. The fitted intercept
# param[1] is not needed because the weights are normalised to sum to 1 below.
large_probs = pd.DataFrame(np.exp(np.array(range(max_yards,100))*param[0]), 
                           columns=["value"], index=np.array(range(max_yards,100)))
large_probs = large_probs/large_probs.sum()

In [None]:
# Same exponential fit for the losses between -14 yards and min_yards.
# Below -14 yards the probability is taken to be zero (per the original note,
# the training data has no such play).
# Note: this overwrites `param` and `cov` from the large-gain fit.
array_x= df_count.loc[-14:min_yards].index
array_y= np.log(df_count.loc[-14:min_yards].values)
param, cov = curve_fit(linear_fit, array_x, array_y)

# Tail weights exp(slope * yards) for -14..min_yards, normalised to sum to 1.
small_probs = pd.DataFrame(np.exp(np.array(range(-14, min_yards+1))*param[0]), 
                           columns=["value"], index=np.array(range(-14, min_yards+1)))
small_probs = small_probs/small_probs.sum()

In [None]:
array_y_fit = array_x * param[0] + param[1]

plt.scatter(array_x, array_y)
plt.plot(array_x, array_y_fit)

In [None]:
#df_use.loc[:, "Yards_class"] = df_use["Yards"]
df_y.loc[df_y>=max_yards] = max_yards
df_y.loc[df_y<=min_yards] = min_yards

In [None]:
# from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [None]:
from catboost import CatBoostClassifier, Pool


### パラメータチューニング

In [None]:
# Fixed seed: without it every kernel restart draws a different hold-out set,
# so scores from different runs could not be compared.
train_X, test_X, train_y, test_y, train_ycopy, test_ycopy = train_test_split(
    df_X, df_y, df_ycopy, test_size=0.3, random_state=0)

In [None]:
# lightGBM は分類クラス番号が0or正しかダメなので
df_y0 = df_y-min_yards
train_y0 = train_y-min_yards
test_y0 = test_y-min_yards

In [None]:
from sklearn.model_selection import KFold
# kf = KFold(n_splits=4)

### 評価指標を作る

In [None]:
def calc_score(pred_y, test_y, yardsToTouchdown):
    """Expand class probabilities to the -99..99 yard grid and score them.

    Parameters
    ----------
    pred_y : array-like, shape (n_plays, max_yards - min_yards + 1)
        Predicted probability of each clipped yard class, in the order
        min_yards .. max_yards.
    test_y : array-like, length n_plays
        True (unclipped) yards gained.
    yardsToTouchdown : array-like, length n_plays
        Distance to the goal line, in the same row order as ``pred_y``.

    Returns
    -------
    CRPS : float
        Mean squared difference between the predicted and the true
        cumulative distributions over all plays and all 199 yard values.
    pred_df_all : pandas.DataFrame
        Predicted cumulative distribution, columns "Yards-99" .. "Yards99".
    test_df : pandas.DataFrame
        True step function (0 below the outcome, 1 from the outcome on).

    Uses the module-level ``min_yards``, ``max_yards``, ``large_probs`` and
    ``small_probs``.
    """
    yards_axis = np.arange(-99, 100)

    pred_df = pd.DataFrame(pred_y)
    pred_df.columns = [ "Yards"+str(n) for n in range(min_yards,max_yards+1)]

    # Re-index onto the full grid; yards outside the modelled range start as NaN.
    pred_df_all = pd.DataFrame(pred_df,columns=[ "Yards"+str(n) for n in range(-99,100)])

    # The mass of the two boundary classes is spread over the tails with the
    # fitted exponential weights.
    pred_df_all.loc[:, "Yards"+str(max_yards):] = \
            (pred_df_all.loc[:, ["Yards"+str(max_yards)]]).dot(large_probs.values.T).values
    pred_df_all.loc[:, "Yards-14":"Yards"+str(min_yards)] = \
            (pred_df_all.loc[:, ["Yards"+str(min_yards)]]).dot(small_probs.values.T).values

    pred_df_all = pred_df_all.fillna(0)
    pred_df_all = pred_df_all.cumsum(axis=1)

    # A play cannot gain more than the distance to the goal line: from that
    # yardage on the cumulative probability is 1. One broadcast comparison
    # replaces the former row-by-row loop.
    goal_distance = np.asarray(yardsToTouchdown).reshape(-1, 1)
    past_goal_line = yards_axis.reshape(1, -1) >= goal_distance
    pred_df_all = pred_df_all.mask(past_goal_line, 1.0)

    # True cumulative distribution: 1 where the grid value is >= the outcome.
    actual = np.asarray(test_y).reshape(-1, 1)
    test_df = pd.DataFrame((yards_axis.reshape(1, -1) >= actual)*1,
                           columns=pred_df_all.columns)

    CRPS=((pred_df_all-test_df)**2).mean().mean()
    return CRPS, pred_df_all, test_df

In [None]:
from six.moves import xrange

In [None]:
import math

In [None]:
class CRPSMetric(object):
    """Custom CatBoost evaluation metric: CRPS over the ordered yard classes.

    The raw per-class scores are turned into class probabilities with a
    softmax, accumulated into a cumulative distribution and compared with the
    step function of the true class. Lower is better.

    NOTE(review): this assumes ``target`` holds 0-based class indices in the
    same order as ``approxes`` - confirm against how the labels are encoded.
    """

    def get_final_error(self, error, weight):
        """Return the weighted mean error; the tiny constant guards against a zero weight sum."""
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """CRPS is an error measure, so smaller values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted CRPS of all objects.

        Parameters
        ----------
        approxes : sequence of indexed containers
            One container per class (only ``__len__`` / ``__getitem__`` are
            relied on), each holding one raw score per object.
        target : indexed container
            True class index per object.
        weight : indexed container or None
            Per-object weights; ``None`` means weight 1 for every object.

        Returns
        -------
        (float, float)
            Weighted error sum and sum of weights.
        """
        assert len(approxes) > 0
        assert len(target) == len(approxes[0])

        n_class = len(approxes)
        n_objects = len(target)

        # Copy element by element: the containers are not guaranteed to be
        # numpy arrays.
        scores = np.array([[class_scores[i] for i in range(n_objects)]
                           for class_scores in approxes], dtype=float)
        labels = np.array([target[i] for i in range(n_objects)], dtype=float)
        if weight is None:
            weights = np.ones(n_objects)
        else:
            weights = np.array([weight[i] for i in range(n_objects)], dtype=float)

        # Softmax over classes; subtracting the per-object maximum keeps
        # exp() from overflowing on large scores.
        scores = scores - scores.max(axis=0, keepdims=True)
        probabilities = np.exp(scores)
        probabilities = probabilities / probabilities.sum(axis=0, keepdims=True)
        predicted_cdf = np.cumsum(probabilities, axis=0)

        # True CDF: 0 below the observed class, 1 from the observed class on.
        class_axis = np.arange(n_class).reshape(-1, 1)
        true_cdf = (class_axis >= labels.reshape(1, -1)).astype(float)

        per_object_error = ((predicted_cdf - true_cdf) ** 2).mean(axis=0)

        return float((per_object_error * weights).sum()), float(weights.sum())

In [None]:
# 結果plot関数
def plot_pred(pred_df_all, test_df):
    """Draw three diagnostic figures comparing predicted and true distributions.

    1. Mean cumulative distribution (prediction, then truth).
    2. Mean per-yard probability, obtained by differencing neighbouring
       columns of the cumulative frames.
    3. (sum of predicted - sum of true per-yard probability) divided by the
       square root of the true sum, with a zero reference line.

    Every figure is limited to -15..50 yards on the x axis.
    """
    yards_axis = np.arange(-99, 100)

    # Figure 1: cumulative distributions.
    for cumulative in (pred_df_all, test_df):
        plt.plot(yards_axis, cumulative.mean())
    plt.xlim(-15, 50)
    plt.show()

    # Per-yard probability = difference between neighbouring cumulative columns.
    pred_prob_df = pred_df_all - pred_df_all.shift(axis=1)
    test_prob_df = test_df - test_df.shift(axis=1)

    # Figure 2: per-yard probabilities.
    for per_yard in (pred_prob_df, test_prob_df):
        plt.plot(yards_axis, per_yard.mean())
    plt.xlim(-15, 50)
    plt.show()

    # Figure 3: scaled residual of the expected counts.
    predicted_total = pred_prob_df.sum()
    observed_total = test_prob_df.sum()
    plt.plot(yards_axis, (predicted_total - observed_total) / np.sqrt(observed_total))
    plt.plot(yards_axis, np.zeros(199))
    plt.xlim(-15, 50)

    plt.show()

### 適当なパラメータで試す

In [None]:
# データセットの作成。Poolで説明変数、目的変数、
# カラムのデータ型を指定できる
# train_pool = Pool(train_X, train_y0, cat_features=categorical_features_indices)
# validate_pool = Pool(test_X, test_y0, cat_features=categorical_features_indices)

In [None]:
# model = CatBoostClassifier(iterations=100, 
#            eval_metric=CRPSMetric(), 
#            task_type="GPU",
#          learning_rate=0.3
#                          )

In [None]:
#model.fit(train_pool, 
#         eval_set=validate_pool, 
#         verbose=True, 
#          use_best_model=True
#         )

In [None]:
# preds = model.predict_proba(test_X)

In [None]:
# CRPS, pred_df_all, test_df = calc_score(preds, test_ycopy, test_X["yardsToTouchdown"].values)
# CRPS

In [None]:
# plot_pred(pred_df_all, test_df)

In [None]:
# importance = pd.DataFrame(model.feature_importances_, index=df_X.columns, columns=['importance'])
# importance.sort_values("importance", ascending=False).head(20)

## Optuna でチューニング

In [None]:
import optuna

In [None]:

def objective(trial):
    """Optuna objective: hold-out CRPS of a CatBoost classifier.

    Samples the hyper-parameters, trains on the module-level
    ``train_X`` / ``train_y0`` split and scores the predictions for
    ``test_X`` with ``calc_score``.

    NOTE: a later cell defines another ``objective`` (the K-fold version);
    when the notebook is run top to bottom that definition replaces this one.

    Parameters
    ----------
    trial : optuna.trial.Trial

    Returns
    -------
    float
        CRPS on the hold-out set (lower is better).
    """
    # Search space.
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 4, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.003, 0.3),
        'random_strength': trial.suggest_loguniform('random_strength', 0.3, 30),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 10.00),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.3, 30.0),
    }

    # Build the training pool here: the only other place that creates
    # `train_pool` is a commented-out cell, so relying on a global raised
    # NameError on a fresh kernel.
    train_pool = Pool(train_X, train_y0, cat_features=categorical_features_indices)

    model = CatBoostClassifier(**params)
    model.fit(train_pool, verbose=False)

    preds = model.predict_proba(test_X)
    CRPS, pred_df_all, test_df = calc_score(preds, test_ycopy, test_X["yardsToTouchdown"].values)

    return CRPS



'''
    kfold = KFold(n_splits=3, random_state=23)    
    scores = []
    for train, test in kfold.split(df_X):
        train_X = df_X.iloc[train]
        train_y0 = df_y0.iloc[train]
        val_X = df_X.iloc[test]
        val_y0 = df_y0.iloc[test]
        
        train_pool = Pool(train_X, train_y0, cat_features=categorical_features_indices)
        validate_pool = Pool(test_X, test_y0, cat_features=categorical_features_indices)
        
        model = CatBoostClassifier(iterations=100, 
                          custom_metric=)
        model.fit(train_pool, 
                 eval_set=validate_pool, 
                 verbose=True, 
         )
        preds = model.predict_proba(test_X)

        CRPS, pred_df_all, test_df = calc_score(preds, df_ycopy.iloc[test], val_X["yardsToTouchdown"].values)
        scores.append(CRPS)
'''
                                             


In [None]:

def objective(trial):
    """Optuna objective: mean CRPS of a CatBoost classifier over 3 folds.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample `iterations`, `depth`, `learning_rate`,
        `random_strength`, `bagging_temperature`, `l2_leaf_reg` and `has_time`.

    Returns
    -------
    float
        Mean of the per-fold CRPS values returned by `calc_score`.

    Notes
    -----
    Reads the notebook-level `df_X`, `df_y0`, `df_ycopy` and
    `categorical_features_indices`.
    """
    # Search space for the hyperparameters being optimised.
    params = {
        'iterations': trial.suggest_int('iterations', 200, 2000),
        'depth': trial.suggest_int('depth', 6, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.003, 0.1),
        'random_strength': trial.suggest_loguniform('random_strength', 0.1, 1),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.1, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1, 20.0),
        'has_time': trial.suggest_categorical('has_time', [True, False]),
    }

    # `random_state` is intentionally not passed: without shuffle=True it has no
    # effect on the (contiguous) folds, and scikit-learn >= 0.24 raises
    # ValueError when it is set while shuffle is False.
    kfold = KFold(n_splits=3)
    scores = []
    for train, test in kfold.split(df_X):
        train_X = df_X.iloc[train]
        train_y0 = df_y0.iloc[train]
        test_X = df_X.iloc[test]

        train_pool = Pool(train_X, train_y0, cat_features=categorical_features_indices)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, verbose=False)

        preds = model.predict_proba(test_X)
        # calc_score returns three values; only the CRPS is needed here.
        crps, _, _ = calc_score(preds, df_ycopy.iloc[test], test_X["yardsToTouchdown"].values)
        scores.append(crps)
    return np.mean(scores)


'''

        
        model = CatBoostClassifier(iterations=100, 
                          custom_metric=)
        model.fit(train_pool, 
                 eval_set=validate_pool, 
                 verbose=True, 
         )
        preds = model.predict_proba(test_X)

        CRPS, pred_df_all, test_df = calc_score(preds, df_ycopy.iloc[test], val_X["yardsToTouchdown"].values)
        
'''
                                             



In [None]:
# print(dt.now())
# study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5))
# study.optimize(objective, n_trials = 50)


In [None]:
#         'iterations' : trial.suggest_int('iterations', 50, 300),                         
#        'depth' : trial.suggest_int('depth', 4, 12),                                       
#        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.003, 0.3),               
#        'random_strength' :trial.suggest_loguniform('random_strength', 0.3, 30),                       
#        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.1, 10.00), 
#        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.3, 30.0),
#49 resulted in value: 0.012788437110175058. Current best value is 0.012788437110175058 with parameters: {'iterations': 234, 'depth': 7, 'learning_rate': 0.09096011093256048, 'random_strength': 0.30857135781816153, 'bagging_temperature': 0.30082227409801277, 'l2_leaf_reg': 14.130151318780017}.

### Best model

In [None]:
#params = study.best_params
#params

In [None]:
# params = {'iterations': 234, 'depth': 7, 'learning_rate': 0.09096011093256048, 
#          'random_strength': 0.30857135781816153, 'bagging_temperature': 0.30082227409801277, 
#          'l2_leaf_reg': 14.130151318780017}

In [None]:
# Hyperparameters unpacked into CatBoostClassifier(**params) for the fit on the full data.
params = dict(
    iterations=1000,
    depth=6,
    learning_rate=0.037587603778015836,
    random_strength=0.6401109404028098,
    bagging_temperature=0.10845008746433943,
    l2_leaf_reg=6.247622793136706,
    has_time=True,
)

In [None]:
# optuna 0.18.1

In [None]:
# train_X, test_X, train_y, test_y, train_ycopy, test_ycopy = train_test_split(df_X, df_y, df_ycopy, test_size=0.3)

In [None]:
# from sklearn.model_selection import KFold

In [None]:
'''
kfold = KFold(n_splits=4, random_state=1023)
scores = []
for train, test in kfold.split(df_X):
    train_X = df_X.iloc[train]
    train_y0 = df_y0.iloc[train]
    test_X = df_X.iloc[test]
    test_y0 = df_y0.iloc[test]

    train_pool = Pool(train_X, train_y0, cat_features=categorical_features_indices)
    validate_pool = Pool(test_X, test_y0, cat_features=categorical_features_indices)    


    model = CatBoostClassifier(**params, 
#                          custom_metric=
                           task_type="GPU",
                          )
    model.fit(train_pool, 
#         eval_set=validate_pool, 
     verbose=True, 
     )

    preds = model.predict_proba(test_X)
    CRPS, pred_df_all, test_df = calc_score(preds, df_ycopy.iloc[test], test_X["yardsToTouchdown"].values)
    scores.append(CRPS)
print(scores)
np.mean(scores)
'''

In [None]:
# [0.011983229474717649, 0.011816465456511192, 0.013456503430103522, 0.014002557421325387]
# 0.012814688945664437

In [None]:
# Training pool over the full feature frame and labels.
train_pool = Pool(
    data=df_X,
    label=df_y0,
    cat_features=categorical_features_indices,
)

In [None]:
# Build the classifier from `params` and fit it on `train_pool`, logging progress.
model = CatBoostClassifier(**params)

model.fit(train_pool, verbose=True)

In [None]:
# One row per feature column of df_X; show the 50 largest importances.
importance = pd.DataFrame(
    {"importance": model.feature_importances_},
    index=df_X.columns,
)
importance.sort_values(by="importance", ascending=False).head(50)

# submission data 作成

In [None]:
from kaggle.competitions import nflrush

In [None]:
env = nflrush.make_env()
#iter_test = env.iter_test()
# Column means of the training features, used as fill values for the test rows.
# numeric_only=True restricts the mean to numeric columns; without it, pandas >= 2.0
# raises TypeError when the frame contains non-numeric (categorical/string) columns.
means = df_X.mean(numeric_only=True)
#train_df=df_X.iloc[:0,:]


In [None]:
import warnings
# Silence every warning category from here on (affects all later cells).
warnings.simplefilter('ignore')

In [None]:

# Submission loop: for every play served by the competition environment, build the
# same feature columns as df_X, predict with `model`, and hand back a cumulative
# distribution row via env.predict().
for (test_df, sample_prediction_df) in tqdm(env.iter_test()):
    # Preprocessing helpers are defined elsewhere in the notebook.
    df_a = preprocess(test_df)
    df_a = preprocess_coord(df_a)

    # Keep only the rusher's row. Copy explicitly so the later column assignment
    # on df_ targets an independent frame rather than a slice of df_a.
    df_ = df_a.loc[df_a["NflId"]==df_a["NflIdRusher"], :].copy()

    df_ = get_team_score(df_)
    df_a = player_team_name(df_a)
    df_ = team_name(df_)

    # offense = 1 when the player's team abbreviation equals the possession team.
    df_a.loc[:, "offense"] = (df_a["PlayerTeamAbbr"]==df_a["PossessionTeam"])*1
    df_a["position"] = df_a["Position"].map(position_rename)
    df_["position"] = df_["Position"].map(position_rename)

    # Offense and defense rows get different position-grouping functions.
    df_a.loc[df_a["offense"]==1, "position"] = df_a.loc[df_a["offense"]==1, "position"].apply(group_position_offense)
    df_a.loc[df_a["offense"]==0, "position"] = df_a.loc[df_a["offense"]==0, "position"].apply(group_position_defense)

    df_ = df_.merge(position_stats(df_a), on="PlayId", how="left")

    df_year = df_
    #    df_year = merge_lastYearAv(df_, team_yards_df, defence_yards_df, personal_yards, 2019)
    #    test_X = select_columns(df_year)
    # Re-project onto exactly the training columns (missing ones become NaN).
    test_X = pd.DataFrame(df_year, columns=df_X.columns)
    # fillna returns a new frame; the original code discarded the result, so NaNs
    # were never replaced by the training means.
    test_X = test_X.fillna(means)

    # Fill categorical values that are missing (disabled)
    #    test_X=pd.concat([train_df,test_X],sort=False)
    #    test_X = test_X.fillna(0)

    #    pred520 = clf.predict_proba(test_X)[0]
    pred520 = model.predict_proba(test_X)[0]
    # 85 leading zeros, then the first class probability spread by small_probs,
    # the middle classes unchanged, and the last class spread by large_probs.
    pred100 = np.concatenate([ np.zeros(85), pred520[0]*small_probs["value"].values, pred520[1:-1], pred520[-1]*large_probs["value"].values ])

    sample_prediction_df.iloc[0] = pred100.cumsum()
    # Positional access plus int() so the column label is "Yards<k>" regardless of
    # the frame's index labels or a float dtype in yardsToTouchdown.
    yards_to_td = int(test_X["yardsToTouchdown"].iloc[0])
    # Every column from that label to the end is forced to 1.
    sample_prediction_df.loc[:, "Yards"+str(yards_to_td): ] = 1

    # cumsum can end up very slightly above 1
    sample_prediction_df[sample_prediction_df>1.0] = 1.0

    env.predict(sample_prediction_df)


In [None]:
# test_df["WindSpeed"]

In [None]:
# re.split('[^0-9.]+', "6mph")

In [None]:
# Have the competition environment write out the submission file
# (the following cell lists the .csv files in /kaggle/working).
env.write_submission_file()

In [None]:
import os
# List CSV files in the working directory. Match on the extension rather than a
# substring so names such as "x.csv.bak" are not reported.
print([filename for filename in os.listdir('/kaggle/working') if filename.endswith('.csv')])

In [None]:
# NOTE(review): `owariowari` is not defined in any visible cell, so evaluating it raises
# NameError. Presumably a deliberate stop so "Run All" halts before the scratch cells
# below — confirm, and consider removing the scratch cells instead.
owariowari

# テスト

In [None]:
# Pull a single (test_df, sample_prediction_df) pair for manual inspection.
# NOTE(review): the submission loop above already iterates env.iter_test();
# verify a second call is allowed by the competition API.
test_df, sample_prediction_df = next(env.iter_test())

In [None]:
test_df

In [None]:
# Manual re-run of one submission-loop iteration on the `test_df` fetched above,
# kept in step with the loop body.
# Preprocessing is run here so the cell does not depend on a `df_a` left over
# from the loop's last iteration.
df_a = preprocess(test_df)
df_a = preprocess_coord(df_a)

# Rusher row only; copy so later column writes target an independent frame.
df_ = df_a.loc[df_a["NflId"]==df_a["NflIdRusher"], :].copy()

df_ = get_team_score(df_)
df_a = player_team_name(df_a)
df_ = team_name(df_)

df_a.loc[:, "offense"] = (df_a["PlayerTeamAbbr"]==df_a["PossessionTeam"])*1
df_a["position"] = df_a["Position"].map(position_rename)
df_["position"] = df_["Position"].map(position_rename)

df_a.loc[df_a["offense"]==1, "position"] = df_a.loc[df_a["offense"]==1, "position"].apply(group_position_offense)
df_a.loc[df_a["offense"]==0, "position"] = df_a.loc[df_a["offense"]==0, "position"].apply(group_position_defense)

df_ = df_.merge(position_stats(df_a), on="PlayId", how="left")

# Use the frame built above (the original read `df_play`, which this cell never
# defines and which bypassed the merge result), matching the submission loop.
df_year = df_
#    df_year = merge_lastYearAv(df_, team_yards_df, defence_yards_df, personal_yards, 2019)
#    test_X = select_columns(df_year)
test_X = pd.DataFrame(df_year, columns=df_X.columns)
# fillna returns a new frame; assign it so the NaNs are actually replaced.
test_X = test_X.fillna(means)

# Fill categorical values that are missing (disabled)
#    test_X=pd.concat([train_df,test_X],sort=False)
#    test_X = test_X.fillna(0)

#    pred520 = clf.predict_proba(test_X)[0]
pred520 = model.predict_proba(test_X)[0]
# 85 leading zeros, first class spread by small_probs, middle classes unchanged,
# last class spread by large_probs.
pred100 = np.concatenate([ np.zeros(85), pred520[0]*small_probs["value"].values, pred520[1:-1], pred520[-1]*large_probs["value"].values ])

sample_prediction_df.iloc[0] = pred100.cumsum()
# Positional access plus int() keeps the column label well-formed ("Yards<k>").
yards_to_td = int(test_X["yardsToTouchdown"].iloc[0])
sample_prediction_df.loc[:, "Yards"+str(yards_to_td): ] = 1

# cumsum can end up very slightly above 1
sample_prediction_df[sample_prediction_df>1.0] = 1.0

In [None]:
sample_prediction_df

In [None]:
sample_prediction_df.loc[:, "Yards-17":]

In [None]:
# NOTE(review): rebinds `df_all`, which at the top of the notebook is the copy of the
# training data (`df_input.copy()`); after this cell it refers to the preprocessed test play.
df_all = preprocess(test_df)

In [None]:
df_ = df_all[df_all["NflId"]==df_all["NflIdRusher"]]

In [None]:
#df_all["offence"] = 0
#df_all.loc[df_all["Position"].isin(offence_position), "offence"] = 1

In [None]:
df_ = team_name(df_)
# df_ = count_position(df_)
# team_yards_df, defence_yards_df, personal_yards = average_yards(df_)

In [None]:
# Scratch feature steps applied to the rusher rows in sequence.
# NOTE(review): average_personal_data / add_last_yards are not called by the
# submission loop above; confirm they are still defined before running this cell.
df_ = get_team_score(df_)
df_ = average_personal_data(df_, df_all)
df_ = add_last_yards(df_)

In [None]:
df_year = merge_lastYearAv(df_, team_yards_df, defence_yards_df, personal_yards, 2019)

In [None]:
test_X = select_columns(df_year)

In [None]:
test_X.columns

In [None]:
# Stack test_X under train_df and replace all NaNs with 0.
# NOTE(review): the only visible definition of `train_df` is commented out
# (`#train_df=df_X.iloc[:0,:]`), so this cell likely fails on a fresh kernel — confirm.
test_X=pd.concat([train_df,test_X],sort=False)
test_X = test_X.fillna(0)

In [None]:
# Class probabilities from `clf`, one "Yards<class>" column per class,
# accumulated left-to-right into a cumulative distribution per row.
yard_columns = [f"Yards{i}" for i in clf.classes_]
pred_df = pd.DataFrame(clf.predict_proba(test_X), columns=yard_columns).cumsum(axis=1)