# モジュール導入

In [None]:
from kaggle.competitions import nflrush
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

# Create the competition environment object; it is used below to iterate over
# the test set, submit predictions and write the submission file.
env = nflrush.make_env()

 # データ読み込み

In [None]:
# Load the training CSV. low_memory=False makes pandas read the file in one
# pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2020/train.csv', low_memory=False)
# Display the available column names.
df.columns

In [None]:
# Iterator over the test set; the prediction loop below consumes it and
# unpacks each item as (test_df, sample_prediction_df).
iter_test = env.iter_test()

# 前処理

## ダミー変数化

In [None]:
# Display the distinct values of the 'Position' column.
df['Position'].unique()

In [None]:
# Position codes grouped by side of the ball.
# NOTE(review): neither list is referenced anywhere else in the visible code,
# and 'diffense_position' looks like a typo for 'defense_position' -- confirm
# that no other cell uses the name before renaming it.
offense_position=['QB','RB','FB','HB','WR','TE','C','G','T']
diffense_position=['DL','DT','NT','LB','ILB','MLB','OLB','DB','CB','S','SS','SAF']
# For every (PlayId, Position) pair, count the non-null entries of each column.
new_df=df.groupby(['PlayId','Position']).count()

In [None]:
# Pivot the per-(PlayId, Position) 'GameId' counts into one row per PlayId and
# one column per Position; positions absent from a play become 0.
position_count=new_df['GameId'].unstack().fillna(0).astype(int)
position_count

In [None]:
# Correlation of every per-position count with 'Yards', strongest positive first.
pd.merge(position_count,df[['PlayId','Yards']],on='PlayId').corr()['Yards'].sort_values(ascending=False)

In [None]:
def process(df):
    """Return a cleaned copy of the raw tracking frame with engineered columns.

    Steps, in order:
      * one-hot encode 'Team', 'PlayDirection' and 'OffenseFormation';
      * drop rows whose 'StadiumType' is exactly 'Cloudy' or 'Bowl';
      * drop the 'FieldPosition' column;
      * add 'stadiumtype' (1 if 'StadiumType' is a known outdoor label, else 0);
      * add 'weather' (0 if 'GameWeather' is a known rain/snow label, else 1);
      * convert 'PlayerHeight' from a 'feet-inches' string to total inches;
      * add 'gameclock', the 'GameClock' string parsed as a timedelta and
        expressed in seconds.

    The input frame is not modified; a new DataFrame is returned.
    """
    # Free-text labels (including misspellings present in the data) that are
    # treated as an open-air stadium.
    outdoor_labels = [
        'Outdoor', 'Outdoors', 'Open', 'Indoor, Open Roof',
        'Outdoor Retr Roof-Open', 'Oudoor', 'Ourdoor', 'Retr. Roof-Open',
        'Outdor', 'Retr. Roof - Open', 'Domed, Open', 'Domed, open',
        'Outside', 'Heinz Field',
    ]
    # Labels regarded as indoor. Kept for reference only: the encoding below
    # maps everything that is not an outdoor label to 0.
    indoor_labels = [
        'Indoors', 'RetractableRoof', 'Indoor', 'Retr. Roof-Closed', 'Dome',
        'Domed, closed', 'Indoor, Roof Closed', 'Retr. Roof Closed',
        'Closed Dome', 'Dome, closed', 'Domed',
    ]
    # Weather descriptions counted as precipitation.
    wet_labels = [
        'Light Rain', 'Showers',
        'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
        'Rain', 'Heavy lake effect snow', 'Snow', 'Cloudy, Rain',
        'Rain shower', 'Rainy',
    ]

    def _height_in_inches(height):
        # 'F-I' (feet-inches) -> total inches.
        parts = height.split('-')
        return 12 * int(parts[0]) + int(parts[1])

    frame = pd.get_dummies(df, columns=['Team', 'PlayDirection', 'OffenseFormation'])

    # Missing stadium types are kept; only the two listed labels are removed.
    unusable_stadium = frame['StadiumType'].isin(['Cloudy', 'Bowl'])
    frame = frame[~unusable_stadium]
    frame = frame.drop('FieldPosition', axis=1)

    frame['stadiumtype'] = frame['StadiumType'].isin(outdoor_labels).astype(int)
    is_wet = frame['GameWeather'].isin(wet_labels)
    frame['weather'] = (~is_wet).astype(int)

    frame['PlayerHeight'] = frame['PlayerHeight'].apply(_height_in_inches)
    frame['gameclock'] = pd.to_timedelta(frame['GameClock']).dt.total_seconds()
    return frame

In [None]:
# Replace the raw training frame with its cleaned / feature-engineered version.
df=process(df)

In [None]:
# Discard every training row that has a missing value in any column.
df=df.dropna()

In [None]:
# Attach the per-play position counts to every training row (inner join on PlayId).
df_position=pd.merge(df,position_count, on='PlayId')

In [None]:
# The merge suffixes clashing column names with _x (left) / _y (right).
# Presumably 'S' exists both as a column of df and as a position-count column;
# keep the left one as 'S' and call the count 'S_position'.
df_position=df_position.rename(columns={'S_x':'S','S_y':'S_position'})

In [None]:
# Display the column names of the merged frame.
df_position.columns

In [None]:
# Total number of missing cells in the merged frame (sanity check).
df_position.isnull().sum().sum()

In [None]:
# The 20 columns with the highest correlation to 'Yards'.
# NOTE(review): DataFrame.corr() rejects non-numeric columns on recent pandas
# unless numeric_only=True is passed -- confirm the pandas version in use.
df_position.corr()['Yards'].sort_values(ascending=False).head(20)

In [None]:
def feature(df):
    """Select the model's input columns from *df*, in a fixed order.

    The result always has exactly the columns listed below; a column that is
    absent from *df* appears in the result filled with NaN, and columns of
    *df* that are not listed are left out.
    """
    tracking = ['X', 'Y', 'S', 'A', 'Dis', 'Dir']
    game_state = [
        'YardLine', 'Quarter', 'gameclock', 'Down', 'Distance',
        'HomeScoreBeforePlay', 'VisitorScoreBeforePlay', 'DefendersInTheBox',
    ]
    player = ['PlayerHeight', 'PlayerWeight']
    conditions = ['Temperature', 'Humidity']
    encoded_flags = ['Team_home', 'stadiumtype', 'weather', 'PlayDirection_right']
    formations = [
        'OffenseFormation_ACE',
        'OffenseFormation_I_FORM',
        'OffenseFormation_JUMBO',
        'OffenseFormation_PISTOL',
        'OffenseFormation_SHOTGUN',
        'OffenseFormation_SINGLEBACK',
        'OffenseFormation_WILDCAT',
    ]
    # Per-play head counts, one column per position code.
    position_counts = [
        'C', 'CB', 'DB', 'DE', 'DL', 'DT', 'FB', 'FS', 'G', 'HB', 'ILB', 'LB',
        'MLB', 'NT', 'OG', 'OLB', 'OT', 'QB', 'RB', 'S_position', 'SAF', 'SS',
        'T', 'TE', 'WR',
    ]
    selected = (
        tracking + game_state + player + conditions
        + encoded_flags + formations + position_counts
    )
    return pd.DataFrame(df, columns=selected)
    

                      

In [None]:
# Training feature matrix: the fixed column selection applied to the merged frame.
features=feature(df_position)

In [None]:
# Per-column mean of the training features; normalize() reads this global.
train_mean=features.mean(axis=0)
train_mean

In [None]:
# Per-column standard deviation of the training features; normalize() reads
# this global.
train_std=features.std(axis=0)
train_std

## 正規化

In [None]:
def normalize(features, mean=None, std=None):
    """Standardise *features* column-wise as ``(features - mean) / std``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix to scale.
    mean, std : pandas.Series, optional
        Per-column statistics to scale with. When omitted, the module-level
        ``train_mean`` / ``train_std`` computed from the training features are
        used, so calling ``normalize(features)`` keeps its previous meaning.

    Returns
    -------
    pandas.DataFrame
        The scaled features, same shape and labels as the input.

    Notes
    -----
    A column whose standard deviation is exactly 0 is constant, so dividing by
    it would yield NaN (0/0) or +/-inf, which the regression model rejects.
    Such columns are divided by 1 instead, so they come out as
    ``features - mean``.
    """
    if mean is None:
        mean = train_mean
    if std is None:
        std = train_std
    # Guard against division by zero for constant columns.
    safe_std = std.replace(0, 1)
    return (features - mean) / safe_std

## 訓練

In [None]:
def train_predict(X,target):
    """Fit a fresh LinearRegression on (X, target) and return the fitted model."""
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(X, target)
    

In [None]:
# Standardised training features.
X=normalize(features)

In [None]:
# Regression target: the 'Yards' column of the merged training frame.
target=pd.Series(df_position['Yards'])
# Fit the linear model on the standardised features.
lr=train_predict(X,target)

In [None]:
# Score of the fitted model on its own training data (for a scikit-learn
# regressor this is the R^2 coefficient), so it is an in-sample figure.
r=lr.score(X,target)
r

In [None]:
# Zero-row frame carrying the training column layout; the prediction loop
# concatenates each test batch onto it so the batch gains every training column.
train_df=df_position.iloc[:0,:]

# 予測

In [None]:
# Predict each test batch in turn and submit it to the competition environment.
for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Position counts must be taken from the raw batch, before process()
    # filters rows and rewrites columns.
    new_df=test_df.groupby(['PlayId','Position']).count()
    position_count=new_df['GameId'].unstack().fillna(0).astype(int)
    # NOTE(review): process() drops rows whose StadiumType is 'Cloudy' or
    # 'Bowl'; a batch made up only of such rows would be empty from here on
    # -- confirm this cannot occur in the test data.
    test_df=process(test_df)
    test_df=pd.merge(test_df,position_count, on='PlayId')
    test_df=test_df.rename(columns={'S_x':'S','S_y':'S_position'})
    # Concatenating onto the zero-row training template adds every training
    # column that this batch lacks (e.g. one-hot or position columns).
    test_df=pd.concat([train_df, test_df],sort=False)
    # Those added columns, and any other missing values, become 0.
    test_df=test_df.fillna(0)
    test_feature=feature(test_df)
    test_X=normalize(test_feature)
    # One prediction per row of the batch, rounded to whole yards.
    pred_y=lr.predict(test_X)
    pred_y=np.round(pred_y)
    # For each integer i in -99..99, the fraction of rounded predictions that
    # are <= i: 199 non-decreasing values in [0, 1].
    score=np.array([(i >= pred_y).mean()*1 for i in range(-99,100)])
    # Write those values into the first row of the submission template
    # (.T has no effect on a 1-D array).
    sample_prediction_df.iloc[0,:]=score.T
    env.predict(sample_prediction_df)

In [None]:
# Display the prediction frame left over from the last loop iteration.
sample_prediction_df

In [None]:
# Ask the competition environment to write the submission file.
env.write_submission_file()

In [None]:
import os
# List the CSV files in the working directory to confirm the submission was
# written. Match on the file extension: a substring test would also report
# names such as 'x.csv.bak'.
print([filename for filename in os.listdir('/kaggle/working') if filename.endswith('.csv')])