In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# データの読み込み

In [None]:
# Read the competition training set into `df`, then list its columns.
df = pd.read_csv(
    '/kaggle/input/nfl-big-data-bowl-2020/train.csv',
    sep=',',
)
df.columns

# 欠損値の確認

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Display the dtype of every column.
df.dtypes

# 前処理

## チームとプレー方向のダミー変数化

In [None]:
# One-hot encode Team and PlayDirection; the originals are replaced by
# indicator columns (e.g. Team_home, PlayDirection_right used later on).
df = pd.get_dummies(data=df, columns=['Team', 'PlayDirection'])

In [None]:
# List the distinct StadiumType labels present in the data.
df['StadiumType'].unique()

In [None]:
# Drop rows whose StadiumType is 'Cloudy' or 'Bowl'
# (presumably because they cannot be classed as indoor/outdoor - confirm).
# Rows with a missing StadiumType are kept, exactly as with the `!=` tests.
# .copy() makes the result an independent frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df['StadiumType'].isin(['Cloudy', 'Bowl'])].copy()

In [None]:
# Re-check the distinct StadiumType labels after the row filter.
df['StadiumType'].unique()

## スタジアムタイプが多いので、屋外と屋内で分ける

In [None]:
# Raw StadiumType spellings (typos included) treated as open-air venues.
outdoor = [
    'Outdoor',
    'Outdoors',
    'Open',
    'Indoor, Open Roof',
    'Outdoor Retr Roof-Open',
    'Oudoor',
    'Ourdoor',
    'Retr. Roof-Open',
    'Outdor',
    'Retr. Roof - Open',
    'Domed, Open',
    'Domed, open',
    'Outside',
    'Heinz Field',
]

# Raw StadiumType spellings treated as enclosed venues.
# NOTE(review): this list is not referenced by any later cell shown here;
# everything that is not in `outdoor` ends up as 0 in the stadiumtype flag.
indoor = [
    'Indoors',
    'Retractable Roof',
    'Indoor',
    'Retr. Roof-Closed',
    'Dome',
    'Domed, closed',
    'Indoor, Roof Closed',
    'Retr. Roof Closed',
    'Closed Dome',
    'Dome, closed',
    'Domed',
]

In [None]:
# Binary venue flag: 1 when StadiumType is one of the `outdoor` labels,
# 0 for every other value.
is_outdoor = df['StadiumType'].isin(outdoor)
df['stadiumtype'] = is_outdoor.mul(1)

In [None]:
# Check which values the new stadiumtype flag takes.
df['stadiumtype'].unique()

In [None]:
# List the distinct GameWeather labels present in the data.
df['GameWeather'].unique()

## 天候も種類が多いので、支障が出そうな気候だけリスト化

In [None]:
# GameWeather labels treated as bad weather (rain and snow descriptions).
rain = [
    'Light Rain',
    'Showers',
    'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
    'Rain',
    'Heavy lake effect snow',
    'Snow',
    'Cloudy, Rain',
    'Rain shower',
    'Rainy',
]

In [None]:
# Binary weather flag: 0 when GameWeather is one of the `rain` labels,
# 1 for every other value. The mask is negated first, then turned into 0/1.
is_bad_weather = df['GameWeather'].isin(rain)
df['weather'] = (~is_bad_weather).mul(1)

## フォーメーションとポジションのダミー変数化

In [None]:
# One-hot encode OffenseFormation and Position into indicator columns
# (OffenseFormation_*, Position_*), replacing the original columns.
df = pd.get_dummies(data=df, columns=['OffenseFormation', 'Position'])

In [None]:
# Inspect the column names after the one-hot encoding steps.
df.columns

## 選手の身長をフィート-インチ表記からインチに変換

In [None]:
def _height_to_inches(height_text):
    """Convert a 'feet-inches' string such as '6-2' to total inches."""
    parts = height_text.split('-')
    return 12 * int(parts[0]) + int(parts[1])


# Overwrites PlayerHeight in place, so this cell can only be run once per
# load: a second run would call .split() on integers and fail.
df['PlayerHeight'] = df['PlayerHeight'].apply(_height_to_inches)

## GameClockを文字列から秒数（数値）に変換

In [None]:
# Parse the whole GameClock column in one vectorised call instead of building
# a pd.Timedelta object per row in a Python loop; the resulting float seconds
# are the same.
# NOTE(review): the strings are read by pandas' timedelta parser as H:M:S.
# If GameClock is actually stored as MM:SS:00, these values are 60x the real
# number of seconds. That is a constant factor, so the later standardisation
# cancels it - confirm the format before using the raw value elsewhere.
df['gameclock'] = pd.to_timedelta(df['GameClock']).dt.total_seconds()

In [None]:
# Collapse the rows of each PlayId into one row holding the column means;
# PlayId becomes the index.
# numeric_only=True is required on pandas >= 2.0, where mean() raises
# TypeError if any non-numeric (object/string) column is still present; on
# older pandas such columns were dropped silently, so the result is the same.
# NOTE(review): per-row values such as X, Y, S, A and NflId are averaged over
# all rows of the play - confirm this is the intended feature definition.
df = df.groupby('PlayId').mean(numeric_only=True)

## 欠損値の削除

In [None]:
# Remove every play (row) that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

## 各要素の相関性の確認

In [None]:
# Pairwise correlation matrix of all columns in the aggregated frame.
df.corr()

In [None]:
# Show the first few DefendersInTheBox values.
df['DefendersInTheBox'].head()

# 説明変数と目的変数に分ける

In [None]:
# Explanatory variables used by the regression, in model-input order.
# NOTE(review): NflId is an identifier that has been averaged per play by the
# earlier groupby - presumably it carries no real signal; confirm whether it
# should stay in the feature set.
feature_columns = [
    'X', 'Y', 'S', 'A', 'Dis', 'Dir', 'NflId', 'YardLine', 'Quarter',
    'gameclock', 'Down', 'Distance', 'HomeScoreBeforePlay',
    'VisitorScoreBeforePlay', 'DefendersInTheBox', 'PlayerHeight',
    'PlayerWeight', 'Temperature', 'Humidity', 'Team_home', 'stadiumtype',
    'weather', 'PlayDirection_right',
    'OffenseFormation_ACE',
    'OffenseFormation_I_FORM', 'OffenseFormation_JUMBO',
    'OffenseFormation_PISTOL', 'OffenseFormation_SHOTGUN',
    'OffenseFormation_SINGLEBACK', 'OffenseFormation_WILDCAT', 'Position_C',
    'Position_CB', 'Position_DB', 'Position_DE', 'Position_DL',
    'Position_DT', 'Position_FB', 'Position_FS', 'Position_G',
    'Position_HB', 'Position_ILB', 'Position_LB', 'Position_MLB',
    'Position_NT', 'Position_OG', 'Position_OLB', 'Position_OT',
    'Position_QB', 'Position_RB', 'Position_S', 'Position_SAF',
    'Position_SS', 'Position_T', 'Position_TE', 'Position_WR',
]

# Select by label so that a missing column raises KeyError right here.
# pd.DataFrame(df, columns=[...]) would instead create that column filled
# with NaN, and the failure would only surface later in scaling/fitting.
features = df[feature_columns]

In [None]:
# Response variable: the per-play Yards value.
target = pd.Series(df.loc[:, 'Yards'])

# 標準化

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column, then print the per-column mean and
# standard deviation of the result as a sanity check.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell, so test-set statistics influence the
# scaling. Fit on the training rows only if a scale-sensitive model is used.
scaler = StandardScaler().fit(features)
X = scaler.transform(features)
for column_stat in (X.mean(axis=0), X.std(axis=0)):
    print(column_stat)

# 訓練データとテストデータを分ける

In [None]:
from sklearn.model_selection import train_test_split
# Hold out the last 20% of plays (no shuffling, so row order is preserved);
# the PlayId index is split alongside so predictions can be labelled later.
(train_X, test_X,
 train_y, test_y,
 train_playid, test_playid) = train_test_split(
    X, target, df.index,
    test_size=0.2,
    shuffle=False,
)


# 重回帰分析

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split and report the score
# (coefficient of determination, R^2) on the held-out split.
lr = LinearRegression().fit(train_X, train_y)
r = lr.score(test_X, test_y)
r

### 評価のカラム作成

In [None]:
# Output column labels 'Yards-99' ... 'Yards99', one per yardage threshold.
yard = [f'Yards{offset}' for offset in range(-99, 100)]

## 重回帰分析モデルでの予測

In [None]:
# Predict yards for the held-out plays and round to whole yards.
pred_y = np.round(lr.predict(test_X))
pred_y

In [None]:
# Step-function matrix of shape (199, n_plays): row k is 1 wherever the
# threshold (-99 + k) is at or above the predicted yards, else 0.
# Built by broadcasting a column of thresholds against the predictions.
score = (np.arange(-99, 100)[:, np.newaxis] >= pred_y) * 1
score

In [None]:
# One row per held-out PlayId, one column per yardage threshold.
prediction = pd.DataFrame(data=score.T, index=test_playid, columns=yard)

In [None]:
# Display the prediction frame.
prediction

In [None]:
# Same step-function encoding for the actual yards: shape (199, n_plays),
# 1 wherever the threshold is at or above the true value.
score_test = (np.arange(-99, 100)[:, np.newaxis] >= np.asarray(test_y)) * 1

In [None]:
# Frame of the true step functions, laid out like `prediction`.
# NOTE(review): this rebinds `test_y` from the 1-D target values to a
# 199-column frame, so the earlier cells that use test_y (model scoring and
# the score_test construction) cannot be re-run without redoing the split.
test_y = pd.DataFrame(data=score_test.T, index=test_playid, columns=yard)
test_y

# 予測値と実測値の差分評価

In [None]:
# Mean squared difference between predicted and true step functions,
# averaged over the 199 thresholds and over all held-out plays.
squared_error = (prediction - test_y) ** 2
C = squared_error.values.sum() / (199 * len(prediction.index))
C

In [None]:
# Write the prediction frame (PlayId index included) to submission.csv
# in the current working directory.
prediction.to_csv('submission.csv')