# 1 ベースライン作成

## 1-1 ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd
import gc
import pickle
import os
import datetime as dt

# plot
import matplotlib.pyplot as plt

# LightGBM
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter("ignore")

# Display floats in DataFrame output with 4 decimal places, right-aligned in a 10-character field.
pd.set_option("display.float_format", "{:10.4f}".format)

## 1-2 train_updated.csvファイルの読み込み

In [2]:
# Read the raw training table; the path is relative to the notebook's working directory.
train = pd.read_csv(
    "../input/mlb-player-digital-engagement-forecasting/train_updated.csv",
)
# Report (rows, columns), then preview the first rows as the cell output.
print(train.shape)
train.head()

(1308, 12)


Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


## 1-3 処理速度を上げるためにデータを絞り込む

In [3]:
# Keep only rows whose `date` value is 20200401 or later (this shrinks the data to speed up
# processing), then renumber the index from 0.
train = train[train["date"].ge(20200401)].reset_index(drop=True)
print(train.shape)

(487, 12)


## 1-4 train_updated.csv専用の変換関数の作成

In [4]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        A JSON document stored as text, or a missing value (NaN / None).

    Returns
    -------
    pandas.DataFrame or float
        The parsed table, or ``np.nan`` when ``json_str`` is missing.
    """
    # Imported locally so the function does not rely on an extra top-level import.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Wrap the text in a file-like object: passing a literal JSON string to
    # pd.read_json is deprecated (pandas >= 2.1), and a bare string can also be
    # mistaken for a file path or URL.
    return pd.read_json(StringIO(json_str))

def extract_data(input_df, col="events", show=False):
    """Flatten a column of JSON-encoded cells into one DataFrame.

    Every cell of ``input_df[col]`` is parsed with ``unpack_json`` and the
    resulting tables are stacked row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame that contains the JSON column.
    col : str, default "events"
        Name of the column whose cells hold JSON text.
    show : bool, default False
        When True, print a progress counter, the result shape, and display
        the first rows (``display`` is expected to be supplied by the
        notebook environment).

    Returns
    -------
    pandas.DataFrame
        The stacked tables; an empty DataFrame when no cell could be parsed.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``input_df``. The previous bare
        ``except`` silently turned this mistake into an empty result.
    """
    column = input_df[col]  # looked up once; a wrong column name fails loudly here
    n_rows = len(input_df)
    frames = []
    for i in range(n_rows):
        if show:
            print("\r{}/{}".format(i + 1, n_rows), end="")
        try:
            parsed = unpack_json(column.iloc[i])
        except (ValueError, TypeError):
            # Best effort, as before: a cell that is not parseable JSON text is skipped.
            continue
        # Missing cells come back as NaN rather than a table; skip them explicitly
        # instead of relying on pd.concat raising.
        if isinstance(parsed, pd.DataFrame):
            frames.append(parsed)

    # Concatenate once at the end instead of growing a frame inside the loop,
    # which re-copied all accumulated rows on every iteration.
    if frames:
        output_df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    if show:
        print("")
        print(output_df.shape)
        display(output_df.head())
    return output_df

## 1-5 train_updated.csvから「nextDayPlayerEngagement」を取り出して表形式に変換

In [5]:
# Flatten the "nextDayPlayerEngagement" JSON cells of `train` into a single table,
# printing progress, the resulting shape, and a preview.
df_engagement = extract_data(input_df=train, col="nextDayPlayerEngagement", show=True)

487/487
(1003707, 6)


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967


## 1-6 結合キーであるdate_playerIdの作成

In [6]:
# Build the join key "<date without dashes>_<playerId>", e.g. "20200402_425794".
date_part = df_engagement["engagementMetricsDate"].str.replace("-", "")
player_part = df_engagement["playerId"].astype(str)
df_engagement["date_playerId"] = date_part + "_" + player_part
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193


## 1-7 日付から簡単な特徴量を作成

In [7]:
# Prediction date column: the prediction is made on the day before the engagement (target) date.
target_day = pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d")
df_engagement["date"] = target_day - dt.timedelta(days=1)

# Derive "day of week" and "year-month" features from the prediction date.
df_engagement["dayofweek"] = df_engagement["date"].dt.dayofweek
# The first 7 characters of the date's string form are "YYYY-MM".
df_engagement["yearmonth"] = df_engagement["date"].astype(str).str[:7]
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,dayofweek,yearmonth
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794,2020-04-01,2,2020-04
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704,2020-04-01,2,2020-04
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702,2020-04-01,2,2020-04
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231,2020-04-01,2,2020-04
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193,2020-04-01,2,2020-04


## 1-8 players.csvファイルの読み込み

In [8]:
# Read the player master table; the path is relative to the notebook's working directory.
df_players = pd.read_csv(
    "../input/mlb-player-digital-engagement-forecasting/players.csv",
)
# Print the table shape and the number of distinct playerId values.
print(df_players.shape)
print(df_players["playerId"].nunique())
df_players.head()

(2061, 12)
2061


Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True


## 1-9 評価対象の人数確認

In [9]:
# Recode the flag as an integer: 1 where the value equals True, 0 otherwise.
is_eval_player = df_players["playerForTestSetAndFuturePreds"] == True
df_players["playerForTestSetAndFuturePreds"] = is_eval_player.astype(int)
# Count and share of players with the flag set.
print(df_players["playerForTestSetAndFuturePreds"].sum())
print(df_players["playerForTestSetAndFuturePreds"].mean())

1187
0.5759340126152354


## 1-10 テーブル結合

In [10]:
# Join on playerId; a left join keeps every engagement row and attaches the player attributes.
df_train = df_engagement.merge(df_players, how="left", on="playerId")
print(df_train.shape)

(1003707, 21)


## 1-11 学習用データセットの作成

In [11]:
# Feature columns: player id, day of week, and the player attributes from players.csv.
feature_cols = [
    "playerId", "dayofweek",
    "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
    "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds",
]
target_cols = ["target1", "target2", "target3", "target4"]
# Identifying information (dates, playerId, etc.) kept alongside the features so each
# prediction can be traced back to a player and a day.
id_cols = [
    "engagementMetricsDate", "playerId", "date_playerId",
    "date", "yearmonth", "playerForTestSetAndFuturePreds",
]

x_train = df_train[feature_cols]
y_train = df_train[target_cols]
id_train = df_train[id_cols]
print(x_train.shape, y_train.shape, id_train.shape)
x_train.head()

(1003707, 10) (1003707, 4) (1003707, 6)


Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,425794,2,Brunswick,GA,USA,79,230,1,Pitcher,1
1,571704,2,Albuquerque,NM,USA,75,210,1,Pitcher,0
2,506702,2,Maracaibo,,Venezuela,70,235,2,Catcher,1
3,607231,2,Savannah,GA,USA,76,200,1,Pitcher,1
4,543193,2,Columbia,CA,USA,76,215,1,Pitcher,0


## 1-12 カテゴリ変数をcategory型に変換

In [12]:
# Cast the categorical feature columns to pandas' "category" dtype.
categorical_cols = [
    "playerId", "dayofweek", "birthCity", "birthStateProvince",
    "birthCountry", "primaryPositionCode", "primaryPositionName",
]
# Build a new frame with a single astype call instead of assigning column by column:
# x_train is a column selection of df_train, and item assignment on such a selection
# triggers pandas' SettingWithCopyWarning (only hidden here by the global warning filter).
x_train = x_train.astype({col: "category" for col in categorical_cols})

## カテゴリ変数をcategory型に変換