## Modeling
### References
 * <https://www.kaggle.com/ryanholbrook/getting-started-with-mlb-player-digital-engagement>
 * <https://www.kaggle.com/ranjeetshrivastav/mib-eda-xgboost>

### 목표: 앞서 만든 train 데이터셋(player_engagement_with_info)을 활용하여 target1~target4 값을 예측해본다.

In [1]:
# import libraries

from pathlib import Path
import os.path
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

import datetime

import pickle

In [2]:
# Load the training frame that an earlier step stored as a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.

with Path('player_engagement_with_info.pkl').open(mode='rb') as f:
    train = pickle.Unpickler(f).load()

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,dailyDataDate,engagementMetricsDate,playerId,target1,target2,target3,target4,targetAvg,date,year,...,winsTeam,lossesTeam,runsScoredTeam,runsAllowedTeam,divisionRankTeam,leagueRankTeam,wildCardRankTeam,winPctTeam,winStreakTeam,lossStreakTeam
0,2018-01-01,2018-01-02,628317,0.011167,4.474708,0.005168,5.735294,2.556584,2018-01-01,2018,...,,,,,,,,,,
1,2018-01-01,2018-01-02,547989,0.042993,5.593385,0.045033,2.794118,2.118882,2018-01-01,2018,...,,,,,,,,,,
2,2018-01-01,2018-01-02,519317,0.974327,56.177043,13.693746,64.166667,33.752945,2018-01-01,2018,...,,,,,,,,,,
3,2018-01-01,2018-01-02,607625,0.0067,2.675097,0.005168,1.862745,1.137428,2018-01-01,2018,...,,,,,,,,,,
4,2018-01-01,2018-01-02,592547,0.001117,0.632296,0.002953,0.931373,0.391934,2018-01-01,2018,...,,,,,,,,,,


In [4]:
# Print a structural summary of train: index range, column names and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2506176 entries, 0 to 2506175
Data columns (total 72 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   dailyDataDate          datetime64[ns]
 1   engagementMetricsDate  object        
 2   playerId               int64         
 3   target1                float64       
 4   target2                float64       
 5   target3                float64       
 6   target4                float64       
 7   targetAvg              float64       
 8   date                   datetime64[ns]
 9   year                   int64         
 10  month                  int64         
 11  inSeason               bool          
 12  seasonPart             object        
 13  playerName             object        
 14  DOB                    object        
 15  mlbDebutDate           object        
 16  birthCity              object        
 17  birthStateProvince     object        
 18  birthCountry          

In [5]:
# List every column name of train, one per line.
# Iterating a DataFrame directly yields its column labels.

for column_name in train:
    print(column_name)

dailyDataDate
engagementMetricsDate
playerId
target1
target2
target3
target4
targetAvg
date
year
month
inSeason
seasonPart
playerName
DOB
mlbDebutDate
birthCity
birthStateProvince
birthCountry
primaryPositionName
rosterStatusCode
rosterStatus
rosterTeamId
rosterTeamName
numGames
numTeams
gameTeamId
runsScored
homeRuns
strikeOuts
baseOnBalls
hits
hitByPitch
atBats
caughtStealing
stolenBases
groundIntoDoublePlay
groundIntoTriplePlay
plateAppearances
totalBases
rbi
leftOnBase
sacBunts
sacFlies
gamesStartedPitching
runsPitching
homeRunsPitching
strikeOutsPitching
baseOnBallsPitching
hitsPitching
inningsPitchedAsFrac
earnedRuns
battersFaced
saves
blownSaves
pitchingGameScore
noHitter
gameTeamName
gameType
oppId
oppName
numGamesTeam
winsTeam
lossesTeam
runsScoredTeam
runsAllowedTeam
divisionRankTeam
leagueRankTeam
wildCardRankTeam
winPctTeam
winStreakTeam
lossStreakTeam


In [6]:
# Columns of train that are fed to the model as features.
# The grouping below only follows the column names; the order is what matters downstream.

feature_cols = (
    # encoded identifiers (added by the label-encoding cell) plus the raw position name
    ['label_playerId', 'label_primaryPositionName', 'primaryPositionName']
    # per-player hitting columns
    + [
        'runsScored', 'homeRuns', 'strikeOuts', 'baseOnBalls', 'hits',
        'hitByPitch', 'atBats', 'caughtStealing', 'stolenBases',
        'groundIntoDoublePlay', 'groundIntoTriplePlay', 'plateAppearances',
        'totalBases', 'rbi', 'leftOnBase', 'sacBunts', 'sacFlies',
    ]
    # per-player pitching columns
    + [
        'gamesStartedPitching', 'runsPitching', 'homeRunsPitching',
        'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
        'inningsPitchedAsFrac', 'earnedRuns', 'battersFaced', 'saves',
        'blownSaves', 'pitchingGameScore', 'noHitter',
    ]
    # game / opponent columns
    + ['gameTeamName', 'gameType', 'oppId', 'oppName']
    # team-level columns
    + [
        'numGamesTeam', 'winsTeam', 'lossesTeam', 'runsScoredTeam',
        'runsAllowedTeam', 'divisionRankTeam', 'leagueRankTeam',
        'wildCardRankTeam', 'winPctTeam', 'winStreakTeam', 'lossStreakTeam',
    ]
)

In [7]:
# Label encoding: every distinct playerId / primaryPositionName gets an integer code,
# numbered in order of first appearance in train.

player2num, position2num = (
    {value: code for code, value in enumerate(train[column].unique())}
    for column in ('playerId', 'primaryPositionName')
)

# Store the codes next to the raw columns (train is modified in place).
train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)

In [8]:
# Feature matrix and the four engagement targets.
X = train[feature_cols]
# LightGBM does not accept object-dtype (string) columns in a DataFrame, and feature_cols
# contains raw text columns such as primaryPositionName. Cast them to pandas 'category',
# which LightGBM treats as categorical features. Doing this before the train/valid split
# keeps the category sets identical in both parts. No-op when there are no object columns.
X = X.astype({col: 'category' for col in X.select_dtypes(include='object').columns})
y = train[['target1', 'target2', 'target3', 'target4']]

In [9]:
# Split into train / validation by date: rows dated before 2021-04-01 are used for
# training, everything from that day on for validation.

_index = train['dailyDataDate'].lt(datetime.datetime(2021, 4, 1))
x_train, y_train = (part.loc[_index].reset_index(drop=True) for part in (X, y))
x_valid, y_valid = (part.loc[~_index].reset_index(drop=True) for part in (X, y))

In [11]:
# lightGBM 활용

from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None):
    """Train a LightGBM regressor and score it on the validation split.

    Args:
        x_train: Training features, passed unchanged to ``LGBMRegressor.fit``.
        y_train: Training target (presumably a single target column - confirm
            against the caller).
        x_valid: Validation features; used both as the early-stopping
            ``eval_set`` and as the input for the returned predictions.
        y_valid: Validation target matching ``x_valid``.
        params: Keyword arguments for ``LGBMRegressor``. ``None`` (the default)
            means the library defaults are used.

    Returns:
        A tuple ``(oof_pred, model, score)``: the predictions for ``x_valid``,
        the fitted model, and the mean absolute error of those predictions
        against ``y_valid``.
    """
    # `params` defaults to None, which cannot be unpacked with ** directly.
    model = LGBMRegressor(**(params or {}))
    # NOTE(review): `early_stopping_rounds` and `verbose` were removed from fit()
    # in LightGBM 4.0; on newer versions pass
    # callbacks=[lightgbm.early_stopping(100), lightgbm.log_evaluation(400)] instead.
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)],
              early_stopping_rounds=100,
              verbose=400)
    oof_pred = model.predict(x_valid)
    # scikit-learn's documented argument order is (y_true, y_pred).
    score = mean_absolute_error(y_valid, oof_pred)
    print('mae: ', score)
    return oof_pred, model, score