Credit to @columbia2131 - I started with his notebook and then added an external data set with descriptive statistics of the targets for each player.

## About Dataset

Each column of train.csv has been saved as its own CSV file, using the code below.  
train.csvを以下のようにして各カラムをcsvファイルとして保管しています。

To work with a large amount of data, I use the function `reduce_mem_usage` to reduce memory usage.
メモリ使用量を抑えるためにreduce_mem_usageという関数を使っています。

Parameters were tuned with the LightGBM tuner.
パラメータはLight GBM tunerで調整しています。

I want to continue feature engineering, because there are still features that I have not used yet.
特徴量エンジニアリングを続けたい、まだ使っていない特徴量があるため。

In [None]:
%%capture
# This whole cell is disabled: the code below is wrapped in a string literal,
# so nothing in it is executed. If it were run, it would read train.csv and,
# for every column except 'date', evaluate each non-null cell, expand the
# resulting records into one DataFrame (tagged with the source row index and
# date) and write it to '<col>_train.csv' and '<col>_train.pkl'.
# NOTE(review): the disabled code calls eval() on raw CSV cell contents; that
# is only acceptable for trusted input -- confirm before re-enabling.
"""
!pip install pandarallel 

import gc

import numpy as np
import pandas as pd
from pathlib import Path

from pandarallel import pandarallel
pandarallel.initialize()

BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
train = pd.read_csv(BASE_DIR / 'train.csv')

null = np.nan
true = True
false = False

for col in train.columns:

    if col == 'date': continue

    _index = train[col].notnull()
    train.loc[_index, col] = train.loc[_index, col].parallel_apply(lambda x: eval(x))

    outputs = []
    for index, date, record in train.loc[_index, ['date', col]].itertuples():
        _df = pd.DataFrame(record)
        _df['index'] = index
        _df['date'] = date
        outputs.append(_df)

    outputs = pd.concat(outputs).reset_index(drop=True)

    outputs.to_csv(f'{col}_train.csv', index=False)
    outputs.to_pickle(f'{col}_train.pkl')

    del outputs
    del train[col]
    gc.collect()
"""

## Training

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
import lightgbm as lgbm
import mlb
import gc

# Raise the notebook display limits to 200 rows and 100 columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

## Function to reduce memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    touched. Integer columns are cast to the first of int8, int16, int32,
    int64 whose range contains the column's minimum and maximum; float columns
    are cast to the first of float16, float32 that contains them, otherwise
    to float64.

    The range checks are inclusive, so a column that reaches exactly the
    limit of a type (for example -128 or 127 for int8) still uses that type
    instead of being pushed to the next wider one.

    Note that the float check looks at the value range only, so casting to
    float16/float32 can round the stored values.

    Args:
        df: DataFrame to shrink. Its columns are replaced in place.
        verbose: When true, print the final memory usage in Mb and the
            percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column has NaN min/max, fails every comparison and is
            # therefore left with its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all-missing column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame that reports zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Input directories, given relative to the notebook's working directory:
# BASE_DIR is used for players.csv / teams.csv, TRAIN_DIR for the
# '*_train.csv' tables.
BASE_DIR = Path('..', 'input', 'mlb-player-digital-engagement-forecasting')
TRAIN_DIR = Path('..', 'input', 'mlb-pdef-train-dataset')

## Select columns

In [None]:
# Column selections, one list per input table. Each list is passed as
# ``usecols`` when the corresponding CSV is read.
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']

players_cols = ['playerId', 'primaryPositionName']

# Not loaded from the teams table: name, teamName, teamCode, shortName,
# abbreviation, locationName, leagueName, divisionName, venueId, venueName.
teams_cols = ['id', 'leagueId', 'divisionId']

rosters_cols = ['playerId', 'teamId', 'status', 'date']

# The order of this list matters: feature_cols below reuses it (minus 'date').
scores_cols = [
    'playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
    'runsScored', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
    'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats',
    'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference', 'pickoffs',
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching', 'flyOutsPitching',
    'airOutsPitching', 'groundOutsPitching', 'runsPitching', 'doublesPitching',
    'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
    'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
    'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
    'stolenBasesPitching', 'inningsPitched', 'saveOpportunities', 'earnedRuns',
    'battersFaced', 'outsPitching', 'pitchesThrown', 'balls', 'strikes',
    'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching', 'rbiPitching',
    'gamesFinishedPitching', 'inheritedRunners', 'inheritedRunnersScored',
    'catchersInterferencePitching', 'sacBuntsPitching', 'sacFliesPitching',
    'saves', 'holds', 'blownSaves', 'assists', 'putOuts', 'errors', 'chances',
    'date',
]

awards_cols = ['date', 'playerId', 'awardId']

playerTwitterFollowers_cols = ['playerId', 'numberOfFollowers']

teamTwitterFollowers_cols = ['teamId', 'numberOfFollowers']

# Not loaded from the standings table: wildCardRank, divisionChamp,
# divisionLeader, wildCardLeader.
standings_cols = ['teamId', 'wins', 'losses', 'lastTenWins', 'lastTenLosses', 'date']

# Model inputs, in the order they are handed to the model.
feature_cols = [
    'label_playerId',
    'label_primaryPositionName',
    'label_teamId',
    'label_status',
    # every box-score column except the 'date' key, in scores_cols order
    *[name for name in scores_cols if name != 'date'],
    # target1_mean, target1_median, ..., target4_max, target4_prob
    *[
        f'target{number}_{stat}'
        for number in range(1, 5)
        for stat in ('mean', 'median', 'std', 'min', 'max', 'prob')
    ],
    'awardId_count',
    'playernumberOfFollowers',
    'teamnumberOfFollowers',
    'label_leagueId',
    'label_divisionId',
    'wins',
    'losses',
    'lastTenWins',
    'lastTenLosses',
]

## Read data and groupby

In [None]:
# Load every input table with only the selected columns and shrink its
# numeric dtypes right away.
players = reduce_mem_usage(
    pd.read_csv(BASE_DIR / 'players.csv', usecols=players_cols)
)

# 'id' is renamed so the table can be joined on 'teamId'.
teams = pd.read_csv(BASE_DIR / 'teams.csv', usecols=teams_cols).rename(
    columns={'id': 'teamId'}
)
teams = reduce_mem_usage(teams)

rosters = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'rosters_train.csv', usecols=rosters_cols)
)

targets = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'nextDayPlayerEngagement_train.csv', usecols=targets_cols)
)

# Collapse the box scores to one row per (playerId, date) by summing each column.
scores = (
    pd.read_csv(TRAIN_DIR / 'playerBoxScores_train.csv', usecols=scores_cols)
    .groupby(['playerId', 'date'])
    .sum()
    .reset_index()
)
scores = reduce_mem_usage(scores)

# The raw awards table is kept as-is; only a per-player count is derived from it.
awards = pd.read_csv(TRAIN_DIR / 'awards_train.csv', usecols=awards_cols)
awards_count = (
    awards[['playerId', 'awardId']]
    .groupby('playerId')
    .count()
    .reset_index()
    .rename(columns={'awardId': 'awardId_count'})
)
awards_count = reduce_mem_usage(awards_count)

# Follower counts are summed over all rows of each player / team.
# NOTE(review): if a file holds several snapshots per id, the sum grows with
# the number of snapshots -- confirm this is the intended aggregate.
playerTwitterFollowers = (
    pd.read_csv(TRAIN_DIR / 'playerTwitterFollowers_train.csv', usecols=playerTwitterFollowers_cols)
    .groupby('playerId')
    .sum()
    .reset_index()
    .rename(columns={'numberOfFollowers': 'playernumberOfFollowers'})
)
playerTwitterFollowers = reduce_mem_usage(playerTwitterFollowers)

teamTwitterFollowers = (
    pd.read_csv(TRAIN_DIR / 'teamTwitterFollowers_train.csv', usecols=teamTwitterFollowers_cols)
    .groupby('teamId')
    .sum()
    .reset_index()
    .rename(columns={'numberOfFollowers': 'teamnumberOfFollowers'})
)
teamTwitterFollowers = reduce_mem_usage(teamTwitterFollowers)

standings = reduce_mem_usage(
    pd.read_csv(TRAIN_DIR / 'standings_train.csv', usecols=standings_cols)
)

gc.collect()

In [None]:
# Load the external per-player target statistics and list its column names.
player_target_stats = pd.read_csv("../input/player-target-stats/player_target_stats.csv")
data_names = player_target_stats.columns.tolist()
data_names

## Make train data

In [None]:
# Create the dataset: start from the targets and attach each auxiliary table
# in turn, printing the frame's shape after every join.
train = targets[targets_cols].copy()

print(targets[targets_cols].shape)

# (table, join key(s), join type, label printed after the join)
# NOTE(review): player_target_stats is the only inner join, so rows of
# players missing from that table are dropped -- confirm this is intended.
_merge_steps = (
    (players, ['playerId'], 'left', 'after_players'),
    (rosters, ['playerId', 'date'], 'left', 'after_rosters'),
    (scores, ['playerId', 'date'], 'left', 'after_scores'),
    (player_target_stats, 'playerId', 'inner', 'after_player_target_stats'),
    (teams, 'teamId', 'left', 'after_teams'),
    (awards_count, 'playerId', 'left', 'after_awards_count'),
    (playerTwitterFollowers, 'playerId', 'left', 'after_playerTwitter'),
    (teamTwitterFollowers, 'teamId', 'left', 'after_taemTwitter'),
    (standings, ['teamId', 'date'], 'left', 'after_standings'),
)

for _table, _keys, _how, _label in _merge_steps:
    train = train.merge(_table, on=_keys, how=_how)
    if _label == 'after_awards_count':
        # Rows without a match in awards_count get a count of zero.
        train['awardId_count'] = train['awardId_count'].fillna(0)
    else:
        gc.collect()

    print(train.shape, _label)
    print('--------------------------------------')


# Label encoding: each distinct value gets its position in order of first
# appearance. The dictionaries stay available under their own names.
player2num = {value: code for code, value in enumerate(train['playerId'].unique())}
position2num = {value: code for code, value in enumerate(train['primaryPositionName'].unique())}
teamid2num = {value: code for code, value in enumerate(train['teamId'].unique())}
status2num = {value: code for code, value in enumerate(train['status'].unique())}
leagueId2num = {value: code for code, value in enumerate(train['leagueId'].unique())}
divisionId2num = {value: code for code, value in enumerate(train['divisionId'].unique())}

for _raw_col, _encoder in (
    ('playerId', player2num),
    ('primaryPositionName', position2num),
    ('teamId', teamid2num),
    ('status', status2num),
    ('leagueId', leagueId2num),
    ('divisionId', divisionId2num),
):
    train['label_' + _raw_col] = train[_raw_col].map(_encoder)

In [None]:
# Print pandas' summary (DataFrame.info) of the merged training frame.
train.info()

In [None]:
# Print the frame's shape, then show the number of null values per column.
print(train.shape)
train.isnull().sum()

## Divide train and valid data

In [None]:
# Separate the model inputs from the four targets.
train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]

# Rows whose 'date' is below 20210401 form the training split; all other
# rows form the validation split. Both splits get a fresh 0..n-1 index.
# NOTE(review): the comparison assumes 'date' is a YYYYMMDD integer -- confirm.
_index = train['date'].lt(20210401)
x_train, y_train = (
    part.loc[_index].reset_index(drop=True) for part in (train_X, train_y)
)
x_valid, y_valid = (
    part.loc[~_index].reset_index(drop=True) for part in (train_X, train_y)
)

In [None]:
import pickle as pl

# Persist the four splits as pickle files in the working directory.
# Each file is opened in a ``with`` block so its handle is flushed and closed
# as soon as the dump finishes (previously the handles were never closed).
for _filename, _frame in (
    ('x_train.pkl', x_train),
    ('y_train.pkl', y_train),
    ('x_valid.pkl', x_valid),
    ('y_valid.pkl', y_valid),
):
    with open(_filename, 'wb') as f:
        pl.dump(_frame, f)

## Example for tuning

## Predict