In [1]:
import ctypes as ct
import gc
import json
import math
import os
import random
import sys
import warnings
from io import StringIO
from pathlib import Path

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from pandarallel import pandarallel
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)
from tqdm import tqdm

pandarallel.initialize()
warnings.simplefilter("ignore")



INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Add the directory two levels up to the import path so that the `src` package
# imported below can be found.
# NOTE(review): '../../' is relative to the kernel's working directory —
# presumably this resolves to the repository root; confirm.
sys.path.append('../../')
import src.utils as utils

In [3]:
# Root folder holding the competition data. Set the MLB_DATA_DIR environment
# variable to point somewhere else; when it is unset the original hard-coded
# location is used, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get(
    "MLB_DATA_DIR",
    "/home/knikaido/work/MLB-Player-Digital-Engagement-Forecasting/data/"))
# Folder with the updated competition files (train_updated.csv, example_test.csv, ...).
MAIN_DATA_DIR = DATA_DIR / 'mlb-player-digital-engagement-forecasting-update'
# Output location, relative to the current working directory.
OUTPUT_DIR = Path('./output/')

In [4]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    """Turn one JSON cell of the daily data into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON text to parse, or a missing value (``NaN``/``None``).

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame when the cell is missing.
    """
    if pd.isna(json_str):
        return pd.DataFrame()
    # Wrap the text in a file-like object: handing a literal JSON string to
    # `pd.read_json` is deprecated since pandas 2.1.
    return pd.read_json(StringIO(json_str))


def unpack_data(data, dfs=None, n_jobs=-1):
    """Unpack every JSON column of ``data`` into one concatenated DataFrame each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells are passed one by one to ``unpack_json``.
    dfs : list-like of column labels, optional
        When given, only these columns are unpacked.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``.

    Returns
    -------
    dict
        Maps each column name to the concatenation of the frames unpacked from
        that column's cells (an empty DataFrame if the column has no rows).
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for _, item in column.items())
        # `pd.concat` raises on an empty list, so fall back to an empty frame.
        unnested_dfs[name] = pd.concat(daily_dfs) if daily_dfs else pd.DataFrame()
    return unnested_dfs

In [5]:
def exshow(col,n):
    """Unpack the n-th non-null cell of ``training[col]`` for inspection.

    Reads the module-level ``training`` frame, prints the column index of the
    unpacked frame, and returns that frame.
    """
    non_null_cells = training[col].dropna()
    unpacked = unpack_json(non_null_cells.iloc[n])
    print(unpacked.columns)
    return unpacked

In [10]:
# Read the raw training table from the main data folder and display it.
training = pd.read_csv(MAIN_DATA_DIR.joinpath("train_updated.csv"))
training

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1289,20210713,"[{""engagementMetricsDate"":""2021-07-14"",""player...","[{""gamePk"":634675,""gameType"":""A"",""season"":2021...","[{""playerId"":450203,""gameDate"":""2021-07-13"",""t...","[{""home"":0,""gamePk"":634675,""gameDate"":""2021-07...","[{""home"":0,""teamId"":159,""gamePk"":634675,""gameD...","[{""transactionId"":504703,""playerId"":573062,""pl...","[{""season"":2021,""gameDate"":""2021-07-13"",""divis...","[{""awardId"":""ASMVP"",""awardName"":""All-Star MVP""...",,,
1290,20210714,"[{""engagementMetricsDate"":""2021-07-15"",""player...",,"[{""playerId"":444489,""gameDate"":""2021-07-14"",""t...",,,"[{""transactionId"":504772,""playerId"":605347,""pl...","[{""season"":2021,""gameDate"":""2021-07-14"",""divis...",,,,
1291,20210715,"[{""engagementMetricsDate"":""2021-07-16"",""player...","[{""gamePk"":633291,""gameType"":""R"",""season"":2021...","[{""playerId"":444489,""gameDate"":""2021-07-15"",""t...",,,"[{""transactionId"":504997,""playerId"":520980,""pl...","[{""season"":2021,""gameDate"":""2021-07-15"",""divis...",,,,
1292,20210716,"[{""engagementMetricsDate"":""2021-07-17"",""player...","[{""gamePk"":633283,""gameType"":""R"",""season"":2021...","[{""playerId"":448179,""gameDate"":""2021-07-16"",""t...","[{""home"":0,""gamePk"":633266,""gameDate"":""2021-07...","[{""home"":0,""teamId"":146,""gamePk"":633447,""gameD...","[{""transactionId"":505213,""playerId"":621043,""pl...","[{""season"":2021,""gameDate"":""2021-07-16"",""divis...",,"[{""gamePk"":633346,""gameDate"":""2021-07-16"",""gam...",,


In [11]:
# training['date'] = pd.to_datetime(training['date'], format="%Y%m%d")

In [13]:
# Print the column dtypes and non-null counts of the raw training table.
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   date                     1294 non-null   int64 
 1   nextDayPlayerEngagement  1294 non-null   object
 2   games                    715 non-null    object
 3   rosters                  1293 non-null   object
 4   playerBoxScores          612 non-null    object
 5   teamBoxScores            612 non-null    object
 6   transactions             1180 non-null   object
 7   standings                609 non-null    object
 8   awards                   309 non-null    object
 9   events                   609 non-null    object
 10  playerTwitterFollowers   43 non-null     object
 11  teamTwitterFollowers     43 non-null     object
dtypes: int64(1), object(11)
memory usage: 121.4+ KB


In [14]:
# Python spellings of the JSON literals. The parsing below no longer needs them
# (it uses `json.loads` instead of `eval`), but the names are kept at module
# level in case other cells still `eval` raw JSON text.
null = np.nan
true = True
false = False


def _nulls_to_nan(value):
    """Recursively replace ``None`` (JSON ``null``) with ``np.nan``.

    This reproduces what evaluating the text with ``null = np.nan`` yields, so
    the frames built from the parsed records keep the same missing-value marker.
    """
    if value is None:
        return np.nan
    if isinstance(value, list):
        return [_nulls_to_nan(item) for item in value]
    if isinstance(value, dict):
        return {key: _nulls_to_nan(item) for key, item in value.items()}
    return value


def _parse_json_cell(cell):
    """Parse one raw JSON cell; anything that is not a string is returned as is.

    Passing non-strings through makes the cell safe to re-run after `training`
    has already been converted in place.
    """
    if not isinstance(cell, str):
        return cell
    # `json.loads` is used instead of `eval` so file contents are never executed.
    return _nulls_to_nan(json.loads(cell))


# `to_csv` does not create missing folders, so make sure the target exists.
(MAIN_DATA_DIR / 'train').mkdir(parents=True, exist_ok=True)

# For every JSON column: parse the cells in place, flatten the per-row records
# into one long table tagged with the source row and date, and write it to CSV.
for col in tqdm(training.columns):

    # `date` is a plain column, not a JSON payload.
    if col == 'date':
        continue

    _index = training[col].notnull()
    if not _index.any():
        # Nothing to parse; `pd.concat` below would raise on an empty list.
        continue

    # Replace the raw JSON text by the parsed Python objects (done in parallel).
    training.loc[_index, col] = training.loc[_index, col].parallel_apply(_parse_json_cell)

    frames = []
    for index, date, record in training.loc[_index, ['date', col]].itertuples():
        _df = pd.DataFrame(record)
        # Tag every record with the row label and date of the row it came from;
        # an existing 'date' key inside the records is overwritten.
        _df['index'] = index
        _df['date'] = date
        frames.append(_df)

    # `outputs` stays bound to the last exported table after the loop finishes.
    outputs = pd.concat(frames).reset_index(drop=True)
    outputs.to_csv(MAIN_DATA_DIR / f'train/{col}_train.csv', index=False)
    

100%|██████████| 12/12 [03:43<00:00, 18.63s/it]


In [15]:
# `outputs` is whatever the export loop assigned last, i.e. the table built for
# the final column it processed.
outputs

Unnamed: 0,date,teamId,teamName,accountName,twitterHandle,numberOfFollowers,index
0,20180101,147,New York Yankees,New York Yankees,@Yankees,3130482,0
1,20180101,112,Chicago Cubs,Chicago Cubs,@Cubs,2373710,0
2,20180101,141,Toronto Blue Jays,Toronto Blue Jays,@BlueJays,2196352,0
3,20180101,111,Boston Red Sox,Boston Red Sox,@RedSox,1950737,0
4,20180101,119,Los Angeles Dodgers,Los Angeles Dodgers,@Dodgers,1949542,0
...,...,...,...,...,...,...,...
1285,20210701,115,Colorado Rockies,Colorado Rockies,@Rockies,588362,1277
1286,20210701,136,Seattle Mariners,Seattle Mariners,@Mariners,580268,1277
1287,20210701,133,Oakland Athletics,Oakland Athletics,@Athletics,579369,1277
1288,20210701,135,San Diego Padres,San Diego Padres,@Padres,497594,1277


## テストで取ってこれる一行はこんな感じ (What a single row fetched at test time looks like)

In [None]:
# Read the example submission file from the main data folder and display it.
example_sample_submission = pd.read_csv(MAIN_DATA_DIR.joinpath("example_sample_submission.csv"))
example_sample_submission

In [7]:
# Read the example test file from the main data folder and display it.
example_test = pd.read_csv(MAIN_DATA_DIR.joinpath("example_test.csv"))
example_test

Unnamed: 0,date,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20210426,"[{""gamePk"":634374,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,
1,20210427,"[{""gamePk"":634318,""gameType"":""R"",""season"":2021...","[{""playerId"":443558,""gameDate"":""2021-04-27"",""t...","[{""home"":1,""gamePk"":634320,""gameDate"":""2021-04...","[{""home"":1,""teamId"":117,""gamePk"":634333,""gameD...","[{""transactionId"":480456,""playerId"":642162,""pl...","[{""season"":2021,""gameDate"":""2021-04-27"",""divis...",,"[{""gamePk"":634332,""gameDate"":""2021-04-27"",""gam...",,
2,20210428,"[{""gamePk"":634309,""gameType"":""R"",""season"":2021...","[{""playerId"":429722,""gameDate"":""2021-04-28"",""t...","[{""home"":1,""gamePk"":634310,""gameDate"":""2021-04...","[{""home"":0,""teamId"":111,""gamePk"":634310,""gameD...","[{""transactionId"":480728,""playerId"":545358,""pl...","[{""season"":2021,""gameDate"":""2021-04-28"",""divis...",,"[{""gamePk"":634317,""gameDate"":""2021-04-28"",""gam...",,
3,20210429,"[{""gamePk"":634330,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-29"",""t...","[{""home"":1,""gamePk"":634330,""gameDate"":""2021-04...","[{""home"":0,""teamId"":119,""gamePk"":634346,""gameD...","[{""transactionId"":480993,""playerId"":606965,""pl...","[{""season"":2021,""gameDate"":""2021-04-29"",""divis...",,"[{""gamePk"":634346,""gameDate"":""2021-04-29"",""gam...",,
4,20210430,"[{""gamePk"":634287,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-30"",""t...","[{""home"":1,""gamePk"":634305,""gameDate"":""2021-04...","[{""home"":1,""teamId"":135,""gamePk"":634303,""gameD...",,"[{""season"":2021,""gameDate"":""2021-04-30"",""divis...","[{""awardId"":""NLRRELMON"",""awardName"":""NL Reliev...","[{""gamePk"":634327,""gameDate"":""2021-04-30"",""gam...",,


In [25]:
# Keep only the first row of the example test data, indexed by its date.
test_df = example_test.set_index('date').head(1)

In [27]:
# Show the single-row test frame built from `example_test`.
test_df

Unnamed: 0_level_0,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20210426,"[{'gamePk': 634374, 'gameType': 'R', 'season':...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,


In [33]:
# Select the example-submission rows whose date equals the date of the single
# test row, then index them by that date.
sample_prediction_df = (
    example_sample_submission
    .loc[example_sample_submission['date'].eq(test_df.index[0])]
    .set_index('date')
)

In [34]:
# Show the submission rows selected for the test date.
sample_prediction_df

Unnamed: 0_level_0,date_playerId,target1,target2,target3,target4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20210426,20210427_656669,0,0,0,0
20210426,20210427_543475,0,0,0,0
20210426,20210427_592866,0,0,0,0
20210426,20210427_452678,0,0,0,0
20210426,20210427_570257,0,0,0,0
...,...,...,...,...,...
20210426,20210427_593590,0,0,0,0
20210426,20210427_642180,0,0,0,0
20210426,20210427_663399,0,0,0,0
20210426,20210427_664199,0,0,0,0
