In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, file_name) for file_name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:


import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import sys
import warnings
from joblib import Parallel, delayed 

from pathlib import Path
from sklearn.model_selection import train_test_split

from statsmodels.tsa.deterministic import (CalendarFourier,    
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.layers.experimental.preprocessing import StringLookup
from datetime import timedelta

warnings.simplefilter("ignore")

# Set Matplotlib defaults.
# The bundled seaborn styles were renamed with a "seaborn-v0_8-" prefix in
# Matplotlib 3.6 and the old names were later removed, so use whichever
# spelling this installation provides. If neither exists the default style
# is kept rather than failing on a purely cosmetic setting.
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
features = [
    "hits",
    "strikeOuts",
    "homeRuns",
    "runsScored",
    "stolenBases",
    "strikeOutsPitching",
    "inningsPitched",
    "strikes",
    "flyOuts",
    "groundOuts",
    "errors",
]

In [None]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        A JSON document, or NaN/None when the cell is empty.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame for a missing cell.
    """
    # Local import so the function stays self-contained (it is also shipped
    # to joblib workers by `unpack_data`).
    from io import StringIO

    if pd.isna(json_str):
        return pd.DataFrame()
    # Passing a literal JSON string to read_json is deprecated; a file-like
    # wrapper is accepted by every pandas version.
    return pd.read_json(StringIO(json_str))


def unpack_data(data, dfs=None, n_jobs=-1):
    """Expand every JSON-encoded column of `data` into its own DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose cells hold JSON strings (or missing values).
    dfs : list of str, optional
        Column names to unpack; all columns are unpacked when omitted.
    n_jobs : int, default -1
        Passed to `joblib.Parallel`.

    Returns
    -------
    dict
        Maps each column name to the concatenation of that column's parsed
        cells.
    """
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for name, column in data.items():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for _, item in column.items())
        unnested_dfs[name] = pd.concat(daily_dfs)
    return unnested_dfs

In [None]:
data_dir = Path('../input/mlb-player-digital-engagement-forecasting/')

df_names = ['seasons', 'teams', 'players', 'awards']

# Read each reference table once and keep them in a dict so they can be
# looked up by name without eval().
reference_tables = {
    name: pd.read_csv(data_dir / f"{name}.csv") for name in df_names
}
# Later cells refer to the tables by bare name (e.g. `players`), so they are
# still published as module-level variables.
globals().update(reference_tables)

# A Tab container is needed to hold one output area per table.
kaggle_data_tabs = widgets.Tab()

# One Output widget per table; displaying this renders the tabbed view.
kaggle_data_tabs.children = [widgets.Output() for _ in df_names]

for index, name in enumerate(df_names):
    # Use the table name as the tab title.
    kaggle_data_tabs.set_title(index, name)

    # Display corresponding table output for this tab name
    with kaggle_data_tabs.children[index]:
        display(reference_tables[name])

display(kaggle_data_tabs)

In [None]:
pd.read_csv(data_dir / f"awards.csv")['awardName'].value_counts()

In [None]:
import pandas as pd
oh = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')
oh[oh['playerName']=='Shohei Ohtani']

In [None]:
%%time
# Nested (JSON-encoded) columns of train.csv that this notebook uses.
dfs = [
    'nextDayPlayerEngagement',
    'playerBoxScores',
    'standings',
    'playerTwitterFollowers',
    'teamTwitterFollowers',
]

training = pd.read_csv(
    data_dir / 'train.csv',
    usecols=['date'] + dfs,
)

# Index the rows by a daily Period parsed from the YYYYMMDD date column.
training['date'] = pd.to_datetime(training['date'], format="%Y%m%d")
training = training.set_index('date').to_period('D')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
training.info()

In [None]:
# Preview only: read just the first five rows instead of parsing the whole file.
pd.read_csv(data_dir / 'train.csv', nrows=5)

In [None]:
training[:7]

In [None]:
# `training` is indexed by Period, so select the first row by position
# explicitly; integer keys falling back to positions is deprecated.
len(training['nextDayPlayerEngagement'].iloc[0])

In [None]:
# First 1500 characters of the first day's raw JSON (positional row access).
training['nextDayPlayerEngagement'].iloc[0][:1500]

In [None]:
training_dfs = unpack_data(training, dfs=dfs)
print('\n', training_dfs.keys())

In [None]:
training_dfs

In [None]:
def train_lag(df, lag=1, target_cols=None):
    """Append lagged copies of the target columns to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "playerId", "date" and the target columns. "date" must
        support adding a `timedelta`.
    lag : int, default 1
        Number of days to shift the targets forward.
    target_cols : list of str, optional
        Columns to lag. Defaults to the notebook-level `targets` list; the
        previous hard-coded name `TGTCOLS` is not defined in this notebook.

    Returns
    -------
    pandas.DataFrame
        `df` left-joined with its own targets from `lag` days earlier; the
        added columns carry the suffix ``_{lag}``.
    """
    if target_cols is None:
        target_cols = targets
    lagged = df[["playerId", "date"] + list(target_cols)].copy()
    # Moving the dates forward makes a row dated d line up with day d + lag.
    lagged["date"] = lagged["date"] + timedelta(days=lag)
    return df.merge(
        lagged,
        on=["playerId", "date"],
        suffixes=("", f"_{lag}"),
        how="left",
    )

In [None]:
pids_test = players.playerId.loc[
    players.playerForTestSetAndFuturePreds.fillna(False)
].astype(str)

# Name of target columns
targets = ["target1", "target2", "target3", "target4"]


def make_playerBoxScores(dfs: dict, features):
    """Build the per-player, per-day box-score feature table.

    Takes `dfs['playerBoxScores']`, keeps the game date, player id and the
    requested feature columns, casts features to float32 and player ids to
    str, converts the game date to a daily Period named `date`, and sums
    rows sharing the same (date, playerId).
    """
    wanted = ['gameDate', 'playerId'] + features
    # One dtype map: every feature as float32, the player id as str.
    dtype_map = {column: np.float32 for column in features}
    dtype_map['playerId'] = str
    box = (
        dfs['playerBoxScores'][wanted]
        .astype(dtype_map)
        .rename(columns={'gameDate': 'date'})
    )
    box['date'] = pd.PeriodIndex(box['date'], freq='D')
    # Multiple games on one day collapse into a single summed row.
    return box.groupby(['date', 'playerId'], as_index=False).sum()


def make_targets(training_dfs: dict):
    """Build the target table from `training_dfs['nextDayPlayerEngagement']`.

    Target columns (the notebook-level `targets` list) become float32 and
    player ids become str. `engagementMetricsDate` is renamed to `date`,
    converted to a daily Period and moved back one day, so each target row
    carries the date of the features it is paired with.
    """
    dtype_map = {column: np.float32 for column in targets}
    dtype_map['playerId'] = str
    engagement = (
        training_dfs['nextDayPlayerEngagement']
        .astype(dtype_map)
        .rename(columns={'engagementMetricsDate': 'date'})
    )
    engagement['date'] = pd.to_datetime(engagement['date'])
    engagement = engagement.set_index('date').to_period('D')
    # Step the index back one period to match the feature dates.
    engagement.index = engagement.index - 1
    return engagement.reset_index()


def join_datasets(dfs):
    """Align several long (date, playerId) tables into one long table.

    Each frame is pivoted to wide form (rows = date, columns = value column
    by playerId), the wide frames are concatenated side by side, and the
    playerId level is stacked back into rows. The result is indexed by
    `date` with `playerId` as an ordinary column.
    """
    wide_frames = []
    for frame in dfs:
        wide_frames.append(frame.pivot(index='date', columns='playerId'))
    combined = pd.concat(wide_frames, axis=1)
    stacked = combined.stack()
    return stacked.reset_index('playerId')


def make_training_data(training_dfs: dict,
                       features,
                       targets,
                       fourier=4,
                       test_size=30):
    """Assemble wide train/validation matrices for the model.

    Parameters
    ----------
    training_dfs : dict
        Unpacked training tables; must hold 'playerBoxScores' and
        'nextDayPlayerEngagement'.
    features : list of str
        Box-score columns used as inputs.
    targets : list of str
        Target columns selected for the output matrix.
    fourier : int, default 4
        Order of the annual `CalendarFourier` terms.
    test_size : int or float, default 30
        Forwarded to `train_test_split`; rows are not shuffled, so the
        validation set is taken from the end.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid, deterministic), where
        `deterministic` is the fitted-index `DeterministicProcess`.
    """
    # Process the raw tables and merge them on (date, playerId).
    box_scores = make_playerBoxScores(training_dfs, features)
    engagement = make_targets(training_dfs)
    merged = join_datasets([box_scores, engagement])

    # Keep only players that appear in the test set, then go long -> wide.
    in_test_set = merged.playerId.isin(pids_test)
    wide = merged.loc[in_test_set, :].pivot(columns="playerId")

    # Split the wide frame back into inputs and outputs; missing inputs -> -1.
    X = wide.loc(axis=1)[features, :].fillna(-1)
    Y = wide.loc(axis=1)[targets, :]

    # Temporal features: annual Fourier terms only (no trend).
    deterministic = DeterministicProcess(
        index=X.index,
        order=0,
        seasonal=False,  # set to True for weekly seasonality
        additional_terms=[CalendarFourier(freq='A', order=fourier)],
    )
    X = pd.concat([X, deterministic.in_sample()], axis=1)

    # Chronological train / validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, Y, test_size=test_size, shuffle=False)
    return X_train, X_valid, y_train, y_valid, deterministic

In [None]:
pids_test = players.playerId.loc[
    players.playerForTestSetAndFuturePreds.fillna(False)
].astype(str)
pids_test

In [None]:
len(players)

In [None]:
players.playerForTestSetAndFuturePreds.isnull().sum() #.fillna(False)

In [None]:
X= make_playerBoxScores(training_dfs, features) 
X

In [None]:
Y =make_targets(training_dfs)
Y

In [None]:
%time
import matplotlib.pyplot as plt
plt.scatter(x = 'target1', y = 'target2',  data=Y)

In [None]:
%time
import matplotlib.pyplot as plt
plt.scatter(x = 'target1', y = 'target3',  data=Y)

In [None]:
import matplotlib.pyplot as plt
plt.scatter(x = 'target1', y = 'target4',  data=Y)

In [None]:
import matplotlib.pyplot as plt
plt.scatter(x = 'target2', y = 'target3',  data=Y)

In [None]:
plt.figure(figsize=(10,8))
# Correlate numeric columns only: the frame built by make_targets also holds
# a Period `date` and a string `playerId`, which DataFrame.corr() no longer
# drops silently on current pandas.
sns.heatmap(Y.select_dtypes(include="number").corr(), vmin=-1.0, vmax=1.0, annot=True, cmap='coolwarm', linewidths=0.1)
plt.show()

In [None]:
df = join_datasets([X, Y])
df

In [None]:
fourier=4
fourier_terms = CalendarFourier(freq='A', order=fourier)
fourier_terms

In [None]:
pids_test 

In [None]:
df = df.loc[df.playerId.isin(pids_test), :]
df

In [None]:
df = df.pivot(columns="playerId")
df

In [None]:
features 

In [None]:
 X = df.loc(axis=1)[features, :]
 X

In [None]:
Y = df.loc(axis=1)[targets, :]
Y

In [None]:
r = df.loc(axis=1)[features, '660271'].mean()
r

In [None]:
X.fillna(-1, inplace=True)
check = X.copy()
X

In [None]:
fourier_terms = CalendarFourier(freq='A', order=fourier)
fourier_terms

In [None]:
deterministic = DeterministicProcess(
        index=X.index,
        order=0,
        seasonal=False,  # set to True for weekly seasonality
        additional_terms=[fourier_terms],
    )
deterministic

In [None]:
for i in deterministic.in_sample():
    print(i)

In [None]:
X = pd.concat([X, deterministic.in_sample()], axis=1)
X

In [None]:
 df.loc(axis=1)[features, :]


In [None]:
test_size = 30

X_train, X_valid, y_train, y_valid, deterministic = make_training_data(
    training_dfs, 
    features=features, 
    targets=targets,
    fourier=4,  
    test_size=test_size,
)
X_train

In [None]:
y_train

In [None]:
y_train.mean()

In [None]:
# Average over column level 1; the `level=` argument of Series.mean was
# removed in pandas 2.0, groupby(level=...) works on every version.
y_train.mean().groupby(level=1).mean()

In [None]:
# Average over column level 0 (groupby replaces the removed `level=` argument).
y_train.mean().groupby(level=0).mean()

In [None]:
r =y_train.loc(axis=1)[:,'660271']
r


In [None]:
check.loc(axis=1)[:,'660271'] 

In [None]:
r.mean()

In [None]:
# Minimum per column level 1 (groupby replaces the removed `level=` argument).
r.mean().groupby(level=1).min()

In [None]:
pd.qcut(r.mean(), q=1) 

In [None]:
pd.qcut(r.mean(), q=2) 

In [None]:
pd.qcut(r.mean(), q=3) 

In [None]:
# Minimum per column level 1 (groupby replaces the removed `level=` argument).
y_train.mean().groupby(level=1).min()

In [None]:
# Maximum per column level 1 (groupby replaces the removed `level=` argument).
y_train.mean().groupby(level=1).max()

In [None]:
# Bin the level-1 averages into 5 equal-sized quantile groups.
# groupby(level=1) replaces Series.mean(level=1), removed in pandas 2.0.
deciles = pd.qcut(y_train.mean().groupby(level=1).mean(), q=5)
deciles

In [None]:
pids_top_decile = deciles.index[deciles == deciles.max()] 
pids_top_decile

In [None]:
y_top_decile = y_train.loc(axis=1)[:, pids_top_decile]
y_top_decile

In [None]:
y_top_decile_avg = (y_top_decile / y_top_decile.max(axis=0)).mean(axis=1)
S = y_top_decile_avg.to_frame()
S.index.month

In [None]:
# Number of days in one year, used as the periodogram sampling frequency.
# The "Y" unit is ambiguous and rejected by current pandas, so the year is
# written out as 365 days.
fs = pd.Timedelta("365D") / pd.Timedelta("1D")
fs

In [None]:
from scipy.signal import periodogram
ts =y_top_decile_avg
detrend='linear'
ax=None
freqencies, spectrum = periodogram(
    ts,
    fs=fs,
    detrend=detrend,
    window="boxcar",
    scaling='spectrum',
)
freqencies[:20]

In [None]:
def plot_periodogram(ts, detrend='linear', ax=None):
    """Plot the periodogram of a daily series on a log-frequency axis.

    Parameters
    ----------
    ts : array-like
        Daily observations.
    detrend : str, default 'linear'
        Forwarded to `scipy.signal.periodogram`.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.
    """
    from scipy.signal import periodogram
    # Samples per year, so frequencies are expressed in cycles per year.
    # "365D" replaces "1Y": the year unit is rejected by current pandas.
    fs = pd.Timedelta("365D") / pd.Timedelta("1D")
    frequencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    # The step call draws the spectrum line.
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    # Tick positions are cycles per year, labelled with the matching period.
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual",
            "Semiannual",
            "Quarterly",
            "Bimonthly",
            "Monthly",
            "Biweekly",
            "Weekly",
            "Semiweekly",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Density")
    ax.set_title("Periodogram")
    return ax
detrend='linear'
freqencies, spectrum = periodogram(
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
freqencies[:10]


In [None]:
spectrum[:11]

In [None]:
_ = plot_periodogram(y_top_decile_avg)

In [None]:
# Network hyperparameters.
HIDDEN = 1024
ACTIVATION = 'relu'
DROPOUT_RATE = 0.5
LEARNING_RATE = 1e-2
BATCH_SIZE = 32

# One output unit per column of y_train.
OUTPUTS = y_train.shape[-1]

# Three identical Dense -> BatchNormalization -> Dropout stages followed by
# a linear output layer.
hidden_layers = []
for _ in range(3):
    hidden_layers.append(layers.Dense(HIDDEN, activation=ACTIVATION))
    hidden_layers.append(layers.BatchNormalization())
    hidden_layers.append(layers.Dropout(DROPOUT_RATE))
model = keras.Sequential(hidden_layers + [layers.Dense(OUTPUTS)])

In [None]:
y_train1 = y_train['target1']
y_train2 = y_train['target2']
y_train3 = y_train['target3']
y_train4 = y_train['target4']

In [None]:
OUTPUTS

In [None]:
X_train[:7]

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)

model.compile(optimizer=optimizer, loss='mae', metrics=['mae'])

# Stop after 3 epochs without validation improvement and roll the model back
# to its best epoch; without restore_best_weights the model keeps the weights
# of the last (worse) epoch.
early_stopping = keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=BATCH_SIZE,
    epochs=90,
    callbacks=[early_stopping],
)

In [None]:
pids_test

In [None]:
def make_test_data(test_dfs: dict, features, deterministic):
    """Build the wide feature matrix for one test day.

    Parameters
    ----------
    test_dfs : dict
        Unpacked test tables; must hold 'playerBoxScores'.
    features : list of str
        Box-score columns used as inputs.
    deterministic : DeterministicProcess
        The process returned by `make_training_data`; supplies the
        out-of-sample temporal features.

    Returns
    -------
    pandas.DataFrame
        One row per date, with the box-score features per player followed
        by the temporal features.
    """
    X = make_playerBoxScores(test_dfs, features)
    # Right-join on the test players so every one of them gets a row, even
    # without a box score.
    X = X.merge(pids_test, how='right')
    # Players added by the join have no date; borrow it from neighbouring
    # rows. ffill()/bfill() replace the deprecated fillna(method=...).
    X['date'] = X['date'].ffill().bfill()
    X = X.fillna(-1)
    # Convert from long to wide format
    X = X.pivot(index='date', columns="playerId")
    # Create temporal features
    temporal = deterministic.out_of_sample(steps=1, forecast_index=X.index)
    return pd.concat([X, temporal], axis=1)

In [None]:
def make_predictions(model, X, columns, targets):
    """Predict with `model` and reshape the output into submission rows.

    Parameters
    ----------
    model : object with a `predict(X)` method
        Returns one row per row of `X` and one value per entry of `columns`.
    X : pandas.DataFrame
        Feature matrix indexed by daily Period.
    columns : pandas.MultiIndex
        Column labels for the raw predictions; the last level must be named
        'playerId'.
    targets : list of str
        Target column names, clipped to [0, 100].

    Returns
    -------
    pandas.DataFrame
        Columns `date_playerId` + `targets`, indexed by an int64 YYYYMMDD
        `date`. `date_playerId` uses the day after the feature date.
    """
    y_pred = model.predict(X)
    # Wide (target, playerId) columns -> one row per (date, playerId).
    y_pred = pd.DataFrame(y_pred, columns=columns, index=X.index).stack()
    y_pred[targets] = y_pred[targets].clip(0, 100)
    y_pred['date_playerId'] = [
        (date + 1).strftime('%Y%m%d') + '_' + str(playerId)
        for date, playerId in y_pred.index
    ]
    y_pred = y_pred.reset_index('playerId', drop=True)
    y_pred = y_pred[['date_playerId'] + targets]  # reorder
    # pd.Int64Index was removed in pandas 2.0; an int64 Index is equivalent.
    y_pred.index = pd.Index(
        [int(date.strftime('%Y%m%d')) for date in y_pred.index],
        dtype='int64',
        name='date',
    )
    return y_pred


In [None]:
import mlb

env = mlb.make_env()
iter_test = env.iter_test()

In [None]:
# Serve one prediction batch per iteration of the competition iterator.
for test_df, sample_prediction_df in iter_test:
    # Unpack the nested box scores and build the feature matrix.
    test_dfs = unpack_data(test_df, dfs=['playerBoxScores'])
    X = make_test_data(test_dfs, features, deterministic)

    # Predict in the same column layout the model was trained on.
    y_pred = make_predictions(model, X, columns=y_train.columns, targets=targets)

    # Attach predictions to the requested ids; the 'date' index is moved to
    # a column for the merge and restored afterwards.
    requested_ids = sample_prediction_df[['date_playerId']].reset_index()
    submission = requested_ids.merge(
        y_pred, how='left', on='date_playerId').set_index('date')

    # Submit predictions
    env.predict(submission)  # constructs submissions.csv

In [None]:
test_dfs = unpack_data(test_df, dfs=['playerBoxScores'])
test_dfs

In [None]:
X = make_test_data(test_dfs, features, deterministic)
X

In [None]:
y_pred = make_predictions(
        model,
        X,
        columns=y_train.columns,
        targets=targets,
    )
y_pred