In [None]:
import os
import pandas as pd
from datetime import datetime, timedelta

# Load only the first 500 rows of the training file to keep the notebook fast.
train_df = pd.read_csv(
    "/kaggle/input/mlb-player-digital-engagement-forecasting/train.csv",
    nrows=500,
)
train_df.head()

In [None]:
players_df = pd.read_csv(
    "/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv"
)

# Composite categorical value "<birthCountry>_<primaryPositionName>" for every player.
players_df["position_and_birth_country"] = [
    "{0}_{1}".format(birth_country, position_name)
    for birth_country, position_name in zip(players_df["birthCountry"],
                                            players_df["primaryPositionName"])
]
players_df.head()

In [None]:
from datetime import datetime

def calc_debut_age(r):
    """Return the number of days between a player's DOB and MLB debut.

    Args:
        r: Mapping-like row exposing "mlbDebutDate" and "DOB"; each value is
            converted with ``str()`` and parsed as ``YYYY-MM-DD``.

    Returns:
        The day difference (debut minus birth) as an int, or None when either
        field is missing or cannot be parsed.
    """
    date_format = "%Y-%m-%d"
    try:
        debut_day = datetime.strptime(str(r["mlbDebutDate"]), date_format)
        birth_day = datetime.strptime(str(r["DOB"]), date_format)
    except Exception:
        # Best effort: any lookup or parsing problem means "unknown".
        return None
    return int((debut_day - birth_day).days)

# Age at MLB debut in days; missing when either date could not be parsed.
players_df["debut_age_days"] = players_df.apply(calc_debut_age, axis=1)

# playerId -> debut age lookup.
debut_age_dic = {
    player_id: debut_age
    for player_id, debut_age in zip(players_df["playerId"], players_df["debut_age_days"])
}
print("Created debut_age_dic", len(debut_age_dic))

In [None]:
category_headers = ["birthCountry", "primaryPositionName", "position_and_birth_country"]

# One playerId -> category value lookup per categorical column.
player_characteristics_category_dic = {
    category_header: dict(zip(players_df["playerId"], players_df[category_header]))
    for category_header in category_headers
}

for category_header in category_headers:
    print(category_header, len(player_characteristics_category_dic[category_header]))

In [None]:
from datetime import datetime, timedelta

# playerId -> raw date value lookups, taken straight from the players table.
player_birthday_dic = {
    player_id: birth_date
    for player_id, birth_date in zip(players_df["playerId"], players_df["DOB"])
}
player_debut_dic = {
    player_id: debut_date
    for player_id, debut_date in zip(players_df["playerId"], players_df["mlbDebutDate"])
}

print("player_debut_dic", len(player_debut_dic))
print("player_birthday_dic", len(player_birthday_dic))

In [None]:
def build_age_dic(player_age_dic, key, reference_dic, min_time, max_time):
    """Fill ``player_age_dic`` with per-day elapsed-day counts for every player.

    For each calendar day from ``min_time`` to ``max_time`` (both inclusive)
    and each player in ``reference_dic``, an entry is written under the key
    ``(day as "YYYYMMDD" string, int(player), key)`` holding:

    * the number of days since the player's reference date when positive,
    * ``0`` when the day is on or before the reference date,
    * ``None`` when the reference date cannot be parsed as ``YYYY-MM-DD``.

    Args:
        player_age_dic: Dict that is updated in place and returned.
        key: Label stored as the third element of every dict key.
        reference_dic: Mapping of player id -> reference date (``YYYY-MM-DD``).
        min_time: First day to generate, ``YYYYMMDD`` (str or int).
        max_time: Last day to generate, ``YYYYMMDD`` (str or int).

    Returns:
        The same ``player_age_dic`` object that was passed in.

    Raises:
        ValueError: If ``min_time`` / ``max_time`` cannot be parsed, or if
            ``max_time`` is earlier than ``min_time``. Previously these inputs
            made the loop run without ever reaching its stop condition.
    """
    print("Building age dic...", key)

    # Parse both bounds up front so the number of iterations is known and
    # finite, regardless of whether the bounds were given as str or int.
    start_date = datetime.strptime(str(min_time), "%Y%m%d")
    end_date = datetime.strptime(str(max_time), "%Y%m%d")
    if end_date < start_date:
        raise ValueError(
            "max_time ({0}) is earlier than min_time ({1})".format(max_time, min_time))

    # Reference dates do not depend on the day being generated: parse each one
    # once instead of once per (day, player) pair.
    baselines = []
    for player, raw_reference in reference_dic.items():
        try:
            baseline_day = datetime.strptime(str(raw_reference), "%Y-%m-%d")
        except (TypeError, ValueError):
            # Missing / malformed date (e.g. NaN): recorded as None below.
            baseline_day = None
        baselines.append((int(player), baseline_day))

    data_added = 0
    total_days = (end_date - start_date).days + 1

    for day_offset in range(total_days):
        target_date = start_date + timedelta(days=day_offset)
        target_date_key = target_date.strftime("%Y%m%d")

        for player_id, baseline_day in baselines:
            dic_key = (target_date_key, player_id, key)

            if baseline_day is None:
                player_age_dic[dic_key] = None
                continue

            days_since_baseline_day = (target_date - baseline_day).days
            if days_since_baseline_day > 0:
                player_age_dic[dic_key] = days_since_baseline_day
                data_added += 1
            else:
                # On or before the reference date: clamp to zero.
                player_age_dic[dic_key] = 0

    print("Success, data_added", key, data_added)

    return player_age_dic

In [None]:
# Date window (inclusive, YYYYMMDD) for which age features are precomputed.
min_time = "20170101"
max_time = "20220101"

player_age_dic = dict()

# Same builder, once per reference date: birthday first, then MLB debut.
for age_key, age_reference_dic in (("days_since_birthday", player_birthday_dic),
                                   ("days_since_debut", player_debut_dic)):
    player_age_dic = build_age_dic(player_age_dic,
                                   key=age_key,
                                   reference_dic=age_reference_dic,
                                   min_time=min_time,
                                   max_time=max_time)

In [None]:
awards_df = pd.read_csv(
    "/kaggle/input/mlb-player-digital-engagement-forecasting/awards.csv"
)

# Parsed copy of the award date, kept next to the original string column.
awards_df["award_date_dateobj"] = awards_df["awardDate"].map(
    lambda award_date: datetime.strptime(award_date, "%Y-%m-%d"))

# Order rows by the awardDate column, ascending.
awards_df = awards_df.sort_values(by="awardDate")
awards_df.head()

In [None]:
from collections import defaultdict

# playerId -> list of [award date, awardId] pairs, in awards_df row order.
player_award_dic = defaultdict(list)

for _, award_row in awards_df.iterrows():
    player_award_dic[award_row["playerId"]].append(
        [award_row["award_date_dateobj"], award_row["awardId"]])

print("Created player award dic", len(player_award_dic))

# Peek at the award counts of the first ten players.
for p in list(player_award_dic)[0:10]:
    print(p, len(player_award_dic[p]))

In [None]:
# Helper function to unpack json found in daily data
def unpack_json(json_str):
    """Convert one JSON payload from the daily data into a DataFrame.

    Args:
        json_str: A JSON document as a string, or a missing value
            (``None`` / NaN) when the day has no payload.

    Returns:
        An empty ``DataFrame`` for a missing value, otherwise the frame parsed
        by ``pd.read_json``.
    """
    from io import StringIO

    if pd.isna(json_str):
        return pd.DataFrame()

    if isinstance(json_str, str):
        # Passing literal JSON text straight to read_json is deprecated in
        # recent pandas; wrapping it in a file-like object works on old and
        # new versions alike.
        return pd.read_json(StringIO(json_str))

    # Anything else (e.g. an already file-like object) is handed over as is.
    return pd.read_json(json_str)

**FEATURE ENGINEERING...**

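Dictionaries we can use for feature engineering:
- `player_characteristics_category_dic`: for each category (birth country, primary position, and their combination), a `playerId` → value lookup
- `player_age_dic`: days since birth and days since debut, keyed by `(date, playerId, key)`
- `player_award_dic`: for each `playerId`, a list of `[award date, awardId]` pairs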

In [None]:
def feature_engineering_prediction_df(input_df, input_sample_prediction_df, row_threshold):
    """Build a prediction-style frame (date, date_playerId, target1..4).

    Two modes, selected by ``input_sample_prediction_df``:

    * Non-empty: its columns are validated and a copy is returned unchanged.
    * Empty: the ``nextDayPlayerEngagement`` JSON of each ``input_df`` row is
      unpacked with ``unpack_json`` and the resulting frames are stacked.
      ``date`` is the engagement date minus one day and ``date_playerId`` is
      ``"<engagement date>_<playerId>"``, both using ``YYYYMMDD`` dates.

    Args:
        input_df: Daily data; must be non-empty. In the second mode it must
            have "date" and "nextDayPlayerEngagement" columns.
        input_sample_prediction_df: Existing prediction frame, or an empty
            DataFrame to build one from ``input_df``.
        row_threshold: Maximum number of ``input_df`` rows to read; values
            <= 0 mean "no limit".

    Returns:
        DataFrame with columns date, date_playerId, target1..target4 (second
        mode) or a copy of ``input_sample_prediction_df`` (first mode).

    Raises:
        AssertionError: On missing columns, empty input, inconsistent
            engagement columns, or when no engagement rows were found.
    """
    final_headers = ["date", "date_playerId", "target1", "target2", "target3", "target4"]
    rows_processed = 0

    if input_sample_prediction_df.shape[0] > 0:
        for column_to_check in final_headers:
            assert column_to_check in input_sample_prediction_df.columns.values, \
                "sample prediction frame is missing column {0}".format(column_to_check)

    assert input_df.shape[0] > 0, "input_df must not be empty"

    if input_sample_prediction_df.shape[0] > 0:
        final_sample_prediction_df = input_sample_prediction_df.copy()
    else:
        assert "nextDayPlayerEngagement" in input_df.columns.values
        assert "date" in input_df.columns.values

        # Collect the per-day frames and concatenate once at the end;
        # DataFrame.append is gone from recent pandas and growing a frame
        # row-block by row-block is quadratic anyway.
        engagement_frames = []
        expected_columns = None

        for _, row in input_df.iterrows():
            next_day_player_engagement_df = unpack_json(row["nextDayPlayerEngagement"])

            # Days without engagement data contribute nothing; they must not
            # trip the column-consistency check below.
            if next_day_player_engagement_df.shape[0] > 0:
                frame_columns = set(next_day_player_engagement_df.columns.values)
                if expected_columns is None:
                    expected_columns = frame_columns
                else:
                    assert frame_columns == expected_columns, \
                        "engagement columns differ between days"
                engagement_frames.append(next_day_player_engagement_df)

            rows_processed += 1

            if row_threshold > 0 and rows_processed >= row_threshold:
                break

        assert len(engagement_frames) > 0, "no engagement rows found in input_df"

        # Original row labels are kept (no ignore_index), as before.
        final_sample_prediction_df = pd.concat(engagement_frames)

        engagement_dates = [datetime.strptime(x, "%Y-%m-%d")
                            for x in final_sample_prediction_df["engagementMetricsDate"]]

        # "date" is the day the prediction is made: one day before the
        # engagement date that is embedded in date_playerId.
        final_sample_prediction_df["date"] = [
            (engagement_date - timedelta(days=1)).strftime("%Y%m%d")
            for engagement_date in engagement_dates]
        final_sample_prediction_df["date_playerId"] = [
            "{0}_{1}".format(engagement_date.strftime("%Y%m%d"), player_id)
            for engagement_date, player_id in zip(engagement_dates,
                                                  final_sample_prediction_df["playerId"])]

        final_sample_prediction_df = final_sample_prediction_df[final_headers].copy()

    print("Feature engineering complete, rows processed = {}".format(rows_processed))

    return final_sample_prediction_df

In [None]:
# No sample prediction frame exists for training, so pass an empty one and let
# the targets be unpacked from the first 200 rows of train_df.
final_sample_prediction_df = feature_engineering_prediction_df(
    input_df=train_df,
    input_sample_prediction_df=pd.DataFrame(),
    row_threshold=200,
)
final_sample_prediction_df.head()

In [None]:
def add_feature_ages(df, player_age_dic):
    """Attach the two age features to ``df`` (in place) and return it.

    Args:
        df: Frame with "date" and "playerId" columns.
        player_age_dic: Lookup keyed by ``(date, playerId, key)``.

    Returns:
        The same ``df`` with "feature_days_since_dob" and
        "feature_days_since_debut" added; rows without a matching entry in
        ``player_age_dic`` get None.
    """
    def make_lookup(age_key):
        # Row-wise lookup that falls back to None for unknown keys.
        return lambda row: player_age_dic.get((row["date"], row["playerId"], age_key))

    feature_sources = (
        ("feature_days_since_dob", "days_since_birthday"),
        ("feature_days_since_debut", "days_since_debut"),
    )
    for feature_column, age_key in feature_sources:
        df[feature_column] = df.apply(make_lookup(age_key), axis=1)

    return df

In [None]:
def feature_engineering_model_df(input_prediction_df,
                                 player_age_dic):
    """Turn a prediction-style frame into a model frame with feature columns.

    Args:
        input_prediction_df: Non-empty frame with "date_playerId" and
            target1..target4 columns. When it has no "date" column, one is
            derived as the date embedded in "date_playerId" minus one day
            (``YYYYMMDD`` string). The caller's frame is not modified.
        player_age_dic: Lookup passed through to ``add_feature_ages``.

    Returns:
        New frame with date, date_playerId, target1..target4, an integer
        "playerId" column parsed from "date_playerId", plus whatever
        ``add_feature_ages`` adds.

    Raises:
        AssertionError: If the input is empty or a required column is missing.
    """
    print("input_prediction_df columns", input_prediction_df.columns.values)

    assert input_prediction_df.shape[0] > 0, "input_prediction_df must not be empty"

    prediction_df = input_prediction_df
    if "date" not in prediction_df.columns.values:
        assert "date_playerId" in prediction_df.columns.values, \
            "cannot derive 'date' without a 'date_playerId' column"
        derived_dates = prediction_df["date_playerId"].map(
            lambda x: (datetime.strptime(str(x).split("_")[0], "%Y%m%d") - timedelta(days=1)).strftime("%Y%m%d"))
        # assign() returns a new frame, so the caller's frame keeps its
        # original columns instead of silently gaining a "date" column.
        prediction_df = prediction_df.assign(date=derived_dates)

    required_headers = ["date", "date_playerId", "target1", "target2", "target3", "target4"]
    for required_header in required_headers:
        assert required_header in prediction_df.columns.values, \
            "missing required column {0}".format(required_header)

    final_model_df = prediction_df[required_headers].copy()
    final_model_df["playerId"] = final_model_df["date_playerId"].map(lambda x: int(str(x).split("_")[1]))

    final_model_df = add_feature_ages(final_model_df, player_age_dic)

    return final_model_df

In [None]:
# Training frame: targets plus the age features looked up from player_age_dic.
final_model_df = feature_engineering_model_df(
    input_prediction_df=final_sample_prediction_df,
    player_age_dic=player_age_dic,
)

final_model_df.head()

In [None]:
# then we train the model...

from lightgbm import LGBMRegressor

# target name -> fitted regressor, and target name -> feature columns it was fitted on.
model_dic = {}
feature_header_dic = {}

# Every column whose name contains "feature_" is used as a model input.
feature_headers = [f for f in final_model_df.columns.values if "feature_" in f]
assert len(feature_headers) > 0

for target_header in ["target1", "target2", "target3", "target4"]:

    print("Training model", target_header)
    # fit() returns the fitted estimator, so no temporary name is needed.
    model_dic[target_header] = LGBMRegressor().fit(final_model_df[feature_headers],
                                                   final_model_df[target_header])
    feature_header_dic[target_header] = feature_headers

In [None]:
import mlb
# Inference loop: for every batch yielded by the competition environment,
# build the features, predict each target with its own model and submit.
# NOTE(review): `mlb` is presumably the Kaggle competition API module — its
# behaviour is not visible in this notebook.
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test:
    
    # NOTE: this rebinds the notebook-level name `final_model_df` (the training
    # frame built earlier) to the current batch's feature frame.
    final_model_df = feature_engineering_model_df(input_prediction_df=sample_prediction_df,
                                                  player_age_dic=player_age_dic)
    
    for target_header in ["target1", "target2", "target3", "target4"]:
        # Use exactly the feature columns this target's model was fitted on.
        # This also rebinds the notebook-level `feature_headers`.
        feature_headers = feature_header_dic[target_header]
        final_model_df[target_header] = model_dic[target_header].predict(final_model_df[feature_headers])
        # .values copies by position, so the row order of final_model_df and
        # sample_prediction_df is relied upon to match.
        sample_prediction_df[target_header] = final_model_df[target_header].values
    
    # Keep only the id and target columns before handing the frame back.
    final_headers = ["date_playerId", "target1", "target2", "target3", "target4"]
    sample_prediction_df = sample_prediction_df[final_headers].copy()
    
    env.predict(sample_prediction_df)