In [None]:
import re
import pickle

In [None]:
import lightgbm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import seaborn as sns
from sklearn.multioutput import MultiOutputRegressor

In [None]:
import mlb

In [None]:
# Label columns; the inference loop writes one prediction per column into each submission frame.
TARGET_COLS = ["target1", "target2", "target3", "target4"]
# NOTE(review): not referenced in the visible code — engineer_moving_averages hard-codes a 7-row window.
ROLLING_WINDOWS = [7, 30]
# NOTE(review): not referenced in the visible code; presumably consumed at training time — confirm.
CATEGORICAL_FEATURES = [
    "home",
    "positionCode",
    "birthCountry",
    "day_of_week",
    "week_of_year",
    "month",
    "toTeamId",
    "fromTeamId",
    "teamId",
    "statusCode",
    "leagueId",
    "divisionId"
]
# Box-score statistics that engineer_player_box_scores turns into `relative_<stat>` features.
BOX_SCORES_COMPARISON_COLS = [
    "flyOuts",
    "runsScored",
    "doubles",
    "triples",
    "homeRuns",
    "strikeOuts",
    "baseOnBalls",
    "intentionalWalks",
    "hits",
    "stolenBases",
    "plateAppearances",
    "totalBases",
    "rbi",
    "errors",
    "chances",
]
# Box-score columns copied unchanged into the box-score feature frame by the inference loop.
BOX_SCORES_CATEGORICAL_COLS = ["home", "positionCode"]
BOX_SCORES_NUMERIC_COLS = ["battingOrder"]
# Columns kept from players.csv; DOB and mlbDebutDate are later replaced by process_players.
PLAYERS_COLS = ["playerId", "DOB", "mlbDebutDate", "birthCountry", "salary"]
# Season anchor dates for which engineer_seasons emits a `days_from_<col>` feature.
SEASONS_DAYS_FROM_COLS = [
    "seasonStartDate",
    "preSeasonStartDate",
    "regularSeasonStartDate",
    "postSeasonStartDate",
    "postSeasonEndDate",
]
# Season milestone dates for which engineer_seasons emits an `is_<col>` flag.
# NOTE: "postSeasonEndDate" appears both here and in SEASONS_DAYS_FROM_COLS.
SEASONS_DATE_COLS = [
    "allStarDate",
    "lastDate1stHalf",
    "firstDate2ndHalf",
    "seasonEndDate",
    "preSeasonEndDate",
    "regularSeasonEndDate",
    "postSeasonEndDate",
]
# Columns process_transactions guarantees on its output (missing ones are added as 0);
# the inference loop also fills NaN in these columns with 0 after merging.
TRANSACTIONS_COLS = [
    "toTeamId",
    "fromTeamId",
    "SFA",
    "TR",
    "NUM",
    "ASG",
    "DES",
    "CLW",
    "OUT",
    "REL",
    "SC",
    "OPT",
    "RTN",
    "SGN",
    "SE",
    "CU",
    "DFA",
    "RET",
]
# Columns kept from each day's roster frame in the inference loop.
ROSTERS_COLS = ["date", "playerId", "teamId", "statusCode"]
# Output columns of engineer_player_twitter / engineer_team_twitter respectively.
PLAYER_TWITTER_COLS = ["year", "month", "playerId", "player_followers_depth"]
TEAM_TWITTER_COLS = ["year", "month", "teamId", "team_followers_depth"]
# Columns kept from teams.csv (after renaming `id` to `teamId`).
TEAMS_COLS = ["teamId", "leagueId", "divisionId"]
# Columns of the empty placeholder frame used when a day has no awards.
AWARDS_COLS = ["playerId", "awardsCount", "allStarCount", "mvpCount", "rookieCount"]

In [None]:
# Exact columns, in order, selected from the merged per-day frame and passed to each
# model's predict() in the inference loop.
# NOTE(review): presumably this must match the column order the models were trained on — confirm.
FEATURES_COLS = ['target1_mean', 'target1_max', 'target1_std', 'target2_mean',
       'target2_max', 'target2_std', 'target3_mean', 'target3_max',
       'target3_std', 'target4_mean', 'target4_max', 'target4_std',
       'day_of_week', 'week_of_year', 'month', 'relative_flyOuts',
       'relative_runsScored', 'relative_doubles', 'relative_triples',
       'relative_homeRuns', 'relative_strikeOuts', 'relative_baseOnBalls',
       'relative_intentionalWalks', 'relative_hits', 'relative_stolenBases',
       'relative_plateAppearances', 'relative_totalBases', 'relative_rbi',
       'relative_errors', 'relative_chances', 'battingOrder', 'home',
       'positionCode', 'birthCountry', 'salary', 'age', 'debut_days',
       'days_from_seasonStartDate', 'days_from_preSeasonStartDate',
       'days_from_regularSeasonStartDate', 'days_from_postSeasonStartDate',
       'days_from_postSeasonEndDate', 'is_allStarDate', 'is_lastDate1stHalf',
       'is_firstDate2ndHalf', 'is_seasonEndDate', 'is_preSeasonEndDate',
       'is_regularSeasonEndDate', 'is_postSeasonEndDate', 'is_restSeason',
       'historicalAwardsCount', 'historicalAllStarCount', 'historicalMvpCount',
       'historicalRookieCount', 'awardsCount', 'allStarCount', 'mvpCount',
       'rookieCount', 'fromTeamId', 'toTeamId', 'ASG', 'CLW', 'CU', 'DES',
       'DFA', 'NUM', 'OPT', 'OUT', 'REL', 'RET', 'RTN', 'SC', 'SE', 'SFA',
       'SGN', 'TR', 'teamId', 'statusCode', 'player_followers_depth',
       'team_followers_depth', 'leagueId', 'divisionId']

In [None]:
# Directory from which seasons.csv, awards.csv and teams.csv are read.
ROOT = "../input/mlb-player-digital-engagement-forecasting"
# Directory from which labels, box scores, players, rosters and twitter CSVs are read.
CUSTOM_DATA_ROOT = "../input/datav7fixed"

In [None]:
def unpack_col_to_df(col):
    """Expand a column of JSON strings into a single DataFrame.

    Every string entry of ``col`` is parsed with ``pd.read_json`` and the
    resulting frames are concatenated with a fresh index. Non-string entries
    (for example missing values) are skipped. Columns whose name contains
    "date" (case-insensitive) are converted with ``pd.to_datetime``.

    Parameters
    ----------
    col : iterable
        Values to unpack; only ``str`` entries are parsed.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame, or an empty DataFrame when there is nothing to
        parse or when parsing/conversion raises ``TypeError`` or ``ValueError``.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from io import StringIO

    try:
        # Passing a literal JSON string to read_json is deprecated in recent
        # pandas; wrapping it in StringIO works on old and new versions alike.
        frames = [pd.read_json(StringIO(row)) for row in col if isinstance(row, str)]
        # pd.concat raises ValueError on an empty list, which is handled below.
        output = pd.concat(frames, ignore_index=True)
        date_cols = [name for name in output.columns if re.search("date", name.lower())]
        for name in date_cols:
            output[name] = pd.to_datetime(output[name])
    except (TypeError, ValueError):
        return pd.DataFrame()

    return output

In [None]:
def reduce_mem_usage(df, verbose=False):
    """Return a copy of ``df`` whose numeric columns use narrower dtypes.

    Signed-integer columns are cast to the first of int8/int16/int32/int64
    whose limits strictly contain the column's min and max. Float columns are
    cast to float16 or float32 under the same strict test, otherwise float64.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast; it is not modified.
    verbose : bool, default False
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The downcast copy.
    """
    mib = 1024 ** 2
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]

    df = df.copy()
    start_mem = df.memory_usage().sum() / mib

    for name in df.columns:
        kind = df[name].dtypes
        if kind not in numerics:
            continue

        lowest, highest = df[name].min(), df[name].max()

        if str(kind).startswith("int"):
            # Narrowest integer type wins; if none fits, the column is left as is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # float64 is the fallback (also reached when min/max are NaN).
            chosen = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    chosen = candidate
                    break
            df[name] = df[name].astype(chosen)

    end_mem = df.memory_usage().sum() / mib
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, saved_pct)
        )
    return df

In [None]:
def process_players(df):
    """Replace the player date columns with day counts relative to ``date``.

    Adds ``age`` (days between ``date`` and ``DOB``) and ``debut_days`` (days
    between ``date`` and ``mlbDebutDate``), then drops ``DOB`` and
    ``mlbDebutDate``. The input frame is not modified.
    """
    enriched = df.assign(
        age=lambda frame: (frame["date"] - frame["DOB"]).dt.days,
        debut_days=lambda frame: (frame["date"] - frame["mlbDebutDate"]).dt.days,
    )
    return enriched.drop(columns=["DOB", "mlbDebutDate"])

In [None]:
def process_box_scores(df):
    """Aggregate raw player box scores to one row per (date, playerId).

    Drops identifier/descriptive columns, renames ``gameDate`` to ``date``
    and parses it, downcasts numerics, then averages every numeric column
    per date and player.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw box scores; must contain the dropped columns, ``gameDate`` and
        ``playerId``.

    Returns
    -------
    pandas.DataFrame
        ``date``, ``playerId`` and the per-group mean of each numeric column.
    """
    df = df.drop(columns=["gamePk", "teamId", "teamName", "playerName", "gameTimeUTC", "jerseyNum"])
    df = df.rename(columns={"gameDate": "date"})
    df["date"] = pd.to_datetime(df["date"])
    df = reduce_mem_usage(df)
    # numeric_only=True: since pandas 2.0 a plain .mean() raises on any
    # remaining non-numeric column instead of silently dropping it.
    output = df.groupby(["date", "playerId"], as_index=False).mean(numeric_only=True)

    return output

In [None]:
def process_transactions(df):
    """Turn raw transaction rows into one feature row per player and date.

    Steps:
    1. drop ``transactionId``/``description`` and de-duplicate;
    2. fill missing ``fromTeamId``/``toTeamId`` with 0;
    3. pivot ``typeCode`` into one count column per code;
    4. within each (playerId, date) keep the last row after sorting by the
       team ids;
    5. add any column of ``TRANSACTIONS_COLS`` that is absent, filled with 0;
    6. drop ``date`` from the result.

    The input frame is not modified.
    """
    key_cols = ["playerId", "date", "fromTeamId", "toTeamId"]

    events = df.copy().drop(columns=["transactionId", "description"]).drop_duplicates()
    # Constant marker that the pivot sums into per-typeCode counts.
    events["constant"] = 1
    events[["fromTeamId", "toTeamId"]] = events[["fromTeamId", "toTeamId"]].fillna(0)

    wide = pd.pivot_table(
        events[key_cols + ["typeCode", "constant"]],
        index=key_cols,
        columns="typeCode",
        values="constant",
        aggfunc="sum",
    ).reset_index()

    type_codes = events.typeCode.unique()
    wide[type_codes] = wide[type_codes].fillna(0)
    wide = reduce_mem_usage(wide)
    wide[type_codes] = wide[type_codes].astype("uint8")

    wide = wide.sort_values(key_cols)
    output = wide.groupby(["playerId", "date"], as_index=False).last()

    # The date is shifted by one day and then dropped before returning.
    output["date"] += pd.Timedelta(days=1)

    present_cols = output.columns.to_list()
    absent_cols = [name for name in TRANSACTIONS_COLS if name not in present_cols]
    padded = output.reindex(columns=present_cols + absent_cols, fill_value=0)
    return padded.drop(columns=["date"])

In [None]:
# Engagement labels; every target column is read directly as float16.
label_dtypes = {target: "float16" for target in TARGET_COLS}
data = pd.read_csv(f"{CUSTOM_DATA_ROOT}/labels.csv", parse_dates=["date"], dtype=label_dtypes)

In [None]:
# Historical player box scores with `date` parsed as datetime.
box_scores = pd.read_csv(f"{CUSTOM_DATA_ROOT}/box_scores.csv", parse_dates=["date"])

In [None]:
# Downcast both frames' numeric columns to cut memory.
data, box_scores = (reduce_mem_usage(frame) for frame in (data, box_scores))

In [None]:
# Player master data; the two date columns are parsed for age / debut features.
players = pd.read_csv(
    f"{CUSTOM_DATA_ROOT}/players.csv", parse_dates=["DOB", "mlbDebutDate"]
)

# Keep only the feature columns and store birthCountry as a categorical.
players_processed = players[PLAYERS_COLS].assign(
    birthCountry=lambda frame: pd.Categorical(frame["birthCountry"])
)

In [None]:
# Season calendar, with every date column parsed as datetime.
seasons = pd.read_csv(
    f"{ROOT}/seasons.csv",
    parse_dates=[
        "seasonStartDate",
        "seasonEndDate",
        "preSeasonStartDate",
        "preSeasonEndDate",
        "regularSeasonStartDate",
        "regularSeasonEndDate",
        "lastDate1stHalf",
        "allStarDate",
        "firstDate2ndHalf",
        "postSeasonStartDate",
        "postSeasonEndDate",
    ],
)

# Expand each season into one row per calendar day between its start and end.
seasons["date"] = [
    pd.date_range(start, end)
    for start, end in zip(seasons.seasonStartDate, seasons.seasonEndDate)
]

seasons = seasons.explode("date")
# explode() returns the exploded column with object dtype; convert it back to
# datetime64 so the merge key below has the same dtype on both sides.
seasons["date"] = pd.to_datetime(seasons["date"])

# Continuous daily calendar from the first season start to the last season end.
dates = pd.DataFrame({"date": pd.date_range(seasons.seasonStartDate.min(), seasons.seasonEndDate.max())})

seasons = dates.merge(seasons, on=["date"], how="left")

# Days that fall between seasons have no season row; back-fill them from the
# next season. "postSeasonEndDate" is in both constant lists, so the labels are
# de-duplicated (order preserved) before selecting.
cols_to_bfill = list(dict.fromkeys(SEASONS_DAYS_FROM_COLS + SEASONS_DATE_COLS + ["seasonId"]))

seasons[cols_to_bfill] = seasons[cols_to_bfill].bfill()

del dates

In [None]:
# Awards history; aggregated per player by engineer_historical_awards further down.
historical_awards = pd.read_csv(f"{ROOT}/awards.csv")

In [None]:
# Team lookup: downcast, expose `id` as `teamId`, keep only the feature columns.
teams = (
    pd.read_csv(f"{ROOT}/teams.csv")
    .pipe(reduce_mem_usage)
    .rename(columns={"id": "teamId"})
    .loc[:, TEAMS_COLS]
)

In [None]:
# Twitter follower history for players and teams, downcast right after loading.
player_twitter_followers = reduce_mem_usage(
    pd.read_csv(f"{CUSTOM_DATA_ROOT}/player_twitter_followrs.csv")
)
team_twitter_followers = reduce_mem_usage(
    pd.read_csv(f"{CUSTOM_DATA_ROOT}/team_twitter_followrs.csv")
)

## Feature engineering

In [None]:
# Order labels by player then date so the per-player shift/rolling features
# in engineer_moving_averages see each player's rows in chronological order.
data = data.sort_values(["playerId", "date"])

In [None]:
def engineer_moving_averages(df, window=7):
    """Add lagged rolling statistics of every target column, per player.

    For each column in ``TARGET_COLS`` this adds ``<col>_mean``, ``<col>_max``
    and ``<col>_std``: the rolling mean, maximum and standard deviation over
    the previous ``window`` rows of the same ``playerId``. Values are shifted
    by one row first, so a row never sees its own target. Rows with fewer than
    ``window`` earlier observations get NaN.

    The ``_max`` and ``_std`` columns previously repeated the rolling mean.
    NOTE(review): confirm the pickled models were trained on features computed
    this way; if training used the mean for all three, retrain or revert.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``playerId`` and every target column, with each player's
        rows in chronological order. Modified in place.
    window : int, default 7
        Number of previous rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for col in TARGET_COLS:
        by_player = df.groupby("playerId")[col]
        # Compute all three statistics before writing any column back to df.
        stats = {
            "mean": by_player.transform(lambda x: x.shift(1).rolling(window).mean()),
            "max": by_player.transform(lambda x: x.shift(1).rolling(window).max()),
            "std": by_player.transform(lambda x: x.shift(1).rolling(window).std()),
        }
        for suffix, values in stats.items():
            df[f"{col}_{suffix}"] = values

    return df


def engineer_seasonality(df):
    """Add calendar features derived from ``date``.

    Writes ``day_of_week``, ``week_of_year`` (ISO week) and ``month`` as
    uint8 columns directly onto ``df`` and returns the same object.
    """
    stamps = df["date"].dt
    iso_week = stamps.isocalendar()["week"]

    df["day_of_week"] = stamps.dayofweek.astype("uint8")
    df["week_of_year"] = iso_week.astype("uint8")
    df["month"] = stamps.month.astype("uint8")

    return df


def engineer_player_box_scores(df):
    """Express each box-score stat relative to the player's own history.

    Rows are sorted by ``playerId`` then ``date`` and re-indexed from 0. For
    every stat in ``BOX_SCORES_COMPARISON_COLS`` a ``relative_<stat>`` column
    is added: the stat minus the expanding mean of the player's earlier rows
    (at least two earlier observations are required, otherwise NaN).

    Returns only the columns whose name starts with ``date``, ``playerId`` or
    ``relative``. The input frame is not modified.
    """
    ordered = df.sort_values(["playerId", "date"], ignore_index=True)

    for stat in BOX_SCORES_COMPARISON_COLS:
        # shift(1) keeps the current row out of its own baseline.
        history_mean = ordered.groupby("playerId")[stat].transform(
            lambda values: values.shift(1).expanding(2).mean()
        )
        ordered[f"relative_{stat}"] = ordered[stat] - history_mean

    selected = [name for name in ordered.columns if re.match("date|playerId|relative", name)]
    return ordered[selected]


def engineer_seasons(df):
    """Build per-date season features.

    For each column in ``SEASONS_DAYS_FROM_COLS`` adds ``days_from_<col>``
    (whole days from that column's date to ``date``); for each column in
    ``SEASONS_DATE_COLS`` adds ``is_<col>`` (1 when ``date`` equals it, as
    uint8). ``is_restSeason`` is 1 where ``days_from_seasonStartDate`` is
    negative.

    Returns ``date`` plus the generated columns, downcast with
    ``reduce_mem_usage``. The input frame is not modified.
    """
    frame = df.copy()

    offset_cols = []
    for anchor in SEASONS_DAYS_FROM_COLS:
        feature = f"days_from_{anchor}"
        frame[feature] = (frame["date"] - frame[anchor]).dt.days
        offset_cols.append(feature)

    flag_cols = []
    for milestone in SEASONS_DATE_COLS:
        feature = f"is_{milestone}"
        frame[feature] = (frame["date"] == frame[milestone]).astype("uint8")
        flag_cols.append(feature)

    frame["is_restSeason"] = np.where(frame["days_from_seasonStartDate"] < 0, 1, 0)

    selected = ["date", *offset_cols, *flag_cols, "is_restSeason"]
    return reduce_mem_usage(frame[selected])



def engineer_historical_awards(df):
    """Count each player's awards, overall and by award family.

    Returns one row per ``playerId`` with ``historicalAwardsCount`` (non-null
    award names) and the number of award names matching, case-insensitively,
    the all-star, MVP and rookie patterns.
    """
    names_by_player = df.groupby("playerId")["awardName"]

    family_patterns = {
        "historicalAllStarCount": "all.*star",
        "historicalMvpCount": "mvp|most.*valuable",
        "historicalRookieCount": "rookie",
    }

    counts = [names_by_player.count().rename("historicalAwardsCount")]
    for feature, pattern in family_patterns.items():
        matches = names_by_player.apply(
            lambda names, pattern=pattern: (names.str.contains(pattern, case=False)).sum()
        )
        counts.append(matches.rename(feature))

    return pd.concat(counts, axis=1).reset_index()


def engineer_awards(df):
    """Count awards per (date, playerId), with dates moved one day forward.

    Returns one row per shifted ``date`` and ``playerId`` with ``awardsCount``
    (non-null award names) and the number of award names matching, 
    case-insensitively, the all-star, MVP and rookie patterns. The input
    frame is not modified.
    """
    shifted = df.assign(date=df["date"] + pd.Timedelta(days=1))
    names_by_day = shifted.groupby(["date", "playerId"])["awardName"]

    family_patterns = {
        "allStarCount": "all.*star",
        "mvpCount": "mvp|most.*valuable",
        "rookieCount": "rookie",
    }

    counts = [names_by_day.count().rename("awardsCount")]
    for feature, pattern in family_patterns.items():
        matches = names_by_day.apply(
            lambda names, pattern=pattern: (names.str.contains(pattern, case=False)).sum()
        )
        counts.append(matches.rename(feature))

    return pd.concat(counts, axis=1).reset_index()


def engineer_player_twitter(df):
    """Add ``player_followers_depth`` and return ``PLAYER_TWITTER_COLS``.

    The depth is ``numberOfFollowers`` divided by the median follower count of
    all rows sharing the same calendar ``month``. ``month``, ``year`` and the
    depth column are written onto ``df`` itself.
    """
    stamps = df["date"].dt
    df["month"] = stamps.month
    df["year"] = stamps.year

    monthly_median = df.groupby("month")["numberOfFollowers"].transform("median")
    df["player_followers_depth"] = df["numberOfFollowers"] / monthly_median

    return df[PLAYER_TWITTER_COLS]


def engineer_team_twitter(df):
    """Add ``team_followers_depth`` and return ``TEAM_TWITTER_COLS``.

    The depth is ``numberOfFollowers`` divided by the median follower count of
    all rows sharing the same calendar ``month``. ``month``, ``year`` and the
    depth column are written onto ``df`` itself.
    """
    stamps = df["date"].dt
    df["month"] = stamps.month
    df["year"] = stamps.year

    monthly_median = df.groupby("month")["numberOfFollowers"].transform("median")
    df["team_followers_depth"] = df["numberOfFollowers"] / monthly_median

    return df[TEAM_TWITTER_COLS]

In [None]:
# Load one pickled model per target (target1..target4).
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files that come from a trusted source.
loaded_models = []
for target_number in range(1, 5):
    # `with` closes each file handle; the original left all four open.
    with open(f"../input/modelv8/lightgbm_v8_target{target_number}.pkl", "rb") as model_file:
        loaded_models.append(pickle.load(model_file))

model1, model2, model3, model4 = loaded_models

In [None]:
# Remove two columns from the historical box scores before averaging per
# (date, playerId); process_box_scores drops the same two from daily data.
box_scores = box_scores.drop(columns=["gameTimeUTC", "jerseyNum"])

In [None]:
# One row per (date, playerId) holding the mean of each numeric column.
# numeric_only=True matches process_box_scores: since pandas 2.0 a plain
# .mean() raises if any non-numeric column is still present.
box_scores_agg = box_scores.groupby(["date", "playerId"], as_index=False).mean(numeric_only=True)

In [None]:
# Move every aggregated box score one day forward, the same shift the inference
# loop applies to each day's box scores.
# NOTE: not idempotent — re-running this cell shifts the dates again.
box_scores_agg.date += pd.Timedelta(days=1)

In [None]:
# Per-date season features (days_from_* / is_* columns); merged on `date` in the inference loop.
seasons_features = engineer_seasons(seasons)

In [None]:
# Per-player award counts from the awards history, downcast to save memory.
historical_awards_features = reduce_mem_usage(
    engineer_historical_awards(historical_awards)
)

In [None]:
# Historical rosters with `date` parsed as datetime, downcast right after loading.
rosters_features = reduce_mem_usage(
    pd.read_csv(f"{CUSTOM_DATA_ROOT}/rosters.csv", parse_dates=["date"])
)

In [None]:
# Release the raw frames; only their derived feature frames are used from here on.
del box_scores, seasons, historical_awards

In [None]:
env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

# One fitted model per target column, in TARGET_COLS order.
target_models = dict(zip(TARGET_COLS, (model1, model2, model3, model4)))

for (test_df, sample_prediction_df) in iter_test:
    # Date of the incoming data; features are built for the following day.
    current_date = pd.to_datetime(str(sample_prediction_df.reset_index().date.iat[0]))
    test_date = current_date + pd.Timedelta(days=1)

    # One row per requested prediction, keyed by the (date, playerId) pair
    # encoded in "date_playerId".
    # NOTE(review): assumes the encoded date equals `test_date` — confirm.
    predictions = sample_prediction_df.reset_index()["date_playerId"].str.split("_", expand=True).rename(columns={0: "date", 1: "playerId"})
    predictions["date"] = pd.to_datetime(predictions["date"].astype(str))
    predictions["playerId"] = predictions["playerId"].astype("int32")

    # --- box scores (shifted one day forward, like the historical frame) ---
    test_box_scores = unpack_col_to_df(test_df.playerBoxScores)
    if test_box_scores.empty:
        # No games today: nothing to append. Previously this left the variable
        # undefined on the first day, or stale (re-appended) on later days.
        test_box_scores_agg = None
    else:
        test_box_scores_agg = process_box_scores(test_box_scores)
        test_box_scores_agg["date"] += pd.Timedelta(days=1)

    # --- rosters ---
    test_rosters = unpack_col_to_df(test_df.rosters)
    if not test_rosters.empty:
        test_rosters = test_rosters.rename(columns={"gameDate": "date"})
        test_rosters["date"] += pd.Timedelta(days=1)
        test_rosters = test_rosters[ROSTERS_COLS]
    else:
        # No roster data: create placeholder rows dated `test_date` so the
        # forward-fill below carries each player's last known team/status.
        test_rosters = pd.DataFrame({'playerId': predictions['playerId']})
        for col in ROSTERS_COLS:
            if col == 'playerId': continue
            elif col == 'date':
                test_rosters['date'] = test_date
            else:
                test_rosters[col] = np.nan

    # --- awards ---
    awards = unpack_col_to_df(test_df.awards)
    if not awards.empty:
        awards = awards.rename(columns={"awardDate": "date"})
        awards_features = engineer_awards(awards)
        awards_features = reduce_mem_usage(awards_features)
    else:
        awards_features = pd.DataFrame(columns=AWARDS_COLS, dtype="float16")

    # --- twitter followers: append today's data, then use the latest month ---
    test_player_twitter_followers = unpack_col_to_df(test_df.playerTwitterFollowers)
    if not test_player_twitter_followers.empty:
        test_player_twitter_followers = engineer_player_twitter(test_player_twitter_followers)
        player_twitter_followers = pd.concat([player_twitter_followers, test_player_twitter_followers])

    test_team_twitter_followers = unpack_col_to_df(test_df.teamTwitterFollowers)
    if not test_team_twitter_followers.empty:
        # Team data must go through the team variant; the player variant
        # selects player columns that a team frame does not have.
        test_team_twitter_followers = engineer_team_twitter(test_team_twitter_followers)
        team_twitter_followers = pd.concat([team_twitter_followers, test_team_twitter_followers])

    year_player = player_twitter_followers.year.max()
    month_player = player_twitter_followers[player_twitter_followers.year.eq(year_player)].month.max()
    year_team = team_twitter_followers.year.max()
    month_team = team_twitter_followers[team_twitter_followers.year.eq(year_team)].month.max()

    player_date_mask = player_twitter_followers.year.eq(year_player) & player_twitter_followers.month.eq(month_player)
    team_date_mask = team_twitter_followers.year.eq(year_team) & team_twitter_followers.month.eq(month_team)
    test_player_twitter_followers = player_twitter_followers[player_date_mask][["playerId", "player_followers_depth"]]
    test_team_twitter_followers = team_twitter_followers[team_date_mask][["teamId", "team_followers_depth"]]

    # --- roster features for the prediction date ---
    rosters_features = rosters_features[rosters_features.date <= current_date]
    rosters_features = pd.concat([rosters_features, test_rosters]).reset_index(drop=True)
    rosters_features[["statusCode", "teamId"]] = rosters_features.groupby("playerId")[["statusCode", "teamId"]].ffill()
    test_rosters_features = rosters_features[rosters_features.date.eq(test_date)].drop(columns=["date"]).copy()

    # --- transactions ---
    test_transactions = unpack_col_to_df(test_df.transactions)
    if test_transactions.empty:
        transactions_features = pd.DataFrame(columns=["playerId"] + TRANSACTIONS_COLS)
    else:
        transactions_features = process_transactions(test_transactions)

    # --- box-score features ---
    box_scores_agg = box_scores_agg[box_scores_agg.date <= current_date]
    if test_box_scores_agg is not None:
        box_scores_agg = pd.concat([box_scores_agg, test_box_scores_agg])
    # engineer_player_box_scores sorts by (playerId, date) and re-indexes from
    # 0, and the concat below aligns on the index. Sort and re-index this frame
    # the same way first so battingOrder/home/positionCode stay on their own
    # rows. Assumes (playerId, date) pairs are unique.
    box_scores_agg = box_scores_agg.sort_values(["playerId", "date"], ignore_index=True)
    box_scores_features = pd.concat([engineer_player_box_scores(box_scores_agg),
                                     box_scores_agg[BOX_SCORES_NUMERIC_COLS],
                                     box_scores_agg[BOX_SCORES_CATEGORICAL_COLS]], axis=1)

    # --- label history plus the rows to predict, then lagged target features ---
    data = data[data.date <= current_date]
    data["year"] = data.date.dt.year.astype("uint16")
    data = pd.concat([data, predictions]).reset_index(drop=True)
    data = data.pipe(engineer_moving_averages).pipe(engineer_seasonality)

    # --- assemble the feature frame, one row per requested prediction ---
    test_data = data.loc[data.date.eq(test_date)].copy()
    test_data = predictions.merge(
        test_data.merge(box_scores_features, on=["date", "playerId"], how="left"),
        on=["date", "playerId"],
        how="left",
        validate="1:1"
    )
    test_data = process_players(test_data.merge(players_processed, on=["playerId"], how="left", validate="1:1"))
    test_data = test_data.merge(seasons_features, on=["date"], how="left", validate="m:1")
    test_data = test_data.merge(historical_awards_features, on=["playerId"], how="left", validate="1:1")
    test_data = test_data.merge(awards_features, on=["playerId"], how="left", validate="1:1")
    test_data = test_data.merge(transactions_features, on=["playerId"], how="left", validate="1:1")
    test_data = test_data.merge(test_rosters_features, on=["playerId"], how="left", validate="1:1")
    test_data = test_data.merge(test_player_twitter_followers, on=["playerId"], how="left", validate="m:1")
    test_data = test_data.merge(test_team_twitter_followers, on=["teamId"], how="left", validate="m:1")
    test_data = test_data.merge(teams, on=["teamId"], how="left", validate="m:1")

    test_data[TRANSACTIONS_COLS] = test_data[TRANSACTIONS_COLS].fillna(0)
    test_data["statusCode"] = pd.Categorical(test_data["statusCode"])
    test_data["leagueId"] = pd.Categorical(test_data["leagueId"])
    test_data["divisionId"] = pd.Categorical(test_data["divisionId"])

    test_data = test_data.drop(columns=["year"])

    X_test = test_data[FEATURES_COLS]

    # --- predict, clip to [0, 100], and feed predictions back as history ---
    # Writing the clipped predictions into `data` lets the next iteration's
    # rolling features use them in place of unknown labels.
    test_rows = data.date.eq(test_date)
    for target_col, model in target_models.items():
        clipped = np.clip(model.predict(X_test), 0, 100)
        sample_prediction_df[target_col] = clipped
        data.loc[test_rows, target_col] = clipped

    sample_prediction_df = sample_prediction_df.fillna(0)
    data = data.fillna(0)

    env.predict(sample_prediction_df)