## Notebook Setup

In [None]:
#### Import Python Libraries and Set Script Options ####
import numpy as np
import pandas as pd

# Plotly libraries
import plotly as pl
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

# Library for interactive Python widgets
import ipywidgets as widgets

# Utility libraries
import gc
from io import StringIO
from pathlib import Path

# Initialise plotly's notebook mode so figures are rendered offline in the notebook
pyo.init_notebook_mode()

# Allow up to 100 characters per column value when data frames are displayed
pd.set_option('display.max_colwidth', 100)

# Print the full path of every file found under the Kaggle input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

## Read in Kaggle Data Files

#### Read in MLB Player Digital Engagement Forecasting Data from CSVs into pandas DFs

In [None]:
# Directory that holds the competition's input files
input_file_path = Path('/kaggle/input/mlb-player-digital-engagement-forecasting/')

# Table pairing each CSV file to read with the name of the data frame it becomes.
# This does NOT include the large 'train' data set (read in separately).
# 'awards.csv' is stored as 'awards_pre2018' rather than 'awards' - presumably so it
# stays separate from the daily 'awards' table used later in the notebook.
csv_and_df_names = pd.DataFrame(data = {
  'csv_name': ['seasons', 'teams', 'players', 'awards',
    'example_test', 'example_sample_submission'],
  'df_name': ['seasons', 'teams', 'players', 'awards_pre2018',
    'example_test', 'example_sample_submission'] 
  })

# Set up for tabbed output: one Output widget (tab) per data frame to be read
kaggle_data_tabs = widgets.Tab()
kaggle_data_tabs.children = [widgets.Output() for _ in csv_and_df_names['df_name']]

for index, row in csv_and_df_names.iterrows():

    csv_name = row['csv_name']
    df_name = row['df_name']

    # Read the CSV once and keep a direct reference to the resulting frame
    loaded_df = pd.read_csv(input_file_path / f"{csv_name}.csv")

    # Publish the frame under its configured name so later cells can use it
    globals()[df_name] = loaded_df

    # Set tab title to df name
    kaggle_data_tabs.set_title(index, df_name)

    # Show the frame inside its tab (no eval() of the name is needed)
    with kaggle_data_tabs.children[index]:
        display(loaded_df)

display(kaggle_data_tabs)

#### Read in Training Data CSV into pandas DF

In [None]:
# Read the full daily training table
train = pd.read_csv(input_file_path / 'train.csv')

# Convert training data date field to pandas datetime type (parsed as YYYYMMDD)
train['date'] = pd.to_datetime(train['date'], format = "%Y%m%d")

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the output
train.info()

display(train)

#### Unnest and look at data from each of the nested data frames within the daily data

In [None]:
# Get names of all "nested" data frames in daily training set
# (every column of the training table other than 'date')
daily_data_nested_df_names = train.drop('date', axis = 1).columns.values.tolist()

for df_name in daily_data_nested_df_names:
    # Keep only the dates on which this nested table is present
    date_nested_table = (
        train[['date', df_name]]
        .dropna(subset = [df_name])
        .reset_index(drop = True)
    )

    daily_dfs_collection = []

    # Iterate over the two columns directly; this avoids building a Series per row
    for daily_date, daily_json in zip(date_nested_table['date'],
                                      date_nested_table[df_name]):
        # Wrap the JSON text in StringIO: passing a literal JSON string to
        # read_json is deprecated in recent pandas, while a file-like object
        # is accepted by old and new versions alike
        daily_df = pd.read_json(StringIO(daily_json))

        daily_df['dailyDataDate'] = daily_date

        # append() extends the list in place instead of copying it every iteration
        daily_dfs_collection.append(daily_df)

    # Concatenate all daily dfs into a single df for this nested table
    unnested_table = (
        pd.concat(daily_dfs_collection, ignore_index = True)
        # Set and reset index to move 'dailyDataDate' to front of df
        .set_index('dailyDataDate')
        .reset_index()
    )

    # Creates 1 pandas df per unnested df from daily data read in, with same name
    globals()[df_name] = unnested_table

    # Clean up tables and collection of daily data frames for this df
    del date_nested_table, daily_dfs_collection, unnested_table

# Set up for tabbed output: one Output widget (tab) per unnested data frame
daily_data_unnested_tabs = widgets.Tab()
daily_data_unnested_tabs.children = [
    widgets.Output() for _ in daily_data_nested_df_names]

for index, df_name in enumerate(daily_data_nested_df_names):
    # Rename tab bar titles to df names
    daily_data_unnested_tabs.set_title(index, df_name)

    # Display the frame stored under this name; it is looked up in globals()
    # (where it was stored) rather than by eval()-ing the name as code
    with daily_data_unnested_tabs.children[index]:
        display(globals()[df_name])

display(daily_data_unnested_tabs)

In [None]:
# The training table has been extracted into separate data frames above, so remove
# it and run a garbage collection pass to clear memory
del train

gc.collect()

## Exploratory Analysis of Target Variables

#### Look at some distribution summary values for each target

In [None]:
# Reshape to long format: 1 row per player-date per target, with the target name
# in 'target' and its value in the default 'value' column
target_columns = ['target1', 'target2', 'target3', 'target4']

player_engagement_targets_melted = nextDayPlayerEngagement.melt(
    id_vars = ['dailyDataDate', 'playerId'],
    value_vars = target_columns,
    var_name = 'target'
)

# Calculate some distribution summary values by target.
# Mean, standard deviation and median are requested by name ('mean', 'std',
# 'median') rather than by passing np.mean / np.std / np.median: pandas already
# substituted its own groupby methods for those callables, and recent versions
# warn that this substitution will stop. The string aliases keep the same results
# ('std' is the sample standard deviation, ddof = 1).
player_engagement_data_summary_by_target = (
    player_engagement_targets_melted
    .groupby(['target'], as_index = False)
    .agg(
        count = ('value', 'count'),
        # Mean and standard deviation
        mean = ('value', 'mean'),
        stdDev = ('value', 'std'),
        # A few percentiles of interest (including median)
        pctle10 = ('value', lambda x: np.percentile(x, q = 10)),
        pctle25 = ('value', lambda x: np.percentile(x, q = 25)),
        median = ('value', 'median'),
        pctle75 = ('value', lambda x: np.percentile(x, q = 75)),
        pctle90 = ('value', lambda x: np.percentile(x, q = 90)),
        # Percentage of all target values equal to min (0) or max (100)
        pctValues0 = ('value', lambda x: np.mean(x == 0) * 100),
        pctValues100 = ('value', lambda x: np.mean(x == 100) * 100)
    )
)

display(player_engagement_data_summary_by_target.round(decimals = 3))

#### Interactive distribution plot for each prediction target over all training data ####

In [None]:
# Number of decimal places target values are rounded to before being binned
ROUND_DECIMALS = 0

# Add a rounded copy of each target value to serve as the distribution bin
player_engagement_targets_melted['roundedValue'] = (
    player_engagement_targets_melted['value'].round(ROUND_DECIMALS))

# Count player-dates in each (target, rounded value) bin
player_engagement_targets_dist = (
    player_engagement_targets_melted
    .groupby(['target', 'roundedValue'], as_index = False)
    .agg(numPlayerDates = ('playerId', 'count'))
)

# Running total of player-dates within each target, in bin order
running_player_dates = (
    player_engagement_targets_dist
    .groupby(['target'])['numPlayerDates']
    .cumsum()
)
player_engagement_targets_dist['cumPlayerDates'] = running_player_dates

# Express the running total as a percentage of the total # of player-dates
# (rows) in the original engagement data set
total_player_dates = nextDayPlayerEngagement.shape[0]
player_engagement_targets_dist['cumPctPlayerDates'] = (
    player_engagement_targets_dist['cumPlayerDates'] / total_player_dates
    ) * 100
    
# Human-readable axis/hover labels for the distribution columns
dist_plot_labels = {
    'roundedValue': 'Rounded Target Value',
    'numPlayerDates': '# of Player-Dates',
    'cumPlayerDates': 'Cumulative # of Player-Dates',
    'cumPctPlayerDates': 'Cumulative % of Player-Dates'
}

# Bar chart of player-date counts per rounded value, one facet row per target;
# every column of the distribution table is included in the hover text
player_engagement_targets_dist_plot = px.bar(
    player_engagement_targets_dist,
    x = 'roundedValue',
    y = 'numPlayerDates',
    facet_row = 'target',
    hover_data = player_engagement_targets_dist.columns,
    labels = dist_plot_labels,
    title = 'Target Value Distributions Across Player-Dates',
    width = 900,
    height = 900
)

pyo.iplot(player_engagement_targets_dist_plot)

#### Look at correlations among target metrics across player-dates

In [None]:
# Pairwise correlation matrix of the four targets across all player-dates
correlation_target_columns = ['target1', 'target2', 'target3', 'target4']

player_engagement_targets_correlations = (
    nextDayPlayerEngagement[correlation_target_columns].corr())

display(player_engagement_targets_correlations.round(decimals = 3))

In [None]:
# The large melted player engagement data frame is not needed past this point;
# remove it and collect garbage to clear memory
del player_engagement_targets_melted

gc.collect()

## Exploratory Analysis and Preparation of Data for Potential Predictors of Digital Engagement

### Dates Relative to Season

#### Get some information on each date in daily data (using season dates of interest)

In [None]:
# One row per distinct date present in the daily engagement data
dates = pd.DataFrame(
    {'dailyDataDate': nextDayPlayerEngagement['dailyDataDate'].unique()})

# Calendar year and month of each date
daily_date_parts = dates['dailyDataDate'].dt
dates['year'] = daily_date_parts.year
dates['month'] = daily_date_parts.month

# Attach the season row whose seasonId equals the date's calendar year
dates_with_info = dates.merge(
    seasons,
    left_on = 'year',
    right_on = 'seasonId'
)

# Count anything between regular season start and Postseason end as "in season".
# Both endpoints are included. 'both' is used instead of the boolean True because
# pandas 2.x rejects boolean values for `inclusive`; older versions that only
# test the argument for truthiness treat the non-empty string as inclusive too.
dates_with_info['inSeason'] = (
  dates_with_info['dailyDataDate'].between(
    dates_with_info['regularSeasonStartDate'],
    dates_with_info['postSeasonEndDate'],
    inclusive = 'both'
    )
  )

# Separate dates into different parts of MLB season.
# np.select uses the FIRST condition that is true for each row, so the conditions
# are listed in chronological order and each one only needs an upper bound.
daily_date = dates_with_info['dailyDataDate']

season_part_conditions = [
    daily_date < dates_with_info['preSeasonStartDate'],
    daily_date < dates_with_info['regularSeasonStartDate'],
    daily_date <= dates_with_info['lastDate1stHalf'],
    daily_date < dates_with_info['firstDate2ndHalf'],
    daily_date <= dates_with_info['regularSeasonEndDate'],
    daily_date < dates_with_info['postSeasonStartDate'],
    daily_date <= dates_with_info['postSeasonEndDate'],
    daily_date > dates_with_info['postSeasonEndDate']
]

season_part_labels = [
    'Offseason',
    'Preseason',
    'Reg Season 1st Half',
    'All-Star Break',
    'Reg Season 2nd Half',
    'Between Reg and Postseason',
    'Postseason',
    'Offseason'
]

# Rows matching no condition get None (a real missing value). A float NaN default
# cannot be combined cleanly with string labels: older NumPy turns it into the
# text 'nan', and newer NumPy raises a TypeError because str and float have no
# common dtype.
dates_with_info['seasonPart'] = np.select(
    season_part_conditions,
    season_part_labels,
    default = None
)

# Keep only the date-level fields needed later, exposing seasonId as 'season'
season_part_columns = ['dailyDataDate', 'year', 'seasonId', 'month',
                       'inSeason', 'seasonPart']

dates_with_season_part = (
    dates_with_info[season_part_columns]
    .rename(columns = {'seasonId': 'season'})
)

display(dates_with_season_part)

### Rosters

#### Look at different roster status values and frequency in data

In [None]:
# Count roster rows for each status code/description pair
roster_status_counts = (
    rosters
    .groupby(['statusCode', 'status'], as_index = False)
    .agg(numPlayerDates = ('playerId', 'count'))
)

# Most frequent statuses first, with a fresh 0..n-1 index
roster_status_values = roster_status_counts.sort_values(
    ['numPlayerDates'],
    ascending = False,
    ignore_index = True
)

display(roster_status_values)

#### Check for any cases of multiple roster rows per player-date
If output below has 0 rows, can proceed without worrying about duplicate player-date roster rows.

In [None]:
# Number of roster rows for every player-date
roster_entries_per_player_date = (
    rosters
    .groupby(['dailyDataDate', 'playerId'], as_index = False)
    .agg(numPlayerDateRosterEntries = ('playerId', 'count'))
)

# Keep only player-dates that appear more than once in the rosters table
player_dates_multiple_roster_entries = roster_entries_per_player_date[
    roster_entries_per_player_date['numPlayerDateRosterEntries'] > 1]

display(player_dates_multiple_roster_entries)

### Team Games and Game Stats

#### Turn games table into 1 row per team game, then merge with team box scores

In [None]:
# Game types kept for stats (regular/Postseason & All-Star games)
stat_game_types = ['R', 'F', 'D', 'L', 'W', 'C', 'P', 'A']

# Keep games of those types that are marked "final" in the games table
is_stat_game_type = np.isin(games['gameType'], stat_game_types)
is_final_game = games['codedGameState'] == 'F'
games_for_stats = games[is_stat_game_type & is_final_game]


def _games_from_perspective(games_df, home_label, away_label, is_home_team):
    """Return a copy of games_df renamed to one team's point of view.

    Every 'home' in a column name is replaced by home_label and then every
    'away' by away_label; an 'isHomeTeam' column holding is_home_team is added.
    """
    perspective_df = games_df.copy()
    perspective_df.columns = [
        col_value.replace('home', home_label).replace('away', away_label)
        for col_value in perspective_df.columns.values]
    perspective_df['isHomeTeam'] = is_home_team
    return perspective_df


# Home team perspective: "home" columns become "team", "away" columns become "opp"
games_home_perspective = _games_from_perspective(games_for_stats, 'team', 'opp', 1)

# Away team perspective: "home" columns become "opp", "away" columns become "team"
games_away_perspective = _games_from_perspective(games_for_stats, 'opp', 'team', 0)

# Stack both perspectives to get a df w/ 1 row per team game
team_games = pd.concat(
    [games_home_perspective, games_away_perspective],
    ignore_index = True
)

# Columns that identify the game/team and therefore keep their original names
team_game_key_columns = ['dailyDataDate', 'home', 'teamId', 'gamePk',
                         'gameDate', 'gameTimeUTC']

# Work on a renamed copy of the team box scores: every other column gets a 'Team'
# suffix so team stats can be told apart from individual player stats if/when
# they are joined later
team_game_stats = teamBoxScores.rename(
    columns = lambda col_value: (
        col_value if col_value in team_game_key_columns else col_value + 'Team'))

# Drop some fields that are already present in team_games table
team_game_stats_for_merge = team_game_stats.drop(
    ['home', 'gameTimeUTC'], axis = 1)

# Merge games table with team game stats.
# Doing this as 'inner' join excludes spring training games, postponed games,
# etc. from original games table, but this may be fine for purposes here
team_games_with_stats = team_games.merge(
    team_game_stats_for_merge,
    on = ['dailyDataDate', 'gamePk', 'gameDate', 'teamId'],
    how = 'inner'
)

display(team_games_with_stats)

#### Aggregate team game-level stats to daily date (accounts for multiple games per day)
If 1st output below has 0 rows, can proceed without worrying about duplicate team dates.

In [None]:
# Count distinct game types and opponents per team-date. If each is always 1,
# opponent and gameType can be used as grouping keys below w/o producing
# multiple rows per team-date
team_date_gameTypes_opps_agg = (
    team_games_with_stats
    .groupby(['dailyDataDate', 'teamId'], as_index = False)
    .agg(
        numGameTypes = ('gameType', 'nunique'),
        numOppIds = ('oppId', 'nunique'),
        numOppNames = ('oppName', 'nunique')
    )
)

# Can proceed w/o worrying about duplicate team-dates as long as this shows 0 rows
not_single_game_type = team_date_gameTypes_opps_agg['numGameTypes'] != 1
not_single_opp_id = team_date_gameTypes_opps_agg['numOppIds'] != 1
not_single_opp_name = team_date_gameTypes_opps_agg['numOppNames'] != 1

display(team_date_gameTypes_opps_agg[
    not_single_game_type | not_single_opp_id | not_single_opp_name])

# Aggregate game-level results to the team-date level (covers multiple games/day)
team_date_keys = ['dailyDataDate', 'teamId', 'gameType', 'oppId', 'oppName']

team_date_stats_agg = (
    team_games_with_stats
    .groupby(team_date_keys, as_index = False)
    .agg(
        numGamesTeam = ('gamePk', 'nunique'),
        winsTeam = ('teamWinner', 'sum'),
        lossesTeam = ('oppWinner', 'sum'),
        runsScoredTeam = ('teamScore', 'sum'),
        runsAllowedTeam = ('oppScore', 'sum')
    )
)

display(team_date_stats_agg)

### Player Game Stats

#### Add some stats/info to player game level stats, look at notable results

In [None]:
# Work on a renamed copy of the player box scores; team Id/name are renamed to
# reflect that they come from the player's game, not from the roster
player_game_stats = playerBoxScores.rename(
    columns = {'teamId': 'gameTeamId', 'teamName': 'gameTeamName'})

# Innings pitched as a true fraction (better for aggregation): the decimal part
# of 'inningsPitched' is multiplied by 10/3, so .1 becomes 1/3 and .2 becomes 2/3
innings_pitched = player_game_stats['inningsPitched']
whole_innings = np.floor(innings_pitched)
partial_innings = (innings_pitched - whole_innings) * 10/3

player_game_stats['inningsPitchedAsFrac'] = np.where(
    pd.isna(innings_pitched),
    np.nan,
    whole_innings + partial_innings
)

# Tom Tango pitching game score (https://www.mlb.com/glossary/advanced-stats/game-score)
raw_pitching_game_score = (
    40
    + 2 * player_game_stats['outsPitching']
    + 1 * player_game_stats['strikeOutsPitching']
    - 2 * player_game_stats['baseOnBallsPitching']
    - 2 * player_game_stats['hitsPitching']
    - 3 * player_game_stats['runsPitching']
    - 6 * player_game_stats['homeRunsPitching']
)

# Game score doesn't apply if player didn't pitch (missing or 0 pitches thrown)
did_not_pitch = (
    pd.isna(player_game_stats['pitchesThrown']) |
    (player_game_stats['pitchesThrown'] == 0)
)

player_game_stats['pitchingGameScore'] = np.where(
    did_not_pitch,
    np.nan,
    raw_pitching_game_score
)

# Ten highest pitching game scores in span of data, with their component stats
game_score_display_columns = [
    'gameDate', 'playerName', 'gameTeamName', 'outsPitching',
    'strikeOutsPitching', 'baseOnBallsPitching', 'hitsPitching',
    'runsPitching', 'homeRunsPitching', 'pitchingGameScore']

player_game_top_pitching_game_scores = (
    player_game_stats[game_score_display_columns]
    .sort_values(['pitchingGameScore'], ascending = False, ignore_index = True)
    .head(n = 10)
)

print('Top Pitching Game Scores in Span of Data')
display(player_game_top_pitching_game_scores)

# Individual no-hitter (one pitcher, not several combined): a complete game of
# at least 9 innings pitched with 0 hits allowed
is_individual_no_hitter = (
    (player_game_stats['completeGamesPitching'] == 1) &
    (player_game_stats['inningsPitched'] >= 9) &
    (player_game_stats['hitsPitching'] == 0)
)

player_game_stats['noHitter'] = np.where(is_individual_no_hitter, 1, 0)

no_hitter_display_columns = [
    'gameDate', 'playerName', 'gameTeamName', 'completeGamesPitching',
    'inningsPitched', 'hitsPitching', 'noHitter', 'pitchingGameScore']

# Flagged games, most recent first
player_game_no_hitters = (
    player_game_stats
    .loc[player_game_stats['noHitter'] == 1, no_hitter_display_columns]
    .sort_values(['gameDate'], ascending = False, ignore_index = True)
)

print('Individual No-Hitters in Span of Data')
display(player_game_no_hitters)
# Can check vs MLB official list: https://www.mlb.com/news/no-hitter-c265779246

#### Aggregate player game-level stats to daily date (accounts for multiple games per day)

In [None]:
# Group player game rows by player-date (a player can have several games in a day)
player_date_keys = ['dailyDataDate', 'playerId']
games_by_player_date = player_game_stats.groupby(player_date_keys, as_index = False)

# Aggregations that are not simple sums
player_date_game_info = games_by_player_date.agg(
    numGames = ('gamePk', 'nunique'),
    # Should be 1 team per player per day, but adding here for 1 exception:
    # playerId 518617 (Jake Diekman) had 2 games for different teams marked
    # as played on 5/19/19, due to resumption of game after he was traded
    numTeams = ('gameTeamId', 'nunique'),
    # Should be only 1 team for all player-dates, taking min to make sure
    gameTeamId = ('gameTeamId', 'min'),
    gameTeamName = ('gameTeamName', 'min')
)

# Player stats that can be summed at date/player level
summable_stat_columns = [
    # Stats as hitter/baserunner
    'gamesPlayedBatting', 'runsScored', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'hits', 'hitByPitch', 'atBats',
    'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
    'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
    # Stats as pitcher
    'gamesPlayedPitching', 'gamesStartedPitching', 'completeGamesPitching',
    'shutoutsPitching', 'winsPitching', 'lossesPitching', 'runsPitching',
    'homeRunsPitching', 'strikeOutsPitching', 'baseOnBallsPitching',
    'hitsPitching', 'earnedRuns', 'battersFaced', 'outsPitching',
    'pitchesThrown', 'balls', 'strikes', 'saves', 'holds', 'blownSaves',
    'inningsPitchedAsFrac', 'pitchingGameScore', 'noHitter',
    # Stats as fielder (quite basic)
    'assists', 'putOuts', 'errors'
]

# NOTE: sum() skips missing values and returns 0 for a group whose values are all
# missing, so e.g. pitchingGameScore is 0 (not NaN) on dates a player did not pitch
player_date_stat_sums = games_by_player_date[summable_stat_columns].sum()

# Combine both aggregations into 1 row per player-date
player_date_stats_agg = player_date_game_info.merge(
    player_date_stat_sums,
    on = player_date_keys,
    how = 'inner'
)

display(player_date_stats_agg)

### Notable In-Game Events

#### Add some event-based stats that look at context on each play

In [None]:
# Scheduled length of each game (helps w/ some calculations), 1 row per
# distinct gamePk/scheduledInnings pair
game_scheduled_innings = games[['gamePk', 'scheduledInnings']].drop_duplicates()

# Attach scheduled innings to every event of the game
events_plus = events.merge(
    game_scheduled_innings,
    on = ['gamePk'],
    how = 'left'
)

# Current score from batting & pitching team perspectives: in the bottom half
# of an inning the home score is the batting team's, otherwise the away score is
is_bottom_half = events_plus['halfInning'] == 'bottom'

events_plus['battingTeamScore'] = np.where(
    is_bottom_half, events_plus['homeScore'], events_plus['awayScore'])

events_plus['pitchingTeamScore'] = np.where(
    is_bottom_half, events_plus['awayScore'], events_plus['homeScore'])

# Flag pitches with a start speed of at least 100
is_100mph_pitch = (
    (events_plus['type'] == 'pitch') & (events_plus['startSpeed'] >= 100))
events_plus['pitches100mph'] = np.where(is_100mph_pitch, 1, 0)

# Flag home runs with a total distance of at least 450
is_450ft_home_run = (
    (events_plus['event'] == 'Home Run') & (events_plus['totalDistance'] >= 450))
events_plus['HRDist450ft'] = np.where(is_450ft_home_run, 1, 0)

# Use game context/score logic to add fields for notable in-game events.
# All RBI flags only consider events that end a plate appearance with an RBI.
is_rbi_plate_appearance = (events_plus['isPaOver'] == 1) & (events_plus['rbi'] > 0)

batting_team_score = events_plus['battingTeamScore']
pitching_team_score = events_plus['pitchingTeamScore']
batting_score_plus_rbi = batting_team_score + events_plus['rbi']

# Game-tying RBI: batting team starts behind and adding the RBI ties the score
events_plus['gameTyingRBI'] = np.where(
    is_rbi_plate_appearance &
    (batting_team_score < pitching_team_score) &
    (batting_score_plus_rbi == pitching_team_score),
    1, 0)

# Go-ahead RBI: batting team starts not ahead (can be tied) and adding the RBI
# puts it ahead
events_plus['goAheadRBI'] = np.where(
    is_rbi_plate_appearance &
    (batting_team_score <= pitching_team_score) &
    (batting_score_plus_rbi > pitching_team_score),
    1, 0)

# Walk-off (game-winning, game-ending) RBI: a go-ahead RBI in the bottom half
# of the last scheduled inning or later
is_walkoff_rbi = (
    (events_plus['inning'] >= events_plus['scheduledInnings']) &
    (events_plus['halfInning'] == 'bottom') &
    (events_plus['goAheadRBI'] == 1)
)
events_plus['walkoffRBI'] = np.where(is_walkoff_rbi, 1, 0)

added_events_fields = ['pitches100mph', 'HRDist450ft', 'gameTyingRBI',
                       'goAheadRBI', 'walkoffRBI']

# Count overall frequency of added events
event_counts = events_plus[added_events_fields].sum()

display(event_counts)

#### Aggregate player event-based stats to player-date level

In [None]:
# Event counts credited to the pitcher, by date
pitcher_date_events_agg = (
    events_plus
    .groupby(['dailyDataDate', 'pitcherId'], as_index = False)
    .agg(
        pitches100mph = ('pitches100mph', 'sum'),
        walkoffRBIAllowed = ('walkoffRBI', 'sum')
    )
)

# Event counts credited to the hitter, by date (every added field except the
# pitch-speed one)
hitter_event_fields = [
    field for field in added_events_fields if field != 'pitches100mph']

hitter_date_events_agg = (
    events_plus
    .groupby(['dailyDataDate', 'hitterId'], as_index = False)[hitter_event_fields]
    .sum()
)

# NAs on events fields can be turned to 0 (no such stats in those categories)
event_zero_fills = dict.fromkeys(added_events_fields + ['walkoffRBIAllowed'], 0)

# Outer merge keeps player-dates that appear only as pitcher or only as hitter
player_date_events_agg = (
    pitcher_date_events_agg.rename(columns = {'pitcherId': 'playerId'})
    .merge(
        hitter_date_events_agg.rename(columns = {'hitterId': 'playerId'}),
        on = ['dailyDataDate', 'playerId'],
        how = 'outer'
    )
    .fillna(event_zero_fills)
)

display(player_date_events_agg)

#### Merge date-level player game stats w/ date-level player stats from events

In [None]:
# Event fields whose NAs are set to 0 after the merge (assumed 0 since the
# player has game stats for the date but none of these events)
merged_event_zero_fills = dict.fromkeys(
    added_events_fields + ['walkoffRBIAllowed'], 0)

# Keep every player-date with game stats and attach its event counts, if any
player_date_stats_events_agg = (
    player_date_stats_agg
    .merge(
        player_date_events_agg,
        on = ['dailyDataDate', 'playerId'],
        how = 'left'
    )
    .fillna(merged_event_zero_fills)
)

display(player_date_stats_events_agg)

In [None]:
# These tables are no longer needed past this point; remove them and collect
# garbage to clear up memory
del events, events_plus, player_date_stats_agg
del pitcher_date_events_agg, hitter_date_events_agg, player_date_events_agg

gc.collect()

### Transactions

#### Look at different transaction type values and frequency in data

In [None]:
# Count transactions for each type code/description pair
transaction_type_counts = (
    transactions
    .groupby(['typeCode', 'typeDesc'], as_index = False)
    .agg(numTransactions = ('date', 'count'))
)

# Most frequent transaction types first, with a fresh 0..n-1 index
transaction_type_values = transaction_type_counts.sort_values(
    ['numTransactions'],
    ascending = False,
    ignore_index = True
)

display(transaction_type_values)

#### Create data frame on player-date level, w/ 1 column per transaction type


In [None]:
# Transaction type descriptions of interest, picked from the list above
transaction_types_of_interest = ['Assigned', 'Signed as Free Agent', 
  'Status Change', 'Optioned', 'Recalled', 'Signed', 'Selected',
  'Trade', 'Designated for Assignment']

# Type description w/o spaces and w/ initial lower case, for later field names
type_descs_without_spaces = transactions['typeDesc'].str.replace(' ', '')

transactions_labeled = transactions.assign(
    typeDescNoSpace = [
        (type_desc[0].lower() + type_desc[1:])
        for type_desc in type_descs_without_spaces],
    # Add count ahead of pivot
    count = 1
)

# Rows of the desired transaction types that belong to actual players
is_player_transaction_of_interest = (
    np.isin(transactions['typeDesc'], transaction_types_of_interest) &
    pd.notna(transactions['playerId'])
)

player_date_transactions_wide = (
    transactions_labeled
    .loc[is_player_transaction_of_interest,
         ['dailyDataDate', 'playerId', 'typeDescNoSpace', 'count']]
    # Keep each transaction type at most once per player-date
    .drop_duplicates()
    # Pivot data to 1 row per player-date and 1 column per transaction type
    .pivot_table(
        index = ['dailyDataDate', 'playerId'],
        columns = 'typeDescNoSpace',
        values = 'count',
        # NA can be turned to 0 since it means player didn't have that transaction that day
        fill_value = 0
    )
    .reset_index()
)

display(player_date_transactions_wide)

### Team Standings

#### Prepare team standings table for merge w/ player digital engagement data

If 1st output below has 0 rows, can proceed without worrying about duplicate team-date standings rows.

In [None]:
# Number of standings rows for every team-date
standings_entries_per_team_date = (
    standings
    .groupby(['dailyDataDate', 'teamId'], as_index = False)
    .agg(numTeamDateStandingsEntries = ('teamId', 'count'))
)

# If this has 0 rows, standings can be joined to other daily data w/o worrying
# about duplicates
team_dates_multiple_standings_entries = standings_entries_per_team_date[
    standings_entries_per_team_date['numTeamDateStandingsEntries'] > 1]

display(team_dates_multiple_standings_entries)

# Pick only certain fields of interest from standings for merge
standings_fields_of_interest = ['dailyDataDate', 'teamId', 'streakCode',
    'divisionRank', 'leagueRank', 'wildCardRank', 'pct']

standings_selected_fields = (
    standings[standings_fields_of_interest]
    .rename(columns = {'pct': 'winPct'})
)

# Add a 'Team' suffix to every non-key column to reflect these are all "team"
# standings - helps to differentiate from player-related fields if/when joining
standings_key_columns = ['dailyDataDate', 'teamId']

standings_selected_fields = standings_selected_fields.rename(
    columns = lambda col_value: (
        col_value if col_value in standings_key_columns else col_value + 'Team'))

# Numeric streak length: the streak code with its 'W'/'L' letters removed
streak_code = standings_selected_fields['streakCodeTeam']
streak_length = (
    streak_code
    .str.replace('W', '')
    .str.replace('L', '')
    .astype(float)
)

# Separate winning and losing streaks based on the first letter of the streak
# code; the streak that doesn't apply is left as NaN
standings_selected_fields['winStreakTeam'] = np.where(
    streak_code.str[0] == 'W', streak_length, np.nan)

standings_selected_fields['lossStreakTeam'] = np.where(
    streak_code.str[0] == 'L', streak_length, np.nan)

# The streak code is no longer necessary w/ the derived values in place
standings_selected_fields = standings_selected_fields.drop(
    ['streakCodeTeam'], axis = 1)

display(standings_selected_fields)

### Awards

#### Look at various awards/honors received by players in daily data & before

In [None]:
# Awards from the daily data, limited to those involving tracked players
daily_awards_tracked_players = awards[
    np.isin(awards['playerId'], players['playerId'])]

# Pre-2018 awards, w/ the award date parsed into a daily data date
pre2018_awards_with_date = awards_pre2018.assign(
    dailyDataDate = pd.to_datetime(awards_pre2018['awardDate'],
                                   format = '%Y-%m-%d'))

# Stack both sources and list the most recent award dates first
player_awards_all = (
    pd.concat(
        [daily_awards_tracked_players, pre2018_awards_with_date],
        ignore_index = True
    )
    .sort_values(['awardDate'], ascending = False, ignore_index = True)
)

display(player_awards_all)

# Per award: number of rows in this player set and most recent date/season
awards_summary = (
    player_awards_all
    .groupby(['awardId', 'awardName'], as_index = False)
    .agg(
        numWinnersInThisPlayerSet = ('playerId', 'count'),
        mostRecAwardDate = ('awardDate', 'max'),
        mostRecAwardSeason = ('awardSeason', 'max')
    )
    .sort_values(
        ['mostRecAwardDate', 'awardId'],
        ascending = [False, True],
        ignore_index = True
    )
)

display(awards_summary)

#### Limit down to certain award categories, get players' running tallies by date

In [None]:
# Award Ids to keep, each mapped to the broader category it is counted under
selected_awards = pd.DataFrame({
    'awardId':  ['ALAS', 'NLAS', 'ALMVP', 'NLMVP', 'ALCY', 'NLCY'],
    'awardCategory': ['AllStar', 'AllStar', 'MVP', 'MVP', 'CyYoung', 'CyYoung']
})

# Inner join to limit player awards to only selected ones
player_selected_awards = player_awards_all.merge(
    selected_awards,
    on = 'awardId',
    how = 'inner'
)

# Award categories that actually occur in the data
selected_award_categories_in_data = (
    player_selected_awards['awardCategory'].unique())

# 1 row per player-date and 1 column per award category
player_selected_awards_by_date = (
    player_selected_awards
    # Add count for use when pivoting
    .assign(count = 1)
    .pivot_table(
        index = ['dailyDataDate', 'playerId', 'playerName'],
        columns = 'awardCategory',
        values = 'count',
        # NA can be turned to 0 since it means player didn't get that award that day
        fill_value = 0
    )
    .reset_index()
)

# Add cumulative 'to date' sums for each award category, per player
for award_category in selected_award_categories_in_data:
    to_date_field = f"toDate{award_category}s"
    player_selected_awards_by_date[to_date_field] = (
        player_selected_awards_by_date
        .groupby(['playerId', 'playerName'])[award_category]
        .cumsum()
    )

# Prepare for time-based merging by dropping non-"to date" fields
player_selected_awards_by_date = player_selected_awards_by_date.drop(
    selected_award_categories_in_data, axis = 1)

display(player_selected_awards_by_date)

### Player and Team Twitter Followers by Date

#### Prepare player & team Twitter followers data for merge w/ player daily engagement data

In [None]:
# Player followers: keep only the desired fields and give the Twitter-specific
# ones clearer names for later
player_twitter_followers_for_merge = (
    playerTwitterFollowers
    .loc[:, ['dailyDataDate', 'date', 'playerId', 'numberOfFollowers']]
    .rename(columns = {
        'date': 'playerTwitterDataDate',
        'numberOfFollowers': 'playerTwitterFollowers'
    })
)

# Team followers: same idea, w/ names chosen for clarity/joining later
team_twitter_followers_for_merge = (
    teamTwitterFollowers
    .loc[:, ['dailyDataDate', 'date', 'teamId', 'numberOfFollowers']]
    .rename(columns = {
        'date': 'teamTwitterDataDate',
        # Name is weird, but helps set up for merge w/ digital engagement data
        'teamId': 'rosterTeamIdIntForMerge',
        'numberOfFollowers': 'teamTwitterFollowers'
    })
)

display(player_twitter_followers_for_merge)

display(team_twitter_followers_for_merge)

## Merge in Other Data with Player Daily Engagement

#### Merge in other data frames by date to add various (player/team/etc.) info to daily engagement

In [None]:
# Attach date info to every daily player engagement row
engagement_with_date_info = nextDayPlayerEngagement.merge(
    dates_with_season_part,
    on = ['dailyDataDate'],
    how = 'left'
)

# Filter to in-season dates only. Since test and future eval period is all
# 'in season', we imagine that looking at this filtered set of dates will help
# w/ identifying relevant trends more easily
player_engagement_with_info = (
    engagement_with_date_info
    .query("inSeason")
    .reset_index(drop = True)
)

# Take "row mean" across targets to add (helps with studying all 4 targets at once)
player_engagement_with_info['target1To4Avg'] = (
    player_engagement_with_info[['target1', 'target2', 'target3', 'target4']]
    .mean(axis = 1)
)

# Merge in some player information
player_info_fields = ['playerId', 'playerName', 'DOB', 'mlbDebutDate',
    'birthCity', 'birthStateProvince', 'birthCountry', 'primaryPositionName']

player_engagement_with_info = player_engagement_with_info.merge(
    players[player_info_fields],
    on = ['playerId'],
    how = 'left'
)

# Merge in some player roster information by date, w/ roster-specific names
roster_info_for_merge = (
    rosters[['dailyDataDate', 'playerId', 'statusCode', 'status', 'teamId']]
    .rename(columns = {
        'statusCode': 'rosterStatusCode',
        'status': 'rosterStatus',
        'teamId': 'rosterTeamId'
    })
)

player_engagement_with_info = player_engagement_with_info.merge(
    roster_info_for_merge,
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
)

# Add int version of rosterTeamId (w/ -1 for NA) to help w/ future merging
player_engagement_with_info['rosterTeamIdIntForMerge'] = (
    player_engagement_with_info['rosterTeamId']
    .fillna(-1)
    .astype('int64')
)

# Merge in team name from player's roster team
roster_team_names = teams[['id', 'teamName']].rename(columns = {
    'id': 'rosterTeamId',
    'teamName': 'rosterTeamName'
})

player_engagement_with_info = player_engagement_with_info.merge(
    roster_team_names,
    on = ['rosterTeamId'],
    how = 'left'
)

# Merge in some player game stats and events (previously aggregated) from that date
player_engagement_with_info = player_engagement_with_info.merge(
    player_date_stats_events_agg,
    on = ['dailyDataDate', 'playerId'],
    how = 'left'
)

# Merge in some team game stats/results (previously aggregated) from that date,
# matched on the team the player's game stats are recorded under
team_date_stats_for_merge = team_date_stats_agg.rename(
    columns = {'teamId': 'gameTeamId'})

player_engagement_with_info = player_engagement_with_info.merge(
    team_date_stats_for_merge,
    on = ['dailyDataDate', 'gameTeamId'],
    how = 'left'
)

# Transactions fields to be added: every column except the merge keys
transactions_fields = [
    col_value for col_value in player_date_transactions_wide.columns
    if col_value not in ('dailyDataDate', 'playerId')]

# Merge in player transactions of note (previously created) on that date.
# NAs on transactions fields can be turned to 0 (no player transaction that day)
player_engagement_with_info = (
    player_engagement_with_info
    .merge(
        player_date_transactions_wide,
        on = ['dailyDataDate', 'playerId'],
        how = 'left'
    )
    .fillna(dict.fromkeys(transactions_fields, 0))
)

# Merge in some pieces of team standings (previously created) from that date.
# Standings are joined based on rosterTeamId (not gameTeamId)
standings_for_merge = standings_selected_fields.rename(
    columns = {'teamId': 'rosterTeamId'})

player_engagement_with_info = player_engagement_with_info.merge(
    standings_for_merge,
    on = ['dailyDataDate', 'rosterTeamId'],
    how = 'left'
)

# Awards table for merging: player name is dropped (player is matched by Id)
awards_for_merge = player_selected_awards_by_date.drop(['playerName'], axis = 1)

# Awards fields to be added: every remaining column except the merge keys
awards_fields = [
    col_value for col_value in awards_for_merge.columns
    if col_value not in ('dailyDataDate', 'playerId')]

# Merge in selected player awards from the latest award date up to the given date:
# matched by player, looking backward in time on the daily date.
# NAs on awards fields can be turned to 0 (player had no awards of that type to date)
player_engagement_with_info = (
    pd.merge_asof(
        player_engagement_with_info,
        awards_for_merge,
        on = ['dailyDataDate'],
        by = ['playerId'],
        direction = 'backward'
    )
    .fillna(dict.fromkeys(awards_fields, 0))
)

# Merge in player's Twitter followers from the latest tracked date up to the
# given date (matched by player, looking backward)
player_engagement_with_info = pd.merge_asof(
    player_engagement_with_info,
    player_twitter_followers_for_merge,
    on = ['dailyDataDate'],
    by = ['playerId'],
    direction = 'backward'
)

# Merge in team Twitter followers from the latest tracked date up to the given
# date (matched by team, looking backward). The integer version of rosterTeamId
# is used since merge_asof seems to need int (not float)
player_engagement_with_info = pd.merge_asof(
    player_engagement_with_info,
    team_twitter_followers_for_merge,
    on = ['dailyDataDate'],
    by = ['rosterTeamIdIntForMerge'],
    direction = 'backward'
)

# Drop integer version of rosterTeamId since merging is done.
# drop() returns a new data frame and leaves the original untouched, so the
# result has to be assigned back for the helper column to actually be removed
player_engagement_with_info = player_engagement_with_info.drop(
  ['rosterTeamIdIntForMerge'], axis = 1)

display(player_engagement_with_info)

In [None]:
# Write the merged player engagement table to CSV without the row index
player_engagement_with_info.to_csv(path_or_buf = 'choncho.csv', index = False)

#### Look at listing of all fields in merged player digital engagement data

In [None]:
#display(player_engagement_with_info.info(max_cols = 200))