In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib
import os



In [44]:
# Load data
csv_dir = os.path.join(  '..', '..', '..', 'app', 'data', 'csv', 'MLB')
rosters_df = pd.read_csv(os.path.join(csv_dir, "mlb_rosters.csv"))
injuries_df = pd.read_csv(os.path.join(csv_dir, "mlb_injuries.csv"))
player_stats_df = pd.read_csv(os.path.join(csv_dir, "mlb_player_stats.csv")).rename(columns={'id': 'player_id'})
historical_data_df = pd.concat([
    pd.read_csv(os.path.join(csv_dir, "mlb_historical_data(2010-2013).csv")),
    pd.read_csv(os.path.join(csv_dir, "mlb_historical_data(2014-2016).csv")),
    pd.read_csv(os.path.join(csv_dir, "mlb_historical_data(2017-2019).csv")),
    pd.read_csv(os.path.join(csv_dir, "mlb_historical_data(2020-2024).csv"))
], ignore_index=True)

In [45]:
# Filter active players
rosters_active = rosters_df[rosters_df['status'].str.lower() == 'active']


In [46]:
# Prepare player-season-team association
# Ensure 'player_id' column exists in both DataFrames before merging
if 'player_id' not in player_stats_df.columns:
    raise KeyError("'player_id' column not found in player_stats_df")
if 'player_id' not in rosters_df.columns:
    raise KeyError("'player_id' column not found in rosters_df")

player_team_season_df = player_stats_df.merge(
    rosters_df[['player_id', 'team_id', 'season']],
    on='player_id', how='left'
)
at_bats = player_team_season_df['at_bats'].fillna(0)
innings_pitched = player_team_season_df['innings_pitched'].fillna(0) if 'innings_pitched' in player_team_season_df else 0
player_team_season_df['player_score'] = at_bats + innings_pitched

In [47]:
# Top 9 players per team-season
top_players = (
    player_team_season_df
    .sort_values(['team_id', 'season', 'player_score'], ascending=False)
    .groupby(['team_id', 'season'])
    .head(9)[['player_id', 'team_id', 'season']]
    .assign(in_lineup=1)
)


In [53]:
## Inspect columns and rename to expected names if necessary
print("Historical data columns:", historical_data_df.columns.tolist())

Historical data columns: ['is_home', 'venue_id', 'home_hits', 'away_hits', 'home_errors', 'away_errors', 'home_sp_era', 'away_sp_era', 'home_late_inning_runs', 'away_late_inning_runs', 'home_ops', 'away_ops', 'home_bullpen_era', 'away_bullpen_era', 'hit_diff', 'error_diff', 'era_diff', 'ops_diff', 'bullpen_diff', 'late_inning_diff', 'home_win_pct', 'away_win_pct', 'win_pct_diff', 'home_rank', 'away_rank', 'rank_diff', 'home_score', 'away_score', 'home_win']


In [54]:
# Rename only if necessary based on actual column presence
column_renames = {}
if 'home_team_id' in historical_data_df.columns:
    column_renames['home_team_id'] = 'home_team'
if 'away_team_id' in historical_data_df.columns:
    column_renames['away_team_id'] = 'away_team'
if 'year' in historical_data_df.columns:
    column_renames['year'] = 'season'
if 'match_id' in historical_data_df.columns:
    column_renames['match_id'] = 'game_id'

historical_data_df = historical_data_df.rename(columns=column_renames)


In [68]:
# Sample valid team-season matchups only from known top_players
valid_team_seasons = top_players[['team_id', 'season']].drop_duplicates()

sampled_games = valid_team_seasons.sample(n=1000, replace=True, random_state=42).reset_index(drop=True)
sampled_games['game_id'] = range(1000)
sampled_games = sampled_games.rename(columns={'team_id': 'home_team'})
sampled_games['away_team'] = np.random.choice(valid_team_seasons['team_id'].unique(), size=1000)


Unnamed: 0,home_team,season,game_id,away_team
0,1024.0,2025.0,0,1018.0
1,1011.0,2025.0,1,1022.0
2,1002.0,2025.0,2,1010.0
3,1016.0,2025.0,3,1017.0
4,1020.0,2025.0,4,1017.0
...,...,...,...,...
995,1010.0,2025.0,995,1017.0
996,1026.0,2025.0,996,1017.0
997,1021.0,2025.0,997,1003.0
998,1021.0,2025.0,998,1021.0


In [70]:
# Create realistic positive samples
realistic_lineup = []
for _, game in sampled_games.iterrows():
    for is_home, team in [(1, game['home_team']), (0, game['away_team'])]:
        players = top_players[(top_players['team_id'] == team) & (top_players['season'] == game['season'])]
        for _, player in players.iterrows():
            realistic_lineup.append({
                'game_id': game['game_id'],
                'player_id': player['player_id'],
                'team_id': team,
                'is_home': is_home,
                'in_lineup': 1,
                'season': game['season']
            })
realistic_lineup_df = pd.DataFrame(realistic_lineup)

display(realistic_lineup_df)

Unnamed: 0,game_id,player_id,team_id,is_home,in_lineup,season
0,0.0,33210.0,1024.0,1,1,2025.0
1,0.0,4629089.0,1024.0,1,1,2025.0
2,0.0,35124.0,1024.0,1,1,2025.0
3,0.0,41292.0,1024.0,1,1,2025.0
4,0.0,41273.0,1018.0,0,1,2025.0
...,...,...,...,...,...,...
7642,999.0,42468.0,1028.0,0,1,2025.0
7643,999.0,30193.0,1028.0,0,1,2025.0
7644,999.0,38309.0,1028.0,0,1,2025.0
7645,999.0,4872587.0,1028.0,0,1,2025.0


In [71]:
# Safety check
if realistic_lineup_df.empty:
    raise ValueError("realistic_lineup_df is empty. Check if 'top_players' matches the teams/seasons in 'sampled_games'.")

In [73]:
# Create negative samples
negative_lineup = []
positive_set = set(zip(realistic_lineup_df['game_id'], realistic_lineup_df['player_id']))
for _, game in sampled_games.iterrows():
    for is_home, team in [(1, game['home_team']), (0, game['away_team'])]:
        all_players = rosters_active[
            (rosters_active['team_id'] == team) &
            (rosters_active['season'] == game['season'])
        ]
        top_players_ids = top_players[
            (top_players['team_id'] == team) &
            (top_players['season'] == game['season'])
        ]['player_id'].values
        eligible_players = all_players[~all_players['player_id'].isin(top_players_ids)]
        if not eligible_players.empty:
            sampled_negatives = eligible_players.sample(n=min(9, len(eligible_players)), random_state=42)
            for _, player in sampled_negatives.iterrows():
                key = (game['game_id'], player['player_id'])
                if key not in positive_set:
                    negative_lineup.append({
                        'game_id': game['game_id'],
                        'player_id': player['player_id'],
                        'team_id': team,
                        'is_home': is_home,
                        'in_lineup': 0,
                        'season': game['season']
                    })
negative_lineup_df = pd.DataFrame(negative_lineup)

display(negative_lineup_df)

Unnamed: 0,game_id,player_id,team_id,is_home,in_lineup,season
0,0.0,41939,1024.0,1,0,2025.0
1,0.0,40593,1024.0,1,0,2025.0
2,0.0,40327,1024.0,1,0,2025.0
3,0.0,4894467,1024.0,1,0,2025.0
4,0.0,29703,1024.0,1,0,2025.0
...,...,...,...,...,...,...
17995,999.0,35844,1028.0,0,0,2025.0
17996,999.0,40937,1028.0,0,0,2025.0
17997,999.0,32623,1028.0,0,0,2025.0
17998,999.0,28963,1028.0,0,0,2025.0


In [74]:
# Combine and enrich data
combined_lineup_df = pd.concat([realistic_lineup_df, negative_lineup_df], ignore_index=True)
player_stats_subset = player_stats_df[[
    'player_id', 'batting_avg', 'home_runs', 'earned_run_avg',
    'runs_batted_in', 'on_base_percentage', 'slugging_percentage',
    'walks', 'strikeouts', 'innings_pitched', 'wins', 'losses', 'saves'
]]
training_df = combined_lineup_df.merge(player_stats_subset, on='player_id', how='left')
training_df = training_df.merge(injuries_df[['player_id', 'injury_date']], on='player_id', how='left')
training_df['injured'] = training_df['injury_date'].notnull().astype(int)
training_df.drop(columns=['injury_date'], inplace=True)
training_df.fillna(0, inplace=True)
training_df['team_id'] = LabelEncoder().fit_transform(training_df['team_id'].astype(str))



In [75]:
# Features and labels
features = [
    'team_id', 'is_home', 'injured',
    'batting_avg', 'home_runs', 'earned_run_avg',
    'runs_batted_in', 'on_base_percentage', 'slugging_percentage',
    'walks', 'strikeouts', 'innings_pitched', 'wins', 'losses', 'saves'
]
X = training_df[features].astype(np.float32)
y = training_df['in_lineup'].astype(int)


In [76]:
# Train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=2)
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [77]:
# Evaluate
train_acc = rf_model.score(X_train, y_train)
test_acc = rf_model.score(X_test, y_test)
print(f"Training Accuracy: {train_acc:.2%}")
print(f"Testing Accuracy: {test_acc:.2%}")

Training Accuracy: 95.10%
Testing Accuracy: 95.36%


In [81]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

predictions = rf_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, predictions):.2%}")
print(f"Test Precision: {precision_score(y_test, predictions):.2%}")
print(f"Test Recall: {recall_score(y_test, predictions):.2%}")
print(f"Test F1 Score: {f1_score(y_test, predictions):.2%}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

Test Accuracy: 95.36%
Test Precision: 100.00%
Test Recall: 84.88%
Test F1 Score: 91.82%

Confusion Matrix:
[[3599    0]
 [ 241 1353]]


In [82]:
# Load model
rf_model.predict(X_test[0:1])


array([0])

In [85]:
X_test[0:12]

Unnamed: 0,team_id,is_home,injured,batting_avg,home_runs,earned_run_avg,runs_batted_in,on_base_percentage,slugging_percentage,walks,strikeouts,innings_pitched,wins,losses,saves
20905,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7732,3.0,1.0,0.0,0.0,0.0,2.55,0.0,0.0,0.0,15.0,62.0,74.0,6.0,4.0,0.0
6216,23.0,1.0,0.0,0.0,0.0,3.12,0.0,0.0,0.0,14.0,78.0,89.099998,6.0,4.0,0.0
7029,8.0,1.0,0.0,0.0,0.0,2.76,0.0,0.0,0.0,19.0,110.0,88.0,7.0,2.0,0.0
23693,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6077,8.0,0.0,0.0,0.0,0.0,3.05,0.0,0.0,0.0,28.0,88.0,79.199997,5.0,2.0,0.0
25073,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17803,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8145,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18696,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Export model
joblib.dump(rf_model, "random_forest_lineup_model.pkl")

['random_forest_lineup_model.pkl']