In [None]:
import sys
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
sys.path.append(os.path.abspath('../src'))

from embeddings import PitchEmbedding

In [None]:
# ------------------------------
# Example usage of PitchEmbedding on synthetic data.
# A real setup might have ~100 continuous fields and 13 categorical fields;
# for demonstration only 4 categorical features are declared here.

# torch is needed for the dummy tensors below; importing it here keeps this
# cell runnable on a fresh kernel (the next torch import is in a later cell).
import torch

continuous_dim = 100

# Categorical variables: feature name -> number of unique categories and the
# embedding width to allocate for that feature.
categorical_info = {
    'pitcher_id': {'num_categories': 500, 'embedding_dim': 32},
    'pitch_type': {'num_categories': 10, 'embedding_dim': 16},
    'batter_side': {'num_categories': 3, 'embedding_dim': 8},
    'game_situation': {'num_categories': 20, 'embedding_dim': 16},
    # ... add the rest as needed to reach a total of 13 categorical features.
}

output_dim = 256  # Final token embedding dimension

# Instantiate the embedding model
pitch_embedding_model = PitchEmbedding(continuous_dim, categorical_info, output_dim)

# Create dummy data for a batch of 32 pitches
batch_size = 32
continuous_inputs = torch.randn(batch_size, continuous_dim)

# Random category indices; each upper bound matches the corresponding
# 'num_categories' declared in categorical_info.
categorical_inputs = {
    'pitcher_id': torch.randint(0, 500, (batch_size,)),
    'pitch_type': torch.randint(0, 10, (batch_size,)),
    'batter_side': torch.randint(0, 3, (batch_size,)),
    'game_situation': torch.randint(0, 20, (batch_size,))
    # ... ensure you provide inputs for all categorical features defined in categorical_info.
}

# Generate the embeddings
embeddings = pitch_embedding_model(continuous_inputs, categorical_inputs)
print("Embedding shape:", embeddings.shape)  # Expected output: (32, 256)


In [None]:
import torch
# Imported the same way as the other cells in this notebook: '../src' is
# appended to sys.path, so the module is reachable as plain `embeddings`.
from embeddings import PitchEmbedding

# NOTE: continuous_dim and output_dim are not defined in this cell; they are
# taken from whichever earlier cell assigned them last.

# 1. ADD AT-BAT ID AS A CATEGORICAL FEATURE
categorical_info = {
    'pitch_type': {'num_categories': 10, 'embedding_dim': 16},
    'at_bat_id': {'num_categories': 10000, 'embedding_dim': 32},  # Unique ID for each at-bat
    'pitch_number_in_at_bat': {'num_categories': 20, 'embedding_dim': 8},  # Position in sequence
    # ... other categorical features
}

# 2. CREATE EMBEDDING MODEL
pitch_embedding_model = PitchEmbedding(continuous_dim, categorical_info, output_dim)

# 3. PREPARE FLAT BATCH OF PITCHES WITH AT-BAT CONTEXT
batch_size = 128  # Now this is just "number of pitches" (not at-bats)

# Each pitch now carries its at-bat context with it
continuous_inputs = torch.randn(batch_size, continuous_dim)
categorical_inputs = {
    'pitch_type': torch.randint(0, 10, (batch_size,)),
    'at_bat_id': torch.randint(0, 10000, (batch_size,)),  # Pitches from same at-bat share this ID
    'pitch_number_in_at_bat': torch.randint(0, 20, (batch_size,)),  # Position in the at-bat
    # ... other categorical inputs
}

# 4. GENERATE EMBEDDINGS
pitch_embeddings = pitch_embedding_model(continuous_inputs, categorical_inputs)

# 5. CREATE AT-BAT SEQUENCES FOR TRANSFORMER
# Group the flat batch back into one variable-length sequence per at-bat.
unique_at_bats = categorical_inputs['at_bat_id'].unique()
sequences = []

for at_bat_id in unique_at_bats[:16]:  # Process 16 at-bats
    # Boolean mask selecting every pitch that belongs to this at-bat
    mask = categorical_inputs['at_bat_id'] == at_bat_id
    # Embeddings for those pitches, still in batch order
    at_bat_pitches = pitch_embeddings[mask]
    # Reorder by position in the at-bat so the sequence is in pitch order
    positions = categorical_inputs['pitch_number_in_at_bat'][mask]
    _, indices = positions.sort()
    sorted_pitches = at_bat_pitches[indices]
    sequences.append(sorted_pitches)

# Now feed these sequences to your transformer

***

In [3]:
# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

sys.path.append(os.path.abspath('../src'))
from embeddings import PitchEmbedding

In [7]:
# Memory-efficient random sampling from a large CSV: the file is streamed
# line by line twice (count, then sample) and never loaded whole.
import random
import os

# Set random seed so the same rows are selected on every run
random.seed(42)

# Parameters
sample_size = 10000  # target number of rows; the actual count is approximate (see below)
input_file = '../data/processed/pitcher_final_21-24.csv'
output_file = '../data/processed/pitcher_test_sample.csv'

# First pass: count lines so a per-row keep probability can be derived
with open(input_file, 'r') as f:
    total_lines = sum(1 for _ in f)

print(f"Total lines in file: {total_lines}")

# Keep probability per data row. The header line is excluded from the count.
# A header-only (or empty) file yields probability 0 instead of dividing by zero.
data_lines = max(total_lines - 1, 0)
sampling_probability = min(1.0, sample_size / data_lines) if data_lines else 0.0

# Second pass: Bernoulli sampling. Each row is kept independently with the
# probability above, so the number of rows written is close to, but usually
# not exactly, sample_size. (This is not reservoir sampling.)
# NOTE(review): rows are selected independently of each other, so pitches from
# the same at-bat are unlikely to be kept together -- confirm this is acceptable
# for the at-bat sequence building done later in the notebook.
# NOTE(review): sampling is by physical line; this assumes no CSV field
# contains an embedded newline -- TODO confirm.
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    # Copy header ('' if the file is completely empty)
    header = next(infile, '')
    outfile.write(header)

    # Process remaining lines with sampling
    lines_sampled = 0
    for i, line in enumerate(infile):
        if random.random() < sampling_probability:
            outfile.write(line)
            lines_sampled += 1

        # Report progress every million lines
        if (i+1) % 1000000 == 0:
            print(f"Processed {i+1:,} lines...")

    print(f"Sampled {lines_sampled} lines from {total_lines:,} total")

# Load the much smaller sample file
df = pd.read_csv(output_file)
print(f"Successfully loaded {len(df)} rows")

Total lines in file: 3073583
Processed 1,000,000 lines...
Processed 2,000,000 lines...
Processed 3,000,000 lines...
Sampled 9935 lines from 3,073,583 total
Successfully loaded 9935 rows


In [9]:
# 1. LOAD PROCESSED DATA
print("Loading processed data...")
# Nothing is read here: `df` is expected to exist already from an earlier cell.
# The direct load of the full engineered dataset is kept for reference only:
#   df = pd.read_csv('../data/processed/pitcher_final_21-24.csv')

# 2. DEFINE CONTINUOUS & CATEGORICAL FEATURES
# Columns treated as numeric inputs. The order of this list is the column
# order used when the columns are selected from `df`.
continuous_features = [
    'release_speed',
    'release_pos_x',
    'release_pos_z',
    'pfx_x',
    'pfx_z',
    'plate_x',
    'plate_z',
    'hc_x',
    'hc_y',
    'vx0',
    'vy0',
    'vz0',
    'ax',
    'ay',
    'az',
    'sz_top',
    'sz_bot',
    'hit_distance_sc',
    'launch_speed',
    'launch_angle',
    'effective_speed',
    'release_spin_rate',
    'release_extension',
    'release_pos_y',
    'estimated_ba_using_speedangle',
    'estimated_woba_using_speedangle',
    'woba_value',
    'woba_denom',
    'babip_value',
    'iso_value',
    'spin_axis',
    'delta_home_win_exp',
    'delta_run_exp',
    'bat_speed',
    'swing_length',
    'estimated_slg_using_speedangle',
    'delta_pitcher_run_exp',
    'hyper_speed',
    'bat_win_exp',
    'pitcher_days_since_prev_game',
    'batter_days_since_prev_game',
    'pitcher_days_until_next_game',
    'batter_days_until_next_game',
    'api_break_z_with_gravity',
    'api_break_x_arm',
    'api_break_x_batter_in',
    'arm_angle',
    'home_win_exp',
    'on_3b',
    'on_2b',
    'on_1b',
    # These five were previously listed as categorical and are numeric now.
    'at_bat_number',
    'bat_score',
    'fld_score',
    'age_pit',
    'age_bat',
]

# Columns that get their own embedding table.
# Not included (disabled in earlier revisions): result_type, home_score,
# away_score, the post_* score columns and the *_legacy age columns.
categorical_features = [
    'pitch_type',
    'batter',
    'pitcher',
    'events',
    'description',
    'zone',
    'game_type',
    'stand',
    'p_throws',
    'home_team',
    'away_team',
    'hit_location',
    'bb_type',
    'balls',
    'strikes',
    'game_year',
    'outs_when_up',
    'inning',
    'inning_topbot',
    'game_pk',
    'fielder_2',
    'fielder_3',
    'fielder_4',
    'fielder_5',
    'fielder_6',
    'fielder_7',
    'fielder_8',
    'fielder_9',
    'pitch_number',
    'launch_speed_angle',
    'if_fielding_alignment',
    'of_fielding_alignment',
    'n_thruorder_pitcher',
    'n_priorpa_thisgame_player_at_bat',
    'pitch_name',
    # Engineered context features
    'at_bat_id',
    'month',
    'score_diff',
    'risp',
    'clutch',
    'blowout',
    'post_allstar_break',
]


# 3. NORMALIZE CONTINUOUS FEATURES
print("Normalizing continuous features...")

# Select the numeric columns and replace any remaining NaN with 0.
# The fill happens before scaling, so the scaler is fitted on the zero-filled
# values rather than on the observed values only.
df_cont = df[continuous_features].copy().fillna(0)

# Fit the scaler on this data and transform it in one step; the fitted
# `scaler` object is kept so it can be stored alongside the embeddings.
scaler = StandardScaler()
df_cont_scaled = scaler.fit_transform(df_cont)

# 4. CREATE CATEGORY MAPPINGS & CONVERT TO INDICES
print("Converting categorical features to indices...")
category_maps = {}  # feature -> {raw value: integer index}
cat_data = {}       # feature -> {'num_categories': ..., 'embedding_dim': ...}

# Embedding width per categorical feature ('feature_name': dimension).
# Features in categorical_features that are not listed here (description,
# fielder_2..fielder_9, pitch_name) use the fallback width of 2 below.
custom_embedding_dims = {
    'pitch_type': 16,
    'batter': 16,
    'pitcher': 16,
    'events': 12,
    'zone': 6,
    'game_type': 4,
    'stand': 4,
    'p_throws': 4,
    'home_team': 8,
    'away_team': 8,
    'hit_location': 6,
    'bb_type': 8,
    'balls': 4,
    'strikes': 4,
    'game_year': 4,
    'outs_when_up': 8,
    'inning': 16,
    'inning_topbot': 2,
    'game_pk': 16,
    'pitch_number': 12,
    'launch_speed_angle': 12,
    'if_fielding_alignment': 4,
    'of_fielding_alignment': 4,
    'n_thruorder_pitcher': 12,
    'n_priorpa_thisgame_player_at_bat': 12,

    'at_bat_id': 16,
    'month': 6,
    'score_diff': 8,
    'risp': 12,
    'clutch': 12,
    'blowout': 4,
    'post_allstar_break': 4
}

for feature in categorical_features:
    # Distinct observed values, NaN excluded, in order of first appearance
    unique_values = df[feature].dropna().unique()

    # Index 0 is reserved for missing/unseen values: lookups into this map
    # fall back to 0, so real categories must start at 1. Numbering them from
    # 0 would make NaN share an embedding row with the first real category and
    # leave the extra slot counted in 'num_categories' unused.
    category_maps[feature] = {val: idx for idx, val in enumerate(unique_values, start=1)}

    # Use the custom dimension if specified, otherwise a fixed fallback of 2.
    # (A size-based heuristic was tried earlier:
    #  min(50, max(4, int(np.sqrt(len(unique_values)) * 2))).)
    embedding_dim = custom_embedding_dims.get(feature, 2)

    # Store category counts and embedding dimensions
    cat_data[feature] = {
        'num_categories': len(unique_values) + 1,  # real categories + index 0 for unknown/padding
        'embedding_dim': embedding_dim
    }

    # Print some info (count shown is the number of real categories)
    print(f"{feature}: {len(unique_values)} categories, {cat_data[feature]['embedding_dim']} dimensions")



# 5. CONVERT TO PYTORCH TENSORS
print("Creating tensors...")

# All scaled continuous columns as one float32 tensor
continuous_tensor = torch.tensor(df_cont_scaled, dtype=torch.float32)


def _encode_column(column_name):
    """Return the integer index of every row of `column_name` as a long tensor.

    NaN is first replaced by the placeholder 'unknown'; any value that is not a
    key of the feature's category map (normally including that placeholder)
    falls back to index 0.
    """
    lookup = category_maps[column_name]
    filled = df[column_name].fillna('unknown')
    codes = [lookup.get(raw_value, 0) for raw_value in filled]
    return torch.tensor(codes, dtype=torch.long)


# One index tensor per categorical feature, keyed by feature name
categorical_tensors = {}
for feature in categorical_features:
    categorical_tensors[feature] = _encode_column(feature)

# 6. DEFINE MODEL PARAMETERS
continuous_dim = len(continuous_features)
# Final embedding dimension (smaller for prototype; 256 was the earlier setting)
output_dim = 128

# 7. CREATE EMBEDDING MODEL
print("Creating embedding model...")
# Fix torch's RNG before constructing the model so repeated runs produce the
# same embeddings (the model is used as constructed; no training happens in
# this cell). Same seed value as the row-sampling step.
# NOTE(review): presumably PitchEmbedding initialises its weights from torch's
# global RNG -- confirm in src/embeddings.py.
torch.manual_seed(42)
embedding_model = PitchEmbedding(continuous_dim, cat_data, output_dim)

# 8. GENERATE EMBEDDINGS
print("Generating embeddings...")
with torch.no_grad():  # No need for gradients during inference
    pitch_embeddings = embedding_model(continuous_tensor, categorical_tensors)

print(f"Created embeddings with shape: {pitch_embeddings.shape}")

# 9. ORGANIZE INTO AT-BAT SEQUENCES
print("Organizing into at-bat sequences...")
# Distinct at-bat ids in order of first appearance
unique_at_bats = df['at_bat_id'].unique()
print(f"Total at-bats: {len(unique_at_bats)}")

# Single pass over the frame: at_bat_id -> array of *positional* row numbers.
# This replaces a full boolean scan of `df` per at-bat, and because the
# positions do not depend on the DataFrame's index labels they are always
# valid indices into `pitch_embeddings`.
rows_by_at_bat = df.groupby('at_bat_id', sort=False).indices

# Store at-bat sequences and their lengths
atbat_sequences = []
atbat_lengths = []
max_seq_len = 0

# Process a subset for demonstration (adjust as needed):
#   unique_at_bats[:1000]  quick test
#   unique_at_bats[:5000]  larger test (current setting)
#   unique_at_bats         all data
sample_at_bats = unique_at_bats[:5000]

for at_bat_id in sample_at_bats:
    row_positions = rows_by_at_bat.get(at_bat_id)
    if row_positions is None:
        # A missing (NaN) id has no group; skip it instead of storing an
        # empty sequence.
        continue

    # Embeddings for this at-bat's pitches, in the order the rows appear in df.
    # NOTE(review): rows are not sorted by pitch number here -- confirm the
    # input file is already in pitch order within each at-bat.
    at_bat_embeddings = pitch_embeddings[torch.as_tensor(row_positions, dtype=torch.long)]

    # Track the longest sequence; it is the padding length used afterwards
    n_pitches = len(row_positions)
    max_seq_len = max(max_seq_len, n_pitches)

    # Store sequence and length
    atbat_sequences.append(at_bat_embeddings)
    atbat_lengths.append(n_pitches)

print(f"Processed {len(atbat_sequences)} at-bat sequences")
print(f"Average pitches per at-bat: {np.mean(atbat_lengths):.2f}")
print(f"Maximum sequence length: {max_seq_len}")

# 10. PAD SEQUENCES FOR BATCH PROCESSING
print("Padding sequences...")

# Number of real pitches in each stored sequence
pitch_counts = [at_bat.shape[0] for at_bat in atbat_sequences]

# Right-pad every sequence with zero rows up to max_seq_len
padded_sequences = []
for at_bat, n_real in zip(atbat_sequences, pitch_counts):
    canvas = torch.zeros(max_seq_len, output_dim)
    canvas[:n_real] = at_bat
    padded_sequences.append(canvas)

# Boolean mask per sequence: True for real pitches, False for padding
slot_numbers = torch.arange(max_seq_len)
attention_mask = [slot_numbers < n_real for n_real in pitch_counts]

# Stack into batch tensors
padded_tensor = torch.stack(padded_sequences)
attention_mask = torch.stack(attention_mask)

print(f"Final tensor shape: {padded_tensor.shape}")
print(f"Attention mask shape: {attention_mask.shape}")

# 11. SAVE FOR LATER USE
# NOTE(review): the file name says "test1000" although the at-bat subset above
# is currently capped at 5000 -- rename once downstream readers are checked.
save_path = '../data/embeddings/pitch_embeddings_test1000.pt'

# Create the target directory if needed so the save does not fail on a fresh
# checkout.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# The payload mixes tensors with plain Python objects (the fitted scaler and
# the category dictionaries), so it has to be read back with full unpickling
# rather than a weights-only load.
torch.save({
    'embeddings': padded_tensor,
    'attention_mask': attention_mask,
    'scaler': scaler,
    'category_maps': category_maps
}, save_path)

print("Embeddings saved and ready for transformer training!")

Loading processed data...
Normalizing continuous features...
Converting categorical features to indices...
pitch_type: 18 categories, 16 dimensions
batter: 1006 categories, 16 dimensions
pitcher: 1217 categories, 16 dimensions
events: 20 categories, 12 dimensions
description: 12 categories, 2 dimensions
zone: 14 categories, 6 dimensions
game_type: 6 categories, 4 dimensions
stand: 2 categories, 4 dimensions
p_throws: 2 categories, 4 dimensions
home_team: 30 categories, 8 dimensions
away_team: 30 categories, 8 dimensions
hit_location: 10 categories, 6 dimensions
bb_type: 5 categories, 8 dimensions
balls: 4 categories, 4 dimensions
strikes: 3 categories, 4 dimensions
game_year: 4 categories, 4 dimensions
outs_when_up: 3 categories, 8 dimensions
inning: 13 categories, 16 dimensions
inning_topbot: 2 categories, 2 dimensions
game_pk: 6459 categories, 16 dimensions
fielder_2: 217 categories, 2 dimensions
fielder_3: 373 categories, 2 dimensions
fielder_4: 395 categories, 2 dimensions
fielder_