In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import typing

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import defaultdict

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [49]:
from pitch_sequencing.ml.tokenizers.pitch_sequence import PitchSequenceWithCountTokenizer, SeparateSequenceTokenizer

interleaved_tokenizer = PitchSequenceWithCountTokenizer()
seperate_tokenizer = SeparateSequenceTokenizer()

In [50]:
from pitch_sequencing.ml.models.last_pitch import LastPitchTransformerModel, SeparateEmbeddingLayersLastPitchTransformerModel
import gcsfs 

interleaved_trained_model = LastPitchTransformerModel(interleaved_tokenizer.vocab_size(), d_model=64, nhead=4, num_layers=2)
seperate_trained_model = SeparateEmbeddingLayersLastPitchTransformerModel(seperate_tokenizer.vocab_size(), d_model=64, nhead=4, num_layers=2)

fs = gcsfs.GCSFileSystem()

#fixed_attn_mask_cel_model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241014000018_attn_masking/final/model.pth"
#fixed_attn_mask_focal_model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241014000108_attn_masking_focal_loss_enabled/final/model.pth"
#ce_loss_model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241010153754_exploded_rows/final/model.pth"
#focal_loss_model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241012150243_focal_loss_env_test/final/model.pth"
#model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241014095006_attn_masking_cel/final/model.pth"
model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241014211812_attn_masking_cel/final/model.pth"
with fs.open(model_path, "rb") as f:
   interleaved_trained_model.load_state_dict(torch.load(f, map_location=torch.device('cpu') ))

seperate_model_path = "gs://pitch-sequencing/training_runs/countstate_training_job_20241014094755_sep_embeds/final/model.pth"
with fs.open(seperate_model_path, "rb") as f:
   seperate_trained_model.load_state_dict(torch.load(f, map_location=torch.device('cpu') ))

In [51]:
class LastPitchPredictorWithCount:
    def __init__(self, model: LastPitchTransformerModel, tokenizer: PitchSequenceWithCountTokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def get_next_pitch_probs_ids(self, id_seq, attn_mask):
        #print(id_seq)
        #print(attn_mask)
        self.model.eval()  # Ensure the model is in evaluation mode
        with torch.no_grad():
            logits = self.model(id_seq, src_mask=attn_mask)
            # The output shape should be [1, vocab_size]
            logits = logits.squeeze(0)  # Remove batch dimension if present
            if logits.dim() > 1:
                logits = logits[-1]  # Take the last prediction if multiple outputs

            probabilities = torch.softmax(logits, dim=0)

            return probabilities
    
    def get_next_pitch_probs(self, pitch_sequence, count_sequence):
        encoded_seq, attn_mask = self.tokenizer.tokenize(pitch_sequence, count_sequence)
        input_seq = torch.tensor(encoded_seq, dtype=torch.long).unsqueeze(0)
        attn_mask = torch.tensor(attn_mask, dtype=torch.bool).unsqueeze(0)
        
        
        return self.get_next_pitch_probs_ids(input_seq, attn_mask)

    def predict_next_pitch_ids(self, id_sequence, attn_mask):
        probabilities = self.get_next_pitch_probs_ids(id_sequence, attn_mask)
        predicted_idx = torch.argmax(probabilities).item()
        return predicted_idx

    def predict_next_pitch(self, pitch_sequence, count_sequence):
        probabilities = self.get_next_pitch_probs(pitch_sequence, count_sequence)
        predicted_idx = torch.argmax(probabilities).item()
        return self.tokenizer.get_pitch_for_id(predicted_idx)

In [52]:
class LastPitchPredictorSeperateSequences:
    def __init__(self, model: SeparateEmbeddingLayersLastPitchTransformerModel, tokenizer: SeparateSequenceTokenizer):
        self.model = model
        self.tokenizer = tokenizer
    
    def get_next_pitch_probs_ids(self, pitch_id_seq, count_id_seq, attn_mask):
        self.model.eval()  # Ensure the model is in evaluation mode
        with torch.no_grad():
            logits = self.model(pitch_id_seq, count_id_seq, src_mask=attn_mask)
            # The output shape should be [1, vocab_size]
            logits = logits.squeeze(0)  # Remove batch dimension if present
            if logits.dim() > 1:
                logits = logits[-1]  # Take the last prediction if multiple outputs

            probabilities = torch.softmax(logits, dim=0)

            return probabilities
    
    def get_next_pitch_probs(self, pitch_sequence, count_sequence):
        encoded_pitch_seq, encoded_count_seq, attn_mask = self.tokenizer.tokenize(pitch_sequence, count_sequence)
        input_pitch_seq = torch.tensor(encoded_pitch_seq, dtype=torch.long).unsqueeze(0)
        input_count_seq = torch.tensor(encoded_count_seq, dtype=torch.long).unsqueeze(0)
        attn_mask = torch.tensor(attn_mask, dtype=torch.bool).unsqueeze(0)
        
        
        return self.get_next_pitch_probs_ids(input_pitch_seq, input_count_seq, attn_mask)

    def predict_next_pitch_ids(self, pitch_id_sequence, count_id_seq, attn_mask):
        probabilities = self.get_next_pitch_probs_ids(pitch_id_sequence, count_id_seq, attn_mask)
        predicted_idx = torch.argmax(probabilities).item()
        return predicted_idx

    def predict_next_pitch(self, pitch_sequence, count_sequence):
        probabilities = self.get_next_pitch_probs(pitch_sequence, count_sequence)
        predicted_idx = torch.argmax(probabilities).item()
        return self.tokenizer.get_pitch_for_id(predicted_idx)

In [53]:
interleaved_predictor = LastPitchPredictorWithCount(interleaved_trained_model, interleaved_tokenizer)
seperate_predictor = LastPitchPredictorSeperateSequences(seperate_trained_model, seperate_tokenizer)

### Load test data

In [54]:
exploded_test_df = pd.read_csv('gs://pitch-sequencing/sequence_data/full_sequence_data/exploded/large_cur_test.csv')

In [64]:
three_zero_counts_df = exploded_test_df[exploded_test_df['count_sequence'].apply(lambda x: x.split(',')[-1] == '3-0')]
three_one_counts_df = exploded_test_df[exploded_test_df['count_sequence'].apply(lambda x: x.split(',')[-1] == '3-1')]

In [104]:
from pitch_sequencing.ml.tokenizers.pitch_sequence import ORDERED_PITCHES

probs = interleaved_predictor.get_next_pitch_probs("CH,SL,SI", "0-0,1-0,1-1,1-2")
#seperate_probs = seperate_predictor.get_next_pitch_probs("FF,CB,CH", "0-0,1-0,2-0,3-0")

for pitch in ORDERED_PITCHES:
    id = interleaved_tokenizer.get_id_for_pitch(pitch)
    print(f"{pitch}: {probs[id]:.4f}")
    #print(f"Seperate    {pitch}: {seperate_probs[id]:.4f}")

CB: 0.0222
KN: 0.7391
FC: 0.0055
FS: 0.0008
CH: 0.0479
FF: 0.1423
SL: 0.0061
PO: 0.0001
SI: 0.0290
ST: 0.0070


In [67]:
exploded_test_df['target_pitch'] = exploded_test_df['pitch_sequence'].apply(lambda x: x.split(',')[-1])
exploded_test_df['setup_count'] = exploded_test_df['count_sequence'].apply(lambda x: x.split(',')[-1])
exploded_test_df['input_pitch_sequence'] = exploded_test_df['pitch_sequence'].apply(lambda x: ','.join(x.split(',')[:-1]))

In [68]:
exploded_test_df.head(10)

Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,input_pitch_sequence
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,CH
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,"CH,SI"
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,CH
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,FF
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,"FF,FF"
5,"FF,FF,FF,CH","0-0,0-1,0-2,1-2",1111113,R,L,642121,544369,84,CH,1-2,"FF,FF,FF"
6,"FF,FC","0-0,1-0",1212,R,R,608379,575929,28,FC,1-0,FF
7,"FF,FC,FC","0-0,1-0,1-1",12129,R,R,608379,575929,28,FC,1-1,"FF,FC"
8,"FF,FC,FC,CB","0-0,1-0,1-1,1-2",121296,R,R,608379,575929,28,CB,1-2,"FF,FC,FC"
9,"FF,FC,FC,CB,FF","0-0,1-0,1-1,1-2,1-2",12129613,R,R,608379,575929,28,FF,1-2,"FF,FC,FC,CB"


In [71]:
from tqdm import tqdm
tqdm.pandas()  # This enables pandas integration

In [72]:

exploded_test_df['predicted_pitch'] = exploded_test_df.progress_apply(lambda row: interleaved_predictor.predict_next_pitch(row['input_pitch_sequence'], row['count_sequence']), axis=1)

100%|██████████| 352058/352058 [08:23<00:00, 699.66it/s]


In [73]:
exploded_test_df.head(10)

Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,input_pitch_sequence,predicted_pitch
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,CH,CH
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,"CH,SI",CH
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,CH,CH
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,FF,FF
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,"FF,FF",FF
5,"FF,FF,FF,CH","0-0,0-1,0-2,1-2",1111113,R,L,642121,544369,84,CH,1-2,"FF,FF,FF",FF
6,"FF,FC","0-0,1-0",1212,R,R,608379,575929,28,FC,1-0,FF,FF
7,"FF,FC,FC","0-0,1-0,1-1",12129,R,R,608379,575929,28,FC,1-1,"FF,FC",FC
8,"FF,FC,FC,CB","0-0,1-0,1-1,1-2",121296,R,R,608379,575929,28,CB,1-2,"FF,FC,FC",FF
9,"FF,FC,FC,CB,FF","0-0,1-0,1-1,1-2,1-2",12129613,R,R,608379,575929,28,FF,1-2,"FF,FC,FC,CB",FF


In [85]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'])
precision = precision_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='macro')
precision_weighted = precision_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='weighted')
recall = recall_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='macro')
recall_weighted = recall_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='weighted')
f1 = f1_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='macro')
f1_weighted = f1_score(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Weighted Precision:", precision_weighted)
print("Recall:", recall)
print("Weighted Recall:", recall_weighted)
print("F1 Score:", f1)
print("Weighted F1 Score:", f1_weighted)

Accuracy: 0.4420265978901204
Precision: 0.40190733717838684
Weighted Precision: 0.42502591824203184
Recall: 0.38472726433438964
Weighted Recall: 0.4420265978901204
F1 Score: 0.38034037880848315
Weighted F1 Score: 0.41618564274038766


In [90]:
from sklearn.metrics import classification_report

print(classification_report(exploded_test_df['target_pitch'], exploded_test_df['predicted_pitch'], zero_division=1))

              precision    recall  f1-score   support

          CB       0.36      0.11      0.17     35034
          CH       0.35      0.14      0.20     42632
          FC       0.36      0.38      0.37     24014
          FF       0.47      0.66      0.55    119519
          FS       0.40      0.35      0.37      6892
          KN       0.78      0.87      0.82       293
          PO       1.00      0.00      0.00        23
          SI       0.46      0.50      0.48     56519
          SL       0.41      0.38      0.40     59898
          ST       0.42      0.45      0.43      7234

    accuracy                           0.44    352058
   macro avg       0.50      0.38      0.38    352058
weighted avg       0.43      0.44      0.42    352058



In [92]:
for setup, group in exploded_test_df.groupby('setup_count'):
    print(f"Metrics for Setup Count: {setup}")
    print(classification_report(group['target_pitch'], group['predicted_pitch'], zero_division=0))
    print("\n")

Metrics for Setup Count: 0-1
              precision    recall  f1-score   support

          CB       0.00      0.00      0.00      7062
          CH       0.33      0.17      0.22      7718
          FC       0.33      0.31      0.32      4305
          FF       0.41      0.66      0.51     18685
          FS       0.41      0.17      0.24      1221
          KN       0.83      0.62      0.71        64
          PO       0.00      0.00      0.00         9
          SI       0.41      0.50      0.45      9442
          SL       0.41      0.36      0.38     10652
          ST       0.39      0.34      0.36      1306

    accuracy                           0.40     60464
   macro avg       0.35      0.31      0.32     60464
weighted avg       0.35      0.40      0.36     60464



Metrics for Setup Count: 0-2
              precision    recall  f1-score   support

          CB       0.37      0.33      0.35      4539
          CH       0.36      0.18      0.24      3253
          FC      

### Now do some arsenal metrics

In [105]:
arsenal_df = pd.read_csv("gs://pitch-sequencing/arsenal_data/pitch_arsenal_data.csv")

In [112]:
test_df_with_arsenal = pd.merge(exploded_test_df, arsenal_df, left_on='pitcher_id', right_on='pitcher', how='left')
test_df_with_arsenal.head(5)

Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,input_pitch_sequence,predicted_pitch,pitch_counts,pitcher,pitch_arsenal,pitch_arsenal_csv,arsenal_size
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,CH,CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,"CH,SI",CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,CH,CH,"{'CB': 3300, 'CH': 1885, 'FC': 461, 'FF': 1136...",543037,"['FF', 'SL', 'CB', 'CH', 'SI', 'PO', 'FC']","FF,SL,CB,CH,SI,PO,FC",7
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,FF,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,"FF,FF",FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5


In [121]:
test_df_with_arsenal.progress_apply(lambda x: x['predicted_pitch'], axis=1)

100%|██████████| 352058/352058 [00:01<00:00, 179957.29it/s]


0         CH
1         CH
2         CH
3         FF
4         FF
          ..
352053    FF
352054    FF
352055    FF
352056    SI
352057    SL
Length: 352058, dtype: object

In [123]:
test_df_with_arsenal['predicted_pitch_in_arsenal'] = test_df_with_arsenal.progress_apply(lambda x: x['predicted_pitch'] in x['pitch_arsenal_csv'], axis=1)
test_df_with_arsenal.head(10)

100%|██████████| 352058/352058 [00:02<00:00, 144642.51it/s]


Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,input_pitch_sequence,predicted_pitch,pitch_counts,pitcher,pitch_arsenal,pitch_arsenal_csv,arsenal_size,predicted_pitch_in_arsenal
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,CH,CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,"CH,SI",CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,CH,CH,"{'CB': 3300, 'CH': 1885, 'FC': 461, 'FF': 1136...",543037,"['FF', 'SL', 'CB', 'CH', 'SI', 'PO', 'FC']","FF,SL,CB,CH,SI,PO,FC",7,True
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,FF,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,"FF,FF",FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True
5,"FF,FF,FF,CH","0-0,0-1,0-2,1-2",1111113,R,L,642121,544369,84,CH,1-2,"FF,FF,FF",FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True
6,"FF,FC","0-0,1-0",1212,R,R,608379,575929,28,FC,1-0,FF,FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True
7,"FF,FC,FC","0-0,1-0,1-1",12129,R,R,608379,575929,28,FC,1-1,"FF,FC",FC,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True
8,"FF,FC,FC,CB","0-0,1-0,1-1,1-2",121296,R,R,608379,575929,28,CB,1-2,"FF,FC,FC",FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True
9,"FF,FC,FC,CB,FF","0-0,1-0,1-1,1-2,1-2",12129613,R,R,608379,575929,28,FF,1-2,"FF,FC,FC,CB",FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True


In [124]:
test_df_with_arsenal['predicted_pitch_in_sequence'] = test_df_with_arsenal.progress_apply(lambda x: x['predicted_pitch'] in x['input_pitch_sequence'], axis=1)
test_df_with_arsenal.head(10)

100%|██████████| 352058/352058 [00:02<00:00, 130708.75it/s]


Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,input_pitch_sequence,predicted_pitch,pitch_counts,pitcher,pitch_arsenal,pitch_arsenal_csv,arsenal_size,predicted_pitch_in_arsenal,predicted_pitch_in_sequence
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,CH,CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True,True
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,"CH,SI",CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True,True
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,CH,CH,"{'CB': 3300, 'CH': 1885, 'FC': 461, 'FF': 1136...",543037,"['FF', 'SL', 'CB', 'CH', 'SI', 'PO', 'FC']","FF,SL,CB,CH,SI,PO,FC",7,True,True
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,FF,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,"FF,FF",FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True
5,"FF,FF,FF,CH","0-0,0-1,0-2,1-2",1111113,R,L,642121,544369,84,CH,1-2,"FF,FF,FF",FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True
6,"FF,FC","0-0,1-0",1212,R,R,608379,575929,28,FC,1-0,FF,FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True
7,"FF,FC,FC","0-0,1-0,1-1",12129,R,R,608379,575929,28,FC,1-1,"FF,FC",FC,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True
8,"FF,FC,FC,CB","0-0,1-0,1-1,1-2",121296,R,R,608379,575929,28,CB,1-2,"FF,FC,FC",FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True
9,"FF,FC,FC,CB,FF","0-0,1-0,1-1,1-2,1-2",12129613,R,R,608379,575929,28,FF,1-2,"FF,FC,FC,CB",FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True


In [132]:
test_df_with_arsenal['target_pitch_in_sequence'] = test_df_with_arsenal.progress_apply(lambda x: x['target_pitch'] in x['input_pitch_sequence'], axis=1)
test_df_with_arsenal.head(10)

100%|██████████| 352058/352058 [00:02<00:00, 133982.94it/s]


Unnamed: 0,pitch_sequence,count_sequence,zone_sequence,p_throws,stand,pitcher_id,batter_id,at_bat_number,target_pitch,setup_count,...,predicted_pitch,pitch_counts,pitcher,pitch_arsenal,pitch_arsenal_csv,arsenal_size,predicted_pitch_in_arsenal,predicted_pitch_in_sequence,target_pitch_not_in_sequence,target_pitch_in_sequence
0,"CH,SI","0-0,1-0",116,R,R,112526,572039,44,SI,1-0,...,CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True,True,False,False
1,"CH,SI,SI","0-0,1-0,1-1",11611,R,R,112526,572039,44,SI,1-1,...,CH,"{'CB': 0, 'CH': 470, 'FC': 69, 'FF': 815, 'FS'...",112526,"['SI', 'CH', 'FF', 'SL', 'FC']","SI,CH,FF,SL,FC",5,True,True,True,True
2,"CH,CH","0-0,1-0",144,R,L,543037,624512,37,CH,1-0,...,CH,"{'CB': 3300, 'CH': 1885, 'FC': 461, 'FF': 1136...",543037,"['FF', 'SL', 'CB', 'CH', 'SI', 'PO', 'FC']","FF,SL,CB,CH,SI,PO,FC",7,True,True,True,True
3,"FF,FF","0-0,0-1",111,R,L,642121,544369,84,FF,0-1,...,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True,True,True
4,"FF,FF,FF","0-0,0-1,0-2",11111,R,L,642121,544369,84,FF,0-2,...,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True,True,True
5,"FF,FF,FF,CH","0-0,0-1,0-2,1-2",1111113,R,L,642121,544369,84,CH,1-2,...,FF,"{'CB': 33, 'CH': 757, 'FC': 2, 'FF': 1226, 'FS...",642121,"['FF', 'SL', 'CH', 'CB', 'FC']","FF,SL,CH,CB,FC",5,True,True,False,False
6,"FF,FC","0-0,1-0",1212,R,R,608379,575929,28,FC,1-0,...,FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True,False,False
7,"FF,FC,FC","0-0,1-0,1-1",12129,R,R,608379,575929,28,FC,1-1,...,FC,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True,True,True
8,"FF,FC,FC,CB","0-0,1-0,1-1,1-2",121296,R,R,608379,575929,28,CB,1-2,...,FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True,False,False
9,"FF,FC,FC,CB,FF","0-0,1-0,1-1,1-2,1-2",12129613,R,R,608379,575929,28,FF,1-2,...,FF,"{'CB': 1287, 'CH': 3507, 'FC': 2385, 'FF': 534...",608379,"['FC', 'SI', 'CH', 'FF', 'CB', 'PO']","FC,SI,CH,FF,CB,PO",6,True,True,True,True


In [133]:
print(len(test_df_with_arsenal[test_df_with_arsenal['target_pitch_in_sequence']])/len(test_df_with_arsenal))

0.6137369410722097


In [134]:
print(len(test_df_with_arsenal[test_df_with_arsenal['predicted_pitch_in_sequence']])/len(test_df_with_arsenal))

0.9263786080702612


In [135]:
print(len(test_df_with_arsenal[test_df_with_arsenal['predicted_pitch_in_arsenal']])/len(test_df_with_arsenal))

0.9990427713615371


In [137]:
print(f"Target Pitch In Sequence:               {len(test_df_with_arsenal[test_df_with_arsenal['target_pitch_in_sequence']])/len(test_df_with_arsenal):.4f}")
print(f"Predicted Pitch Seen In Input Sequence: {len(test_df_with_arsenal[test_df_with_arsenal['predicted_pitch_in_sequence']])/len(test_df_with_arsenal):.4f}")
print(f"Predicted Pitch In Arsenal              {len(test_df_with_arsenal[test_df_with_arsenal['predicted_pitch_in_arsenal']])/len(test_df_with_arsenal):.4f}")

Target Pitch In Sequence:               0.6137
Predicted Pitch Seen In Input Sequence: 0.9264
Predicted Pitch In Arsenal              0.9990


In [138]:
target_pitch_not_seen_df = test_df_with_arsenal[~test_df_with_arsenal['target_pitch_in_sequence']]
print(classification_report(target_pitch_not_seen_df['target_pitch'], target_pitch_not_seen_df['predicted_pitch'], zero_division=0))

              precision    recall  f1-score   support

          CB       0.00      0.00      0.00     18053
          CH       0.00      0.00      0.00     24069
          FC       0.00      0.00      0.00     10801
          FF       0.13      0.26      0.18     32543
          FS       0.00      0.00      0.00      3577
          KN       0.00      0.00      0.00        35
          PO       1.00      0.00      0.00        23
          SI       0.00      0.00      0.00     18868
          SL       0.00      0.00      0.00     25065
          ST       0.00      0.00      0.00      2953

    accuracy                           0.06    135987
   macro avg       0.11      0.03      0.02    135987
weighted avg       0.03      0.06      0.04    135987



In [131]:
target_pitch_seen_df = test_df_with_arsenal[test_df_with_arsenal['target_pitch_in_sequence']]
print(classification_report(target_pitch_seen_df['target_pitch'], target_pitch_seen_df['predicted_pitch'], zero_division=0))

              precision    recall  f1-score   support

          CB       0.00      0.00      0.00     18053
          CH       0.00      0.00      0.00     24069
          FC       0.00      0.00      0.00     10801
          FF       0.13      0.26      0.18     32543
          FS       0.00      0.00      0.00      3577
          KN       0.00      0.00      0.00        35
          PO       1.00      0.00      0.00        23
          SI       0.00      0.00      0.00     18868
          SL       0.00      0.00      0.00     25065
          ST       0.00      0.00      0.00      2953

    accuracy                           0.06    135987
   macro avg       0.11      0.03      0.02    135987
weighted avg       0.03      0.06      0.04    135987

