In [5]:
import os
import pandas as pd
import numpy as np
from io import BytesIO
from fpdf import FPDF
import matplotlib.pyplot as plt

def ordinal(n):
    """
    Return the integer ``n`` followed by its English ordinal suffix.

    Examples: 1 -> "1st", 2 -> "2nd", 3 -> "3rd", 4 -> "4th", 11 -> "11th".
    """
    tens_digit = n // 10 % 10
    last_digit = n % 10

    if tens_digit == 1:
        # 11, 12, 13 (and 111, 112, ...) always take "th"
        ending = "th"
    else:
        ending = {1: "st", 2: "nd", 3: "rd"}.get(last_digit, "th")

    return f"{n}{ending}"

# Maps a team code to the local folder holding that team's game CSVs.
# The codes are compared against BatterTeam values further down the script.
team_folder_paths = {
    'CHI_DOG': '/Users/ajfoeckler/Downloads/chicago_games',
    'MIL_MIL1': '/Users/ajfoeckler/Downloads/milwaukee_games',
    'KAN_COU': '/Users/ajfoeckler/Downloads/kane_county_games',
    'FAR_RED': '/Users/ajfoeckler/Downloads/fargo_games',
    'SIO_FAL':'/Users/ajfoeckler/Downloads/SF_games',
    'KAN_CIT3': '/Users/ajfoeckler/Downloads/KC_games',
    'LIN_SAL': '/Users/ajfoeckler/Downloads/lincoln_games',
    'WIN_GOL':'/Users/ajfoeckler/Downloads/winnipeg_games',
    'GAR_SOU':'/Users/ajfoeckler/Downloads/gary_games',
    'SIO_CIT1':'/Users/ajfoeckler/Downloads/sioux_city_games',
    'CLE_RAI':'/Users/ajfoeckler/Downloads/cleburne_games',
    # add other teams here as needed...
}
# Folder of CSVs used later in the script as the league-wide baseline.
AA_ALL_GAMES   = '/Users/ajfoeckler/Downloads/AA_all_games'

# Initial team selection; `selected_team` is rebound from input() below.
selected_team = 'CHI_DOG'
folder_path = team_folder_paths[selected_team]
# NOTE(review): hard-coded to the Chicago output folder regardless of the
# team chosen above.
save_folder_path = '/Users/ajfoeckler/Downloads/chicago_hitting'

# Create the save directory if it doesn't exist
os.makedirs(save_folder_path, exist_ok=True)

# Only file names ending in lowercase '.csv' are picked up.
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# === CONFIGURATION ===
# Pitcher handedness to keep; compared against the PitcherThrows column.
pitcher_side = 'Right'    # or 'Left'
# Short label used in the PDF header and the output file name.
pitcher_abbrev = 'RHP' if pitcher_side == 'Right' else 'LHP'

# Load the team CSVs into original_data, keeping only pitches thrown by
# pitchers of the configured hand.
data_frames = []
for fn in csv_files:
    df = pd.read_csv(os.path.join(folder_path, fn))
    df = df[df['PitcherThrows'] == pitcher_side]
    data_frames.append(df)

# pd.concat raises ValueError when data_frames is empty (no CSVs found).
original_data = pd.concat(data_frames, ignore_index=True)


# Standardize pitch-type tags: 'ChangeUp' and 'Splitter' are both folded
# into 'Changeup'.
original_data['AutoPitchType'] = (
    original_data['AutoPitchType']
    .replace({'ChangeUp':'Changeup','Splitter':'Changeup'})
)

# Drop rows whose hit type is the literal string 'undefined'.
original_data = original_data[
    original_data['TaggedHitType'] != 'undefined'
]

# Ensure 'Date' column is in datetime format; unparseable values become NaT
# (errors='coerce') and those rows are then dropped.
original_data['Date'] = pd.to_datetime(original_data['Date'], errors='coerce')
original_data = original_data.dropna(subset=['Date'])

# Normalize batter names.
# NOTE(review): both replace() entries map a name to itself, so only the
# trailing .str.strip() changes anything here.
original_data['Batter'] = original_data['Batter'].replace({
    'Pruitt Jr, Reggie': 'Pruitt Jr, Reggie',
    'Law, Trey': 'Law, Trey'
}).str.strip()

# Create a dictionary mapping each batter to their team.
# NOTE(review): a batter listed under more than one BatterTeam yields duplicate
# index labels; presumably the last one wins in to_dict() - confirm.
batter_team_mapping = original_data[['Batter', 'BatterTeam']].drop_duplicates().set_index('Batter').to_dict()['BatterTeam']

# --- manual override for batters who've switched teams ---
# batter name -> team code to force in batter_team_mapping (currently empty)
manual_batter_overrides = {

    # add more overrides here
}

# manual override: these hitters used to be on another club
# batter name -> previous team code; its values add folders to load below
# (currently empty)
manual_batter_original_team = {

}

for batter, new_team in manual_batter_overrides.items():
    batter_team_mapping[batter] = new_team
# ----------------------------------------------

# Prompt user to select a team
unique_teams = original_data['BatterTeam'].unique()
print("Available teams:", unique_teams)
selected_team = input("Please enter the team code to generate the report for: ").strip()

# Figure out which folders to pull: the selected team plus any previous clubs
# listed in manual_batter_original_team.
# NOTE(review): team_folder_paths[t] raises KeyError for any code without an
# entry in team_folder_paths, including codes printed as "available" above.
teams_to_load = { selected_team } | set(manual_batter_original_team.values())
folders_to_load = [ team_folder_paths[t] for t in teams_to_load ]

# Now read _all_ of those folders' CSVs.
data_frames = []
for folder in folders_to_load:
    for fn in os.listdir(folder):
        if not fn.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder, fn))
        df = df[df['PitcherThrows'] == pitcher_side]
        data_frames.append(df)

# NOTE(review): this rebinds original_data to freshly read rows, so the
# pitch-type, hit-type, date and batter-name normalisation applied earlier
# in this section no longer applies to it.
original_data = pd.concat(data_frames, ignore_index=True)

# Define the strike zone dimensions. These bounds are compared against
# PlateLocHeight / PlateLocSide in process_data.
# NOTE(review): units are presumably feet - confirm against the data source.
strike_zone_top = 3.673333
strike_zone_bottom = 1.524166667
strike_zone_left = -0.83083333
strike_zone_right = 0.83083333

# Function to process the data and calculate required metrics
def process_data(data, strike_zone=None):
    """
    Aggregate pitch-level rows into per-batter, per-pitch-type rate metrics.

    The input frame is never modified: all work happens on a filtered copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Pitch-level rows. Columns read: Batter, AutoPitchType, RelSpeed,
        InducedVertBreak, HorzBreak, TaggedHitType, PitchCall, ExitSpeed,
        Angle, PlateLocHeight, PlateLocSide and PitchNo.
    strike_zone : tuple of four floats, optional
        ``(top, bottom, left, right)`` bounds compared against
        PlateLocHeight / PlateLocSide. When omitted, the module-level
        ``strike_zone_top`` / ``strike_zone_bottom`` / ``strike_zone_left`` /
        ``strike_zone_right`` values are used.

    Returns
    -------
    pandas.DataFrame
        One row per (Batter, AutoPitchType) with columns Batter,
        AutoPitchType, PitchNo (count of non-null PitchNo values),
        total_batted_balls, fly_ball_pct, ground_ball_pct, line_drive_pct,
        barrel_pct, swing_pct, whiff_pct, z_swing_pct and o_swing_pct.
        Percentages are rounded to two decimals; a zero denominator yields
        NaN (or inf), and a group with no in-zone or no out-of-zone pitches
        has NaN for the corresponding zone rate.
    """
    if strike_zone is None:
        strike_zone = (strike_zone_top, strike_zone_bottom,
                       strike_zone_left, strike_zone_right)
    zone_top, zone_bottom, zone_left, zone_right = strike_zone

    # Filter first and take an explicit copy so the column assignments below
    # neither touch the caller's frame nor write into a slice of it.
    data = data[data['TaggedHitType'] != 'undefined'].copy()

    data['AutoPitchType'] = data['AutoPitchType'].replace({
        'ChangeUp': 'Changeup', 'Splitter': 'Changeup'
    })

    # Anything tagged Slider but thrown hard with little movement is
    # relabelled as a Cutter.
    looks_like_cutter = (
        (data['AutoPitchType'] == 'Slider')
        & (data['RelSpeed'] > 85)
        & data['InducedVertBreak'].between(0, 15)
        & data['HorzBreak'].between(-5, 5)
    )
    data.loc[looks_like_cutter, 'AutoPitchType'] = 'Cutter'

    # 0/1 indicator columns, built with vectorised comparisons.
    swing_calls = ['StrikeSwinging', 'FoulBallNotFieldable', 'InPlay']
    data['Swing'] = data['PitchCall'].isin(swing_calls).astype(int)
    data['Whiff'] = (data['PitchCall'] == 'StrikeSwinging').astype(int)

    hit_types = ['FlyBall', 'GroundBall', 'LineDrive', 'Popup']
    for hit_type in hit_types:
        data[hit_type] = (data['TaggedHitType'] == hit_type).astype(int)

    data['Barrel'] = ((data['ExitSpeed'] >= 95) &
                      (data['Angle'] >= 5) &
                      (data['Angle'] <= 37)).astype(int)

    # Rows with a missing plate location compare False and therefore count
    # as outside the zone.
    data['InStrikeZone'] = ((data['PlateLocHeight'] <= zone_top) &
                            (data['PlateLocHeight'] >= zone_bottom) &
                            (data['PlateLocSide'] >= zone_left) &
                            (data['PlateLocSide'] <= zone_right)).astype(int)

    data['Chase'] = (data['Swing'].eq(1) & data['InStrikeZone'].eq(0)).astype(int)
    data['BattedBallInPlay'] = data[hit_types].sum(axis=1)

    grouping_columns = ['Batter', 'AutoPitchType']

    metrics = data.groupby(grouping_columns).agg({
        'FlyBall': 'sum',
        'GroundBall': 'sum',
        'Popup': 'sum',
        'LineDrive': 'sum',
        'Barrel': 'sum',
        'Swing': 'sum',
        'Whiff': 'sum',
        'Chase': 'sum',
        'InStrikeZone': 'sum',
        'BattedBallInPlay': 'sum',
        'PitchNo': 'count'
    }).reset_index()

    # Swings and pitch counts split by in-zone / out-of-zone location.
    in_zone = data[data['InStrikeZone'] == 1]
    out_zone = data[data['InStrikeZone'] == 0]
    zone_parts = [
        in_zone.groupby(grouping_columns)['Swing'].sum().reset_index(name='Z_Swing'),
        in_zone.groupby(grouping_columns)['PitchNo'].count().reset_index(name='Z_Total'),
        out_zone.groupby(grouping_columns)['Swing'].sum().reset_index(name='O_Swing'),
        out_zone.groupby(grouping_columns)['PitchNo'].count().reset_index(name='O_Total'),
    ]
    for part in zone_parts:
        metrics = metrics.merge(part, on=grouping_columns, how='left')

    # output column -> (numerator column, denominator column)
    rate_specs = {
        'fly_ball_pct': ('FlyBall', 'BattedBallInPlay'),
        'ground_ball_pct': ('GroundBall', 'BattedBallInPlay'),
        'line_drive_pct': ('LineDrive', 'BattedBallInPlay'),
        'barrel_pct': ('Barrel', 'BattedBallInPlay'),
        'swing_pct': ('Swing', 'PitchNo'),
        'whiff_pct': ('Whiff', 'Swing'),
        'z_swing_pct': ('Z_Swing', 'Z_Total'),
        'o_swing_pct': ('O_Swing', 'O_Total'),
    }
    for out_col, (numerator, denominator) in rate_specs.items():
        metrics[out_col] = (metrics[numerator] / metrics[denominator] * 100).round(2)

    metrics['total_batted_balls'] = (
        metrics['FlyBall'] + metrics['GroundBall']
        + metrics['LineDrive'] + metrics['Popup']
    )

    percentage_columns = [
        'Batter', 'AutoPitchType',
        'PitchNo',
        'total_batted_balls',
        'fly_ball_pct', 'ground_ball_pct', 'line_drive_pct',
        'barrel_pct', 'swing_pct', 'whiff_pct',
        'z_swing_pct', 'o_swing_pct'
    ]
    return metrics[percentage_columns]

class PDF(FPDF):
    """FPDF document whose page header names the team and the pitcher hand."""

    def __init__(self, team, vs_hand, *args, **kwargs):
        """Store the header fields; remaining arguments go to FPDF."""
        super().__init__(*args, **kwargs)
        self.team = team
        self.vs_hand = vs_hand

    def header(self):
        """Write the centered 'Team: <team> vs. <hand>' banner line."""
        banner = f'Team: {self.team} vs. {self.vs_hand}'
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, banner, 0, 1, 'C')

    def chapter_title(self, title):
        """Write a bold, left-aligned title line followed by a small gap."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(4)

    def draw_percentile_bar(self, label, raw, pctile, total_width=60):
        """
        Draw one labelled bar at the current cursor position.

        The bar outline spans ``total_width`` minus the 25-unit label cell and
        is filled in proportion to ``pctile`` (0-100). The fill colour moves
        from blue toward red as ``pctile`` grows. The raw value and the
        ordinal percentile are printed to the right of the bar.
        """
        self.set_font('Arial', '', 8)
        self.cell(25, 5, label, 0, 0)

        outline_w = total_width - 25
        fill_w = outline_w * pctile/100
        red_level = int(255 * pctile/100)
        blue_level = 255 - red_level

        # Cursor now sits just right of the label cell: the bar starts here.
        bar_x = self.get_x()
        bar_y = self.get_y()

        self.set_draw_color(0, 0, 0)
        self.set_fill_color(red_level, 0, blue_level)
        self.rect(bar_x, bar_y, outline_w, 5)
        if fill_w > 0:
            self.rect(bar_x, bar_y, fill_w, 5, style='F')

        rank_text = ordinal(int(pctile))
        self.set_xy(bar_x + outline_w + 2, bar_y)
        self.cell(30, 5, f"{raw:.1f}% ({rank_text})", 0, 1)
        
# === 3) LEAGUE-WIDE PERCENTILE SETUP ===
# Rate columns (as produced by process_data) that get a league percentile
# and a league average.
metric_cols = ['swing_pct','whiff_pct','z_swing_pct','o_swing_pct','barrel_pct']

# Read every CSV in the league folder. The extension check here is
# case-insensitive, unlike the team-folder loads elsewhere in this script.
league_frames = []
for fn in os.listdir(AA_ALL_GAMES):
    if fn.lower().endswith('.csv'):
        df = pd.read_csv(os.path.join(AA_ALL_GAMES, fn))
        df = df[df['PitcherThrows']==pitcher_side]  # keep only the configured pitcher hand
        league_frames.append(df)
# pd.concat raises ValueError when league_frames is empty.
league_data = pd.concat(league_frames, ignore_index=True)

# Normalize league data: parse dates (dropping unparseable ones), fold
# 'ChangeUp'/'Splitter' into 'Changeup', drop 'undefined' hit types.
league_data['Date'] = pd.to_datetime(league_data['Date'], errors='coerce')
league_data = league_data.dropna(subset=['Date'])
league_data['AutoPitchType'] = league_data['AutoPitchType'].replace({'ChangeUp':'Changeup','Splitter':'Changeup'})
league_data = league_data[league_data['TaggedHitType']!='undefined']

# Compute league metrics and percentiles.
# Each (Batter, AutoPitchType) row is ranked against the other rows of the
# same pitch type; ties share the lowest rank (method='min'). The fractional
# rank is scaled to 0-100 and rounded to a whole number.
league_metrics = process_data(league_data)
for col in metric_cols:
    pct_col = f'{col}_pctile'
    # f'{col}_pctile' below is the same string as pct_col
    league_metrics[f'{col}_pctile'] = (
        league_metrics
        .groupby('AutoPitchType')[col]
        .rank(method='min', pct=True)
        .mul(100)
        .round(0)
    )
    # A raw value of exactly 0 is forced to the 0th percentile.
    league_metrics.loc[league_metrics[col] == 0, pct_col] = 0
league_pct = league_metrics[['Batter', 'AutoPitchType'] + [f'{c}_pctile' for c in metric_cols]]

# Compute league-wide averages for each metric x pitch type.
# Column names are rewritten by replacing '_pct' with '_avg'
# (e.g. 'swing_pct' -> 'swing_avg').
league_means = (
    league_metrics
      .groupby('AutoPitchType')[metric_cols]
      .mean()
      .rename(columns=lambda c: c.replace('_pct','_avg'))
      .reset_index()
)

# === 4) TEAM-SPECIFIC REPORT GENERATION ===
# NOTE(review): selected_team is hard-coded here, replacing whatever value it
# held before this point.
selected_team = 'CHI_DOG'
folder_path = team_folder_paths[selected_team]
save_folder_path = f'/Users/ajfoeckler/Downloads/{selected_team}_hitting'
os.makedirs(save_folder_path, exist_ok=True)

# Load team data (only pitches from the configured pitcher hand)
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
frames = []
for fn in csv_files:
    df = pd.read_csv(os.path.join(folder_path, fn))
    df = df[df['PitcherThrows']== pitcher_side]
    frames.append(df)
original_data = pd.concat(frames, ignore_index=True)

# Normalize team data (same steps as for the league data above)
original_data['Date'] = pd.to_datetime(original_data['Date'], errors='coerce')
original_data = original_data.dropna(subset=['Date'])
original_data['AutoPitchType'] = original_data['AutoPitchType'].replace({'ChangeUp':'Changeup','Splitter':'Changeup'})
original_data = original_data[original_data['TaggedHitType']!='undefined']

# Batter-team mapping & overrides
original_data['Batter'] = original_data['Batter'].str.strip()
batter_team_mapping = original_data[['Batter','BatterTeam']].drop_duplicates().set_index('Batter')['BatterTeam'].to_dict()
# Force this batter onto the selected team in the mapping.
manual_over = {'Ordonez, Ernny':selected_team}
# NOTE(review): orig_team is not read anywhere in the visible code.
orig_team = {'Ordonez, Ernny':'KAN_COU'}
for b,n in manual_over.items(): batter_team_mapping[b]=n

# Compute team metrics & merge percentiles.
# The left merge keeps every team row; (Batter, AutoPitchType) pairs with no
# league match get percentile 0 via fillna.
metrics_df = process_data(original_data)
metrics_df = metrics_df.merge(league_pct,
                              on=['Batter','AutoPitchType'],
                              how='left')
pctile_cols = [f'{c}_pctile' for c in metric_cols]
metrics_df[pctile_cols] = metrics_df[pctile_cols].fillna(0)

# Merge in the league averages (the '*_avg' columns, one set per pitch type).
# NOTE(review): the visible generate_pdf_report does not read these columns.
metrics_df = metrics_df.merge(
    league_means,
    on='AutoPitchType',
    how='left'
)


# Bring in batter handedness so we can label each page.
# drop_duplicates keeps the first BatterSide seen for each batter.
metrics_df = metrics_df.merge(
    original_data[['Batter','BatterSide']]
      .drop_duplicates(subset=['Batter']),
    on='Batter',
    how='left'
)

# PDF generation function
def generate_pdf_report(metrics_df, raw_df, team, out_path, mapping, vs_hand):
    """
    Write a PDF with one page of percentile bars per batter on ``team``.

    Parameters
    ----------
    metrics_df : pandas.DataFrame
        One row per (Batter, AutoPitchType). Columns read: Batter,
        AutoPitchType, BatterSide, PitchNo, and for each of swing_pct,
        z_swing_pct, o_swing_pct and barrel_pct both the raw column and its
        ``<name>_pctile`` companion.
    raw_df : pandas.DataFrame
        Pitch-level data. Accepted for interface compatibility; not used.
    team : str
        Team code; only batters mapped to this code get a page.
    out_path : str
        Destination path of the PDF file.
    mapping : dict
        Batter name -> team code. Pages follow this dict's iteration order.
    vs_hand : str
        Pitcher-hand label shown in the page header.

    Batters on the exclusion list, and batters without any summary row, are
    skipped. A batter whose BatterSide is missing is labelled '?'.
    """
    pdf = PDF(team, vs_hand)
    pdf.set_auto_page_break(False)

    # Batters that never get a page. A set gives O(1) membership tests.
    excluded_batters = {
        "Cespedes, Yoelqui", "McDaniel, Chase", "Turbo, Johnni", "Pettigrew, Zion",
        "Sosa, Gus", "Novak, Nicholas", "Coulter, Clint", "Novak, Nick",
        "Alexander, Evan", "Davis, Jaylin", "Spinn, Zane", "Aldrete, Carter",
        "Alexander, Hill", "Berglund, Michael", "Bigford, Trey", "Boswell, Bret",
        "Brocato, Anthony", "Brothers, Blaze", "Cantleberry, Jake", "Dillard, Thomas",
        "Duzenack, Camden", "Groshans, Jaxx", "Holland, Korey", "Mechals, Kade",
        "O'Grady, Brian", "Palmeiro, Preston", "Velez, Antonio", "Washington, Mark",
        "Givin, Matt", "Huckstorf, Kyle", "Layer, Abdiel", "Henson, Spencer",
        "Stroh, Parker", "Dragum, Jack", "Denning, Connor", "Sparks, Lamar",
        "Chiu, Marcus", "Davis, Colten", "Dykhoff, Jake", "Green, Thomas",
        "Farmer, Justin", "Gonzalez, Marcos", "Sierra, Miguelangel", "Hoover, Jake",
        "Galan, Yosy", "Cajigas, Julio", "Waite, Jonathan", "McCurdy, Carson",
        "Arroyo, Edwin", "Croes, Dayson", "Emery, Robert", "Enriquez, Roby",
        "McMurray, Jake", "Schwabe, Cadyn", "Simington, Miles", "Reeves, Dalton",
        "Byrd, Justin", "vooletich, zac", "Barfield, Jacob", "Busch, Will",
        "Corona, Emilio", "Gomez, Dario", "Green, Jake", "Jackson, Jaren",
        "Lingua, Daniel", "Martinez, Osvaldo", "Meyer, Jake", "Montano, Daniel",
        "Nogowski, John", "Perez, Daniel", "Womack, Alsander", "Ward, Je'Von",
        "Dorighi, Brennen", "Childs, Dwight", "Drury, Austin", "Perez, Henderson",
        "Ramon, Amos", "Wetherbee, Jared", "Zinn, Delvin", "Bockelie, Jacob",
        "Decker, Will", "Fields, Brandon", "Gulino, Michael", "Hurd, Aaron",
        "Perez, Yanio", "Perez, Mikey", "Smith, Chad", "Calarco, Anthony",
        "Campagna, Joe", "Grant-Parks, Blake", "Maberry, David", "Mcarthur, General",
        "Parks, Pavin", "Aguilar, Bryan", "Amaral, Danny", "Armaral, Danny",
        "Milam, Kevin", "Ortiz, Channey", "Hjelle, Jake", "Phillips, Dakota",
        "Siket, Jordan", "Christopher, Shamoy", "Eickhoff, Logan", "Foster, Kendall",
        "Meiners, Tate", "Seay, Mason", "Brusa, Gio", "Hall, Adam",
        "Holgate, Ryan", "Turner, Braxton", "Escala, Willie", "Rodriguez, Brett",
        "Smith, Armani", "Cushing, Jared", "Del Valle, Francisco", "Panzetta, Nick",
        "Barranca, Antonio", "Pike, Chad", "Marrero, Wendell", "Costes, Marty",
        "Crook, Narciso", "Harris, Chase", "Abbatine, Anthony", "Diaz, Gio",
        "Levari, Kenneth", "Quintana, Guillermo", "Rincon, Carlos", "Valera, Jackson",
        "Law, Trey", "Santos, Oscar", "Taylor, John", "Cannon, Cameron",
        "Davis, Jonah", "Randolph, Cornelius", "Reid, Simon", "Smith, Harrison",
        "Dalesandro, Nick", "Reyes, Bryan", "Doersching, Griffin", "Morales, Roy",
        "Alcantara, Ismael", "Dexter, Sam", "Valdez, CJ", "Livorsi, Ben",
        "Meza, Eric", "Quiggle, Kona", "Ward, Drew", "Mount, Drew",
        "Sarringar, Spencer", "Spence, Liam", "Ulrich, Wyatt", "Ordonez, Ernny",
        "Cedrola, Lorenzo", "Gomez, Moises", "O'Conner, Justin", "Avelino, Abiatal",
        "Cannon, Cam", "Maxwell, Carson", "Amaral, Daniel", "Bradley, Tucker",
        "Rutherford, Blake", "Fry, Jared", "Anderson, Nick", "Awtry, Marshall",
        "Awtry , Marshall", "Baeza, Alex", "Conners, Dakota", "DeVine, Drew",
        "Epp, Alex", "Hewitt, Max", "Mattis, Gary", "Ortega, Jake",
        "Pita, Matt", "Roskam, Luke", "Takacs, Aaron", "Williams, Logan",
        "Zurbrugg, Zane",
    }
    # These names are always reported, even if they are added to the set above.
    excluded_batters -= {"Sermo, Jose", "Lujano, Jesus"}

    # Get just this team's batters, in mapping order.
    batters = [
        b for b, t in mapping.items()
        if t == team and b not in excluded_batters
    ]

    # The left half of the printable width is reserved for the bars.
    usable_w = pdf.w - pdf.l_margin - pdf.r_margin
    bar_w = usable_w * 0.5

    pitch_order = ['Four-Seam', 'Sinker', 'Curveball', 'Slider', 'Changeup', 'Cutter']
    # (bar label, raw-value column); the percentile column is '<column>_pctile'.
    bars = [
        ('Swing %', 'swing_pct'),
        ('Z-Swing %', 'z_swing_pct'),
        ('Chase %', 'o_swing_pct'),
        ('Barrel %', 'barrel_pct'),
    ]

    for batter in batters:
        summary = metrics_df[
            (metrics_df['Batter'] == batter) &
            (metrics_df['AutoPitchType'].notnull())
        ]
        if summary.empty:
            # Nothing to draw for this batter; avoids indexing an empty frame.
            continue

        hand = summary['BatterSide'].iloc[0]
        if isinstance(hand, str):
            hand_text = 'L' if hand.lower().startswith('l') else 'R'
        else:
            # Missing handedness (e.g. NaN) - label as unknown, don't crash.
            hand_text = '?'

        pdf.add_page()
        pdf.chapter_title(f'Batter: {batter} ({hand_text})')

        for pt in pitch_order:
            row = summary[summary['AutoPitchType'] == pt]
            if row.empty:
                continue

            cnt = int(row['PitchNo'].iloc[0])
            pdf.set_font('Arial', 'B', 9)
            pdf.cell(0, 5, f"{pt} ({cnt})", 0, 1)
            pdf.ln(0.5)

            last_index = len(bars) - 1
            for index, (label, col) in enumerate(bars):
                pdf.draw_percentile_bar(
                    label,
                    row[col].iloc[0],
                    row[f'{col}_pctile'].iloc[0],
                    total_width=bar_w,
                )
                # Small gap between bars, larger gap before the next pitch type.
                pdf.ln(0.5 if index < last_index else 2)

    pdf.output(out_path)

# Output file: <team>_percentiles!_<RHP|LHP>.pdf inside the save folder
output_pdf = os.path.join(save_folder_path, f'{selected_team}_percentiles!_{pitcher_abbrev}.pdf')

generate_pdf_report(
    metrics_df=metrics_df,        # aggregated per-batter, per-pitch metrics
    raw_df=original_data,         # pitch-level team rows
    team=selected_team,           # team code whose batters are reported
    out_path=output_pdf,          # where the PDF is written
    mapping=batter_team_mapping,  # batter name -> team code
    vs_hand=pitcher_abbrev,       # header label for the pitcher hand
)

print(f"PDF report created: {output_pdf}")

Available teams: ['CHI_DOG' 'SIO_CIT1' 'LIN_SAL' 'FAR_RED' 'WIN_GOL' 'LAK_COU10' 'MIL_MIL1'
 'KAN_COU' 'CLE_RAI' 'KAN_CIT3' 'SIO_FAL' 'GAR_SOU']
Please enter the team code to generate the report for: CHI_DOG
PDF report created: /Users/ajfoeckler/Downloads/CHI_DOG_hitting/CHI_DOG_percentiles!_RHP.pdf


  self.set_font('Arial','B',12)
  self.cell(0,10,f'Team: {self.team} vs. {self.vs_hand}',0,1,'C')
  self.set_font('Arial','B',12)
  self.cell(0,10,title,0,1,'L')
  pdf.set_font('Arial','B',9)
  pdf.cell(0,5,f"{pt} ({cnt})",0,1)
  self.set_font('Arial','',8)
  self.cell(25,5,label,0,0)
  self.cell(30,5,f"{raw:.1f}% ({pct_lbl})",0,1)
  pdf.set_font('Arial','I',7)
