In [15]:
import numpy as np
import pandas as pd
import statsapi

# Raise pandas' display limits so wide/long stat tables are shown without truncation.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 1000)

from urllib.request import urlopen

from bs4 import BeautifulSoup
import requests

from pybaseball import playerid_reverse_lookup
import random

# Column names for one whitespace-separated season line of a "Pitching Record"
# table, in left-to-right order. 'Daily'/'Splits' and the trailing '*X' names
# are placeholder columns that get_player_df drops after parsing
# (presumably link text / repeated columns on the page -- TODO confirm).
pitching_header_tokens = [
    'Year', 'Team', 'League', 'Daily', 'Splits',
    'G', 'GS', 'CG', 'SHO', 'GF', 'SV', 'IP',
    'H', 'BFP', 'HR', 'R', 'ER', 'BB', 'IB', 'SO',
    'SH', 'SF', 'WP', 'HBP', 'BK', '2B', '3B', 'GDP', 'ROE',
    'W', 'L', 'ERA', 'RS', 'PW',
    'YearX', 'TeamX', 'LeagueX',
]

# Same idea for a "Batting Record" table.
batting_header_tokens = [
    'Year', 'Team', 'League', 'Daily', 'Splits',
    'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
    'BB', 'IBB', 'SO', 'HBP', 'SH', 'SF', 'XI', 'ROE', 'GDP',
    'SB', 'CS', 'AVG', 'OBP', 'SLG', 'BFW',
    'YearX', 'TeamX', 'LeagueX',
]



In [16]:
def get_player_df(url, header_tokens, record, timeout=30):
    """Scrape one season-by-season stats table from a player page.

    The page at ``url`` is downloaded and parsed, the first section whose
    markup contains ``record`` is located, and every text line in it that
    starts with "19" or "20" is split on whitespace into one table row.

    Parameters
    ----------
    url : str
        Address of the player page.
    header_tokens : list of str
        Column names, in the order the tokens appear on each season line.
    record : str
        Text identifying the wanted section (e.g. "Pitching Record").
    timeout : float, optional
        Seconds to wait for the server; without it ``requests`` can block
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per season line with at least ``len(header_tokens)`` tokens.
        All values are strings. The columns 'Daily', 'Splits', 'YearX',
        'TeamX' and 'LeagueX' are removed when present. The frame is empty
        (but has its columns) when no season line qualifies.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If no section containing ``record`` is found, or the page does not
        have the expected nesting.
    """
    page = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx here instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Positional walk down the document: second-to-last top-level node, then
    # its last child (presumably <html> and <body> -- verify against the site).
    root = list(soup.children)[-2]
    body = list(root.children)[-1]
    sections = list(body.children)

    matching = [node for node in sections if record in str(node)]
    if not matching:
        # Same exception type the positional lookup used to raise, but with
        # a message that says what is actually missing.
        raise IndexError(f"no section containing {record!r} found at {url}")
    section = matching[0]

    # Strip <b> tags but keep their text so bold cells stay on the same line.
    for bold in section.find_all('b'):
        bold.unwrap()

    # Season lines are recognised by a leading century ("19.." / "20..").
    season_lines = [
        line for line in section.get_text().split("\n")
        if line.startswith(("19", "20"))
    ]

    expected_len = len(header_tokens)
    # Lines with too few tokens cannot be mapped onto the header and are
    # skipped. Lines with more tokens than the header are kept, as before,
    # and make the DataFrame constructor raise.
    rows = [
        tokens
        for tokens in (line.split() for line in season_lines)
        if len(tokens) >= expected_len
    ]

    # Passing the full header also works for zero rows, so an empty table
    # yields an empty frame instead of an IndexError.
    df = pd.DataFrame(rows, columns=list(header_tokens))

    return df.drop(
        columns=['Daily', 'Splits', 'YearX', 'TeamX', 'LeagueX'],
        errors='ignore',
    )

In [17]:
def _read_id_column(path):
    """Return the non-null values of the first column of a header-less CSV as a list."""
    frame = pd.read_csv(path, header=None)
    # Select the column explicitly: DataFrame.squeeze() turns a one-row file
    # into a scalar (no .dropna()) and leaves a multi-column file as a frame
    # (no .tolist()).
    return frame.iloc[:, 0].dropna().tolist()


all_p_id = _read_id_column('all_p_id.csv')

all_b_id = _read_id_column('all_b_id.csv')

# Keep only batters who are not also listed as pitchers, de-duplicated.
# Filtering in file order (instead of list(set_a - set_b)) keeps the list order
# stable between kernel restarts, so a seeded random draw is reproducible.
_pitcher_ids = set(all_p_id)
all_b_id = list(dict.fromkeys(b_id for b_id in all_b_id if b_id not in _pitcher_ids))

In [18]:
def get_player_info(p_ids, min_career_length, current_season):
    """Pick a random player id that satisfies the career filters.

    Ids are tried in random order, each at most once per occurrence in
    ``p_ids``, so the search always terminates.

    Parameters
    ----------
    p_ids : sequence
        Candidate ids, looked up with ``key_type='retro'``.
    min_career_length : int
        Minimum value of ``mlb_played_last - mlb_played_first``.
    current_season : int
        Required value of ``mlb_played_last``.

    Returns
    -------
    (data, random_id)
        ``data`` is the lookup result for the chosen id (its first row is the
        player); ``random_id`` is the id that was chosen.

    Raises
    ------
    ValueError
        If ``p_ids`` is empty or none of the ids satisfies the filters.
    """
    candidates = list(p_ids)

    # random.sample with k == len(...) is a random permutation: sampling
    # without replacement avoids repeated lookups and bounds the loop.
    for random_id in random.sample(candidates, len(candidates)):
        data = playerid_reverse_lookup([random_id], key_type='retro')

        if data.empty:
            continue

        first_year = data.iloc[0]['mlb_played_first']
        last_year = data.iloc[0]['mlb_played_last']

        # Missing (NaN) years make both comparisons False, so such players
        # are skipped.
        if (last_year - first_year) >= min_career_length and last_year == current_season:
            return data, random_id

    raise ValueError(
        f"no player among {len(candidates)} ids has a career of at least "
        f"{min_career_length} and last played in {current_season}"
    )



In [19]:
def _fetch_player_record(p_ids, min_career_length, current_season, header_tokens, record):
    """Draw a random qualifying player from ``p_ids`` and scrape one of their tables.

    Returns ``(data, df)``: the player lookup result and the parsed table
    whose section is identified by ``record``.
    """
    data, random_id = get_player_info(p_ids, min_career_length, current_season)

    # Pages are filed under the upper-cased first letter of the last name;
    # the "P" file-name prefix is the same for pitchers and batters.
    letter = data.iloc[0]['name_last'][0].upper()
    url = "".join(("https://www.retrosheet.org/boxesetc/", letter, "/P", random_id, ".htm"))

    df = get_player_df(url, header_tokens, record=record)
    return data, df


def get_pitcher_data(min_career_length, current_season, pitching_header_tokens):
    """Return ``(data, df)`` for a random pitcher from ``all_p_id``; ``df`` is the "Pitching Record" table."""
    return _fetch_player_record(
        all_p_id, min_career_length, current_season,
        pitching_header_tokens, "Pitching Record",
    )


def get_batter_data(min_career_length, current_season, batting_header_tokens):
    """Return ``(data, df)`` for a random batter from ``all_b_id``; ``df`` is the "Batting Record" table."""
    return _fetch_player_record(
        all_b_id, min_career_length, current_season,
        batting_header_tokens, "Batting Record",
    )

In [33]:
import random

# Filters for the random draw.
min_career_length = 1
min_start_season = 2000  # NOTE(review): assigned here but not passed to anything in this cell
current_season = 2024

# Coin flip: 1 -> pitcher, otherwise (2) -> batter.
p_or_b = random.choice([1, 2])

if p_or_b != 1:
    print("Batting Stats")
    data, df = get_batter_data(min_career_length, current_season, batting_header_tokens)
else:
    print("Pitching Stats")
    data, df = get_pitcher_data(min_career_length, current_season, pitching_header_tokens)

# Biographical details come from the MLB Stats API, keyed by the MLBAM id.
_player_row = data.iloc[0]
_mlbam_id = int(_player_row['key_mlbam'])
info = statsapi.player_stat_data(
    _mlbam_id,
    group="[hitting,pitching,fielding]",
    type='season',
    sportId=1,
    season=None,
)

for _label, _field in (
    ("Player Position:", 'position'),
    ("Bat Side:", 'bat_side'),
    ("Pitch Hand:", 'pitch_hand'),
    ("Active:", 'active'),
):
    print(_label, info[_field])

# Built here but only printed by a later cell.
player_name = " ".join((_player_row['name_first'], _player_row['name_last']))


df

Batting Stats
Player Position: SS
Bat Side: Left
Pitch Hand: Right
Active: False


Unnamed: 0,Year,Team,League,G,AB,R,H,2B,3B,HR,RBI,BB,IBB,SO,HBP,SH,SF,XI,ROE,GDP,SB,CS,AVG,OBP,SLG,BFW
0,2011,SF,N,66,196,22,40,5,2,3,21,23,1,31,0,1,0,0,2,4,1,3,0.204,0.288,0.296,-0.6
1,2012,SF,N,143,435,44,108,26,3,4,45,33,6,95,3,2,3,0,3,4,1,4,0.248,0.304,0.349,2.0
2,2013,SF,N,149,499,52,124,24,3,9,43,42,6,96,5,1,3,0,4,10,1,2,0.248,0.311,0.363,-0.5
3,2014,SF,N,153,491,54,121,20,10,10,69,59,10,129,2,2,10,0,1,4,5,3,0.246,0.324,0.389,1.4
4,2015,SF,N,143,507,65,130,33,4,21,84,39,9,119,11,0,4,0,5,18,6,4,0.256,0.321,0.462,2.7
5,2016,SF,N,155,553,67,152,28,11,12,84,57,10,115,4,0,9,0,1,13,7,0,0.275,0.342,0.43,2.0
6,2017,SF,N,144,518,58,131,34,1,14,77,42,3,113,1,0,9,0,4,18,3,5,0.253,0.305,0.403,-0.5
7,2018,SF,N,151,531,63,135,28,2,14,54,50,13,122,8,0,5,0,3,12,4,5,0.254,0.325,0.394,2.7
8,2019,SF,N,147,500,58,114,24,2,11,59,53,5,117,3,0,4,0,6,10,3,2,0.228,0.304,0.35,-1.4
9,2020,SF,N,54,172,26,44,12,0,8,28,15,2,47,4,0,2,0,1,3,1,2,0.256,0.326,0.465,0.6


In [34]:
print(player_name)

brandon crawford
