In [2]:
import json
import os
import time

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

In [None]:
# Download the league-wide standard batting page for each season and save
# the player table as years/<year>.json (a list of {column header: cell text}).

# Seasons to scrape; range() excludes its stop value, so this is 2001-2023.
years = list(range(2001, 2024))
url_base = "https://www.baseball-reference.com/leagues/majors/{y}-standard-batting.shtml"

# Pause between requests so the loop does not hammer the site.
# NOTE(review): pick the value from the site's current bot policy - confirm.
REQUEST_DELAY_SECONDS = 4

# The output directory may not exist on a fresh checkout.
os.makedirs('years', exist_ok=True)

for year in years:
    url = url_base.format(y=year)

    response = requests.get(url, timeout=30)
    # Fail loudly on an error / rate-limit response instead of parsing it.
    response.raise_for_status()

    # Comment delimiters are stripped before parsing - presumably because the
    # per-player table is delivered inside an HTML comment. The parser is
    # named explicitly so results do not depend on which parsers are installed.
    soup = BeautifulSoup(
        response.text.replace('<!--', '').replace('-->', ''),
        'html.parser',
    )

    tables = soup.find_all('table')
    if len(tables) < 2:
        raise RuntimeError(f'expected at least 2 tables at {url}, found {len(tables)}')
    table = tables[1]

    rows = table.find_all('tr')

    # Column names come from the first row; its first <th> is skipped because
    # the data rows below are built from <td> cells only.
    tableheader = [head.text for head in rows[0].find_all('th')[1:]]

    # One list of cell texts per data row. Rows without any <td> cells are
    # dropped; they would otherwise turn into empty {} records.
    tabledata = [
        cells
        for cells in ([col.text for col in row.find_all('td')] for row in rows[1:])
        if cells
    ]

    result = [dict(zip(tableheader, t)) for t in tabledata]

    with open(f'years/{year}.json', 'w') as f:
        json.dump(result, f)

    time.sleep(REQUEST_DELAY_SECONDS)



In [None]:
# Limits each season file to qualified players only, and adds a 'year'
# attribute to every player so that once the records are separated by
# position it is easy to see which season each one came from.
# Each years/<year>.json file is rewritten in place.

# Minimum plate appearances for a player to be kept as "qualified".
MIN_PLATE_APPEARANCES = 502


def _plate_appearances(player):
    """Return the record's 'PA' value as an int; 0 if missing, blank or non-numeric."""
    try:
        return int(player.get('PA', 0))
    except (TypeError, ValueError):
        # A blank cell ('') or other non-numeric text counts as zero PA, so
        # the row is filtered out rather than aborting the whole run.
        return 0


for year in years:
    file = f'years/{year}.json'
    with open(file, 'r') as f:
        data = json.load(f)

    filtered_data = [
        player for player in data
        if _plate_appearances(player) >= MIN_PLATE_APPEARANCES
    ]
    for player in filtered_data:
        player['year'] = str(year)

    with open(file, 'w') as f:
        json.dump(filtered_data, f)


In [None]:
# Separate players by primary position.
#
# If there is an asterisk (*) in the position summary in front of a position,
# it means the player played over 2/3 of his games at that position.

catcher, first_base, second_base, third_base = [], [], [], []
shortstop, left_field, center_field, right_field = [], [], [], []

# Character that follows the leading '*' -> list that collects those players.
# Any other character is ignored.
_position_buckets = {
    '2': catcher,
    '3': first_base,
    '4': second_base,
    '5': third_base,
    '6': shortstop,
    '7': left_field,
    '8': center_field,
    '9': right_field,
}

for year in years:
    file = f'years/{year}.json'
    with open(file, 'r') as f:
        data = json.load(f)

    for player in data:
        pos_summary = player.get('Pos\xa0Summary')
        # Only players whose summary starts with '*' are assigned a position.
        if not pos_summary or pos_summary[0] != '*':
            continue
        bucket = _position_buckets.get(pos_summary[1])
        if bucket is not None:
            bucket.append(player)


In [None]:
# Create one JSON file per position from the lists built above.

_position_outputs = [
    ('positions/catcher.json', catcher),
    ('positions/first_base.json', first_base),
    ('positions/second_base.json', second_base),
    ('positions/third_base.json', third_base),
    ('positions/shortstop.json', shortstop),
    ('positions/left_field.json', left_field),
    ('positions/center_field.json', center_field),
    ('positions/right_field.json', right_field),
]

for out_path, players in _position_outputs:
    with open(out_path, 'w') as f:
        json.dump(players, f)




In [None]:
# Get all individual player names per position, eliminating repeats and
# stripping the '*' and '#' marker characters from the names. The result is
# written to positions/names/<position>_names.json.
positions = ["catcher", "first_base", "second_base", "third_base", "shortstop", "left_field", "center_field", "right_field"]

# Raw (uncleaned, possibly repeated) names per position, keyed "<position>_names".
position_names = {f'{position}_names': [] for position in positions}

# The output directory may not exist on a fresh checkout.
os.makedirs('positions/names', exist_ok=True)

for position in positions:
    with open(f'positions/{position}.json', 'r') as f:
        data = json.load(f)

    raw_names = position_names[f'{position}_names']
    raw_names.extend(str(player['Name']) for player in data)

    # Clean first, then deduplicate, so two spellings that differ only by a
    # marker character collapse into one entry. Sorting makes the file
    # identical from run to run (set iteration order is not stable).
    unique_json_data = sorted(
        {name.replace("*", "").replace("#", "") for name in raw_names}
    )

    # Written once per position; an empty position produces an empty list.
    with open(f'positions/names/{position}_names.json', 'w') as f2:
        json.dump(unique_json_data, f2)


In [None]:
# Strip the '*' and '#' characters from every player's name in the larger
# per-position JSON files, rewriting each file in place.
positions = ["catcher", "first_base", "second_base", "third_base", "shortstop", "left_field", "center_field", "right_field"]

for position in positions:
    position_path = f'positions/{position}.json'

    with open(position_path, 'r') as f:
        data = json.load(f)

    for player in data:
        cleaned_name = player['Name'].replace("*", "")
        player['Name'] = cleaned_name.replace("#", "")

    with open(position_path, 'w') as f2:
        json.dump(data, f2)




In [5]:
# Add 'League' and 'Division' to each player's record in the per-position
# files, based on the team abbreviation in 'Tm' (and on 'year' for Houston).
positions = ["catcher", "first_base", "second_base", "third_base", "shortstop", "left_field", "center_field", "right_field"]

# Division -> team abbreviations. Includes the older abbreviations used in
# the earlier seasons of the scraped range: ANA (Angels), MON (Expos),
# TBD (Devil Rays) and FLA (Marlins).
_DIVISION_TEAMS = {
    "AL East": ("NYY", "BOS", "TOR", "BAL", "TBR", "TBD"),
    "AL Central": ("CHW", "KCR", "MIN", "DET", "CLE"),
    "AL West": ("LAA", "ANA", "TEX", "SEA", "OAK"),
    "NL East": ("ATL", "NYM", "WSN", "MON", "MIA", "FLA", "PHI"),
    "NL Central": ("CHC", "STL", "CIN", "MIL", "PIT"),
    "NL West": ("LAD", "SFG", "SDP", "ARI", "COL"),
}
_TEAM_TO_DIVISION = {
    team: division
    for division, teams in _DIVISION_TEAMS.items()
    for team in teams
}


def _division_for(player):
    """Return the division name for a player record, or None if the team is not mapped."""
    team = player['Tm']
    # Exact comparison: a substring test (`team in "HOU"`) would also match
    # "", "H", "OU", etc.
    if team == "HOU":
        # Houston is placed in the NL Central before 2013 and the AL West after.
        return "NL Central" if int(player['year']) < 2013 else "AL West"
    return _TEAM_TO_DIVISION.get(team)


for position in positions:
    position_path = f'positions/{position}.json'

    with open(position_path, 'r') as f:
        data = json.load(f)

    for player in data:
        division = _division_for(player)
        if division is None:
            # Unmapped team values are left without League/Division keys.
            continue
        # Every division name starts with its league ("AL ..." / "NL ...").
        player['League'] = division.split()[0]
        player["Division"] = division

    with open(position_path, 'w') as f2:
        json.dump(data, f2)







