In [1]:
# pull in dependencies needed from sportsreference api
from sportsreference.mlb.teams import Teams
from sportsreference.mlb.teams import Roster
from sportsreference.mlb.roster import Player

# pull in all other dependencies needed
import itertools
import pandas as pd
import time

In [None]:
# Collect one roster per team per season; the player ids are pulled out of
# these rosters in the following cells.
player_list = []

# outer loop: every season from 1876 through 2019 (the range end is exclusive)
for year in range(1876, 2020):

    # inner loop: every team listed for that season
    for team in Teams(year):
        # slim=True is passed through to Roster unchanged; only the roster's
        # .players attribute is kept
        team_roster = Roster(team.abbreviation, year=year, slim=True)
        player_list.append(team_roster.players)

In [None]:
# turn the list of roster dictionaries into a list of lists that holds only
# each dictionary's keys (used as the player ids further down)
info = [[*roster.keys()] for roster in player_list]

In [None]:
# collapse the per-roster lists into one flat list, preserving order
merged = list(itertools.chain.from_iterable(info))

In [None]:
# load the flattened ids into a DataFrame; they are read back from column 0
# in the next cell
df = pd.DataFrame(data=merged)

In [None]:
# take column 0 of the id DataFrame and keep each id once, so a player who
# shows up on several rosters is only listed a single time
unique_player_id = df.loc[:, 0].unique()

In [None]:
# Build a list with one stats DataFrame per player id.
# NOTE(review): each Player(...) lookup is presumably a web request, which
# would explain the one-second pause between iterations -- confirm.
player_df_list = []

# columns kept from every player's DataFrame
stat_columns = [
    "player_id", "name", "at_bats", "runs", "hits", "runs_batted_in",
    "bases_on_balls", "batting_average", "doubles", "triples", "home_runs",
    "stolen_bases",
]

# the total does not change while looping, so compute it once
total_players = len(unique_player_id)

for index, player_id in enumerate(unique_player_id):
    try:
        player_df = Player(player_id).dataframe[stat_columns]
        player_df_list.append(player_df)
        print(index, total_players)
    except Exception as error:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop this long-running loop.
        # Report which player was skipped and why instead of a bare "skip".
        print("skip", player_id, repr(error))
    time.sleep(1)

In [None]:
# Use .loc to pull only the "Career" entry out of every player's DataFrame.
# A comprehension is used so that its variable stays local: the previous
# `for df in ...` loop rebound the notebook-level `df` (the player-id
# DataFrame built earlier), which broke re-running the cells that read it.
career_df_list = [player_df.loc["Career"] for player_df in player_df_list]

In [None]:
# stack the per-player career entries into one DataFrame
clean_df = pd.concat(objs=career_df_list)

# a player may appear more than once; keep the first row for each 'player_id'
# ('name' would be the alternative key)
mlb_history_df = clean_df.drop_duplicates(subset=['player_id'], keep='first')
mlb_history_df.head()

In [None]:
# load the Hall of Fame list from the Resources folder and preview the top rows
hall_of_fame_df = pd.read_csv(filepath_or_buffer='Resources/Hall_of_Fame_List.csv')
hall_of_fame_df.head(n=5)

In [None]:
# reduce the Hall of Fame table to the inductee name and primary position

# drop every row that has a missing value in any column
hall_of_fame_df = hall_of_fame_df.dropna(axis=0, how="any")

# rename the two columns of interest to the lowercase names used for the merge
hof_df = hall_of_fame_df.rename(
    columns={"Name": "name", "Primary position": "position"}
)

# keep just those two columns
hof_cleaned = hof_df.loc[:, ["name", "position"]]
hof_cleaned.head()

In [None]:
# join the de-duplicated career stats (mlb_history_df) with hof_cleaned to show
# career stats alongside hall of fame status
# NOTE(review): the join key is 'name' only, so distinct players who share a
# name all match the same Hall of Fame row; how='outer' also keeps Hall of
# Fame names that have no stats row -- confirm both are intended.
merge_df = mlb_history_df.merge(hof_cleaned, on='name', how='outer')

# Only the hall of fame column gets the 'no' default. A frame-wide
# fillna('no') would also overwrite every missing statistic (and player_id)
# with the string 'no', turning numeric columns into mixed text columns.
final_df = (
    merge_df
    .rename(columns={"position": "hall_of_fame"})
    .fillna({"hall_of_fame": "no"})
)
final_df

In [None]:
# write the final table to the Resources folder as a csv, without the index
final_df.to_csv(path_or_buf='Resources/player_data.csv', index=False)