In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup

In [2]:
def firstHalf(year):
    """Scrape one season of the FanGraphs batting leaderboard into a dict.

    The request targets the leaderboard page with ``season`` and ``season1``
    both set to ``year`` and ``month=30`` (presumably FanGraphs' first-half
    split, going by the function name -- confirm against the site).

    Parameters
    ----------
    year : int or str
        Season to request; it is interpolated into the URL via ``str(year)``.

    Returns
    -------
    dict
        ``{player_name: {'team': team, <stat header>: <cell text>, ...}}``.
        Every value is the raw text scraped from the page (no numeric
        conversion happens here). Rows are keyed by player name, so two
        players sharing a name overwrite one another.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within the timeout.
    IndexError
        If a row has no link to take the name from, or has more stat cells
        than there are stat headers.
    """
    url = (
        'https://www.fangraphs.com/leaders.aspx'
        '?pos=all&stats=bat&lg=all&qual=200&type=8'
        '&season=' + str(year) +
        '&month=30&season1=' + str(year) +
        '&ind=0&team=0&rost=0&age=0&filter=&players=0&page=1_500'
    )
    # A timeout keeps a stalled connection from hanging the notebook forever,
    # and raise_for_status stops an error page from being parsed as "no players".
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Collect stat headers (should be uniform); the identifying columns are
    # dropped because name and team are stored separately below.
    identity_headers = ('#', 'Name', 'Team')
    headers = [
        th.get_text()
        for th in soup.find_all('th', {'class': ['rgHeader']})
        if th.get_text() not in identity_headers
    ]

    # Collect player stats
    players_dict = {}
    for row in soup.find_all('tr', {'class': ['rgRow', 'rgAltRow']}):
        links = row.find_all('a', {'href': True})
        name = links[0].get_text()
        # A row with only the player link has no team link; such rows are
        # labelled 'Multiple' (presumably players who appeared for several teams).
        team = links[1].get_text() if len(links) > 1 else 'Multiple'

        # Skip the first three cells, which line up with the '#', 'Name' and
        # 'Team' headers removed above.
        stat_cells = row.find_all(
            'td', {'class': ['grid_line_regular', 'grid_line_break']}
        )[3:]

        player_stats = {'team': team}
        for num, stat in enumerate(stat_cells):
            player_stats[headers[num]] = stat.get_text()
        players_dict[name] = player_stats

    return players_dict

In [3]:
# Scrape every season in the range below (currently only 2018) into
# master_dict, keyed by year with firstHalf's per-player dict as the value.
# `run` gates the network-bound scrape: when it is False this cell does not
# assign master_dict at all, so later cells rely on whatever the kernel holds.
run = True
if run:
    master_dict = {}
    for year in range(2018, 2018 + 1):
        master_dict[year] = firstHalf(year)

In [4]:
# Concatenate all season dfs
# Each season becomes one frame (one row per player, plus a 'year' column);
# the frames are collected in a list and stacked with a single concat, because
# concatenating inside the loop re-copies the accumulated frame on every pass.
season_frames = []
for year in master_dict:
    year_df = (
        pd.DataFrame.from_dict(master_dict[year])
        .transpose()
        .reset_index()
        .rename(columns={'index': 'player'})
    )
    year_df['year'] = year
    season_frames.append(year_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# master_dict holds no seasons.
master_df = pd.concat(season_frames, axis=0) if season_frames else pd.DataFrame()

In [5]:
# Change types of columns
# The percentage columns are scraped as text with a trailing percent sign
# (e.g. '10.5 %'); strip the sign and any padding so they parse as numbers.
# Columns that are already numeric are skipped, which lets this cell be re-run.
for pct_col in ['BB%', 'K%']:
    if not pd.api.types.is_numeric_dtype(master_df[pct_col]):
        master_df[pct_col] = master_df[pct_col].str.rstrip(' %')

# Everything except the two identifier columns is converted (this includes
# 'year', which is already numeric and passes through unchanged).
floatcols = master_df.columns.drop(['player', 'team'])
master_df[floatcols] = master_df[floatcols].apply(pd.to_numeric)

In [6]:
# Write the cleaned table to data/fangraphs_scraped_2018.csv, relative to the
# working directory. The path is built with pathlib rather than hard-coded
# backslashes so it resolves to the same location on Windows and POSIX, and the
# folder is created first so the export does not fail on a fresh checkout.
output_path = Path('data') / 'fangraphs_scraped_2018.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
master_df.to_csv(output_path)