In [1]:
#allow output from every line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#standard library
from io import StringIO #wraps scraped HTML strings so pd.read_html treats them as file-like input
import time #delays the server requests
from urllib.request import urlopen

#import pandas and numpy, plus plotting, modelling and scraping packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
from matplotlib.pyplot import figure
import statsmodels.api as sm
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup #package used for webscraping

In [2]:
#baseball reference standard team batting and pitching
#both tables live on the same team page, so the page is downloaded and parsed only once
with urlopen("https://www.baseball-reference.com/teams/ATL/2019.shtml") as html:
    #the with-block closes the HTTP response as soon as the page has been read
    bs = BeautifulSoup(html.read(),"lxml")
table = bs.find_all("table", {"class":"sortable"})

#pd.read_html returns a list of dataframes; each call below parses a single <table>,
#so element 0 of the result is that table
#StringIO avoids the "literal html" deprecation warning in recent pandas versions
display(pd.read_html(StringIO(str(table[0])))[0]) #table 0 is the batting table
display(pd.read_html(StringIO(str(table[1])))[0]) #table 1 is the pitching table

[     Rk  Pos                 Name   Age    G    PA    AB    R     H   2B  ...  \
 0     1    C        Tyler Flowers    33   85   310   271   36    62   11  ...   
 1     2   1B     Freddie Freeman*    29  158   692   597  113   176   34  ...   
 2     3   2B        Ozzie Albies#    22  160   702   640  102   189   43  ...   
 3     4   SS       Dansby Swanson    25  127   545   483   77   121   26  ...   
 4     5   3B       Josh Donaldson    33  155   659   549   96   142   33  ...   
 5     6   LF         Austin Riley    22   80   297   274   41    62   11  ...   
 6     7   CF     Ronald Acuna Jr.    21  156   715   626  127   175   22  ...   
 7     8   RF       Nick Markakis*    35  116   469   414   61   118   25  ...   
 8    Rk  Pos                 Name   Age    G    PA    AB    R     H   2B  ...   
 9     9    C        Brian McCann*    35   85   316   277   28    69    9  ...   
 10   10   IF       Johan Camargo#    25   98   248   232   31    54   12  ...   
 11   11   RF   

[     Rk  Pos                 Name   Age   W   L   W-L%    ERA    G   GS  ...  \
 0     1   SP          Mike Soroka    21  13   4   .765   2.68   29   29  ...   
 1     2   SP        Julio Teheran    28  10  11   .476   3.81   33   33  ...   
 2     3   SP           Max Fried*    25  17   6   .739   4.02   33   30  ...   
 3     4   SP     Mike Foltynewicz    27   8   6   .571   4.54   21   21  ...   
 4     5   SP      Dallas Keuchel*    31   8   8   .500   3.75   19   19  ...   
 5     6   SP        Kevin Gausman    28   3   7   .300   6.19   16   16  ...   
 6    Rk  Pos                 Name   Age   W   L   W-L%    ERA    G   GS  ...   
 7     7   CL         Luke Jackson    27   9   2   .818   3.84   70    0  ...   
 8     8   RP          Josh Tomlin    34   2   1   .667   3.74   51    1  ...   
 9     9   RP        Sean Newcomb*    26   6   3   .667   3.16   55    4  ...   
 10   10   RP      Anthony Swarzak    33   1   2   .333   4.31   44    0  ...   
 11   11   RP       Jerry Bl

In [3]:
def getStandardBatting(team,season): #creating a scrape function
    """Scrape one team's standard batting table from Baseball Reference.

    Parameters
    ----------
    team : str
        Team abbreviation as used in the Baseball Reference URL (e.g. "ATL").
    season : int or str
        Season used in the URL (e.g. 2019).

    Returns
    -------
    pandas.DataFrame
        The first table with class "sortable" on the team page, minus its
        last five rows, with two added string columns ("Team" and "Season")
        and every "*" removed from the "Name" column.

    Raises
    ------
    ValueError
        If the page contains no table with class "sortable".
    """
    url = "https://www.baseball-reference.com/teams/"+str(team)+"/"+str(season)+".shtml"
    with urlopen(url) as response: #closes the connection once the page has been read
        bs = BeautifulSoup(response.read(),"lxml")
    table = bs.find_all("table", {"class":"sortable"})
    if not table:
        raise ValueError("No sortable tables found at " + url)
    #only the first table (batting) is needed; StringIO keeps recent pandas versions
    #from warning about literal HTML being passed to read_html
    read = pd.read_html(StringIO(str(table[0])))
    batting_stats = read[0][:-5].copy() #gets rid of team average and totals rows
    batting_stats["Team"] = str(team) #creates a team column
    batting_stats["Season"] = str(season).replace("*","") #creates a season column
    #Series.replace only swaps whole cell values, so the substring removal must go
    #through the .str accessor; regex=False treats "*" as a literal character
    batting_stats["Name"] = batting_stats["Name"].str.replace("*","",regex=False) #gets rid of * in the player name column, * denote lefties
    print(url) #printing url to check that the scrape is working
    return batting_stats

In [4]:
teams = ["NYY", "BOS", "BAL", "TOR", "TBR", "DET", "MIN", "KCR", "CHW", "CLE", "LAA", "SEA", "TEX", "OAK", "HOU",
        "NYM", "PHI", "MIA", "WSN", "ATL", "PIT", "MIL", "CHC", "STL", "CIN", "LAD", "SFG", "COL", "ARI", "SDP"]
#list of all the teams that are being passed in the for loop
seasons = [2019] #list of all the seasons being passed in the for loop
team_frames = [] #collects one scraped dataframe per (season, team) pair
for season in seasons: #outer for loop scrapes by season
    for team in teams: #inner for loop scrapes by team
        team_batting = getStandardBatting(team, season) #run the scrape function and stores into a new dataframe
        team_frames.append(team_batting)
        time.sleep(np.random.uniform(0,5)) #delay the server requests by 0-5 seconds

#stack every team's table in a single concat instead of re-copying the growing
#dataframe on each iteration; rows appear in the same order they were scraped
#an empty dataframe is kept as the result when nothing was scraped, since
#pd.concat raises on an empty list
batting_data = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

https://www.baseball-reference.com/teams/NYY/2019.shtml
https://www.baseball-reference.com/teams/BOS/2019.shtml
https://www.baseball-reference.com/teams/BAL/2019.shtml
https://www.baseball-reference.com/teams/TOR/2019.shtml
https://www.baseball-reference.com/teams/TBR/2019.shtml
https://www.baseball-reference.com/teams/DET/2019.shtml
https://www.baseball-reference.com/teams/MIN/2019.shtml
https://www.baseball-reference.com/teams/KCR/2019.shtml
https://www.baseball-reference.com/teams/CHW/2019.shtml
https://www.baseball-reference.com/teams/CLE/2019.shtml
https://www.baseball-reference.com/teams/LAA/2019.shtml
https://www.baseball-reference.com/teams/SEA/2019.shtml
https://www.baseball-reference.com/teams/TEX/2019.shtml
https://www.baseball-reference.com/teams/OAK/2019.shtml
https://www.baseball-reference.com/teams/HOU/2019.shtml
https://www.baseball-reference.com/teams/NYM/2019.shtml
https://www.baseball-reference.com/teams/PHI/2019.shtml
https://www.baseball-reference.com/teams/MIA/201

In [5]:
#cleaning

#keep only rows that are neither pitchers ("P") nor the repeated column-header
#rows ("Pos") that baseball reference embeds inside the table body
batting_data = batting_data[~batting_data["Pos"].isin(["P", "Pos"])].copy()

#gets rid of any remaining special characters in the player names
#regex=False makes "*" a literal character regardless of the pandas version's default
batting_data["Name"] = (
    batting_data["Name"]
    .str.replace("*", "", regex=False)
    .str.replace("#", "", regex=False)
)

#coercing all stat columns to a numeric datatype (unparseable values become NaN)
#each column is converted from itself; 'OPS+' was previously overwritten with '2B'
numeric_columns = [
    '2B', 'Age', 'G', 'PA', 'AB', 'R', 'H', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB',
    'SO', 'BA', 'OBP', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'SLG', 'OPS',
]
for column in numeric_columns:
    batting_data[column] = pd.to_numeric(batting_data[column], errors='coerce')

#creates a singles column: hits that were not doubles, triples or home runs
#done after the numeric coercion so the subtraction never runs on text values
batting_data["Singles"] = (
    batting_data["H"] - batting_data["2B"] - batting_data["3B"] - batting_data["HR"]
)

#for each column in the table, this creates a percentile variant of the column
#the list order fixes the order of the new columns in the dataframe and the csv
percentile_columns = [
    '2B', 'Age', 'G', 'PA', 'AB', 'R', 'H', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB',
    'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
]
for column in percentile_columns:
    batting_data[column + '_percentile'] = batting_data[column].rank(pct=True)
#this column keeps its original capitalised suffix so existing readers of the csv still find it
batting_data['Singles_Percentile'] = batting_data['Singles'].rank(pct=True)


#write it to a csv
batting_data.to_csv("Batting_Data.csv")