# Web-Scraping
In this Notebook I do all of the scraping for the project.
I put the scraped data into pandas DataFrames and pickle them so I can load the data in other notebooks.

In [None]:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%config InlineBackend.figure_formats = ['svg']
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [None]:
# Team codes (the ones substituted into the team-page URLs) mapped to full team names.
# The insertion order matters: it is the order in which the teams are scraped later on.
bball_ref_team_abbrv = {
    "ARI": "Arizona Diamondbacks",
    "ATL": "Atlanta Braves",
    "BAL": "Baltimore Orioles",
    "BOS": "Boston Red Sox",
    "CHC": "Chicago Cubs",
    "CHW": "Chicago White Sox",
    "CIN": "Cincinnati Reds",
    "CLE": "Cleveland Indians",
    "COL": "Colorado Rockies",
    "DET": "Detroit Tigers",
    "HOU": "Houston Astros",
    "KCR": "Kansas City Royals",
    "ANA": "Los Angeles Angels",
    "LAD": "Los Angeles Dodgers",
    "FLA": "Miami Marlins",
    "MIL": "Milwaukee Brewers",
    "MIN": "Minnesota Twins",
    "NYM": "New York Mets",
    "NYY": "New York Yankees",
    "OAK": "Oakland Athletics",
    "PHI": "Philadelphia Phillies",
    "PIT": "Pittsburgh Pirates",
    "SDP": "San Diego Padres",
    "SFG": "San Francisco Giants",
    "SEA": "Seattle Mariners",
    "STL": "St. Louis Cardinals",
    "TBD": "Tampa Bay Rays",
    "TEX": "Texas Rangers",
    "TOR": "Toronto Blue Jays",
    "WSN": "Washington Nationals",
}

# Column names for the scraped tables. "Year" and "Team" lead both lists because the
# scraping functions put those two values at the front of every row.
offensive_stat_header = [
    "Year", "Team", "Lg", "W", "L", "Finish",
    "G", "PA", "AB", "R", "H", "2B", "3B", "HR",
    "RBI", "SB", "CS", "BB", "SO",
    "BA", "OBP", "SLG", "OPS",
    "E", "DP", "Fld%",
]

pitching_stat_header = [
    "Year", "Team", "Lg", "W", "L", "Finish",
    "RA/G", "ERA", "G", "CG", "tSho", "SV", "IP",
    "H_A", "R_A", "ER", "HRA", "BB_A", "SO_A",
    "WHIP", "SO9", "HR9",
    "E_P", "DP_P", "Fld%_P", "PAge",
]

# Practice on the Giants Page

In [None]:
# set the url for the SF Giants batters (team batting page, used for the practice run)
url = 'https://www.baseball-reference.com/teams/SFG/batteam.shtml' 

In [None]:
# Grab the page over plain HTTP and check the response status code.
# The timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
response.status_code

In [None]:
# keep the body of the response as text and confirm its type
page = response.text
type(page)

In [None]:
# parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(page, "lxml")

In [None]:
# The data is produced dynamically, so the HTML fetched with requests is not enough:
# we need a browser driver (Selenium) to get the populated table.
# find_all returns a list of every element with this id in the static HTML.
batter_stats_table = soup.find_all(id='div_yby_team_bat_per_game')
batter_stats_table

In [None]:
# set the path to the chromedriver binary and start a Chrome driver
# NOTE(review): the path is specific to this machine, and passing it positionally is
# the older Selenium calling style -- confirm it works with the installed version.
PATH="/Applications/chromedriver"
driver = webdriver.Chrome(PATH)


In [None]:
# have the driver grab the url so the page is loaded in a real browser
driver.get("https://www.baseball-reference.com/teams/SFG/batteam.shtml")

# Parse what the browser currently holds. The parser is named explicitly ("lxml", as in
# the requests-based attempt) so bs4 does not have to guess one and warn about it.
html = driver.page_source
soup = BeautifulSoup(html, "lxml")

# now the table has data
batter_stats_table = soup.find(id='div_yby_team_bat_per_game')

In [None]:
# grab the header cells (kept for inspection) and all the body rows of the table
headers = batter_stats_table.find('thead').find_all('th')
header = [h.text for h in headers]
rows = batter_stats_table.find('tbody').find_all('tr')

# Put the data in a season list, one entry per season from 1950 onwards.
# The bound is inclusive so this practice run keeps the same seasons as the scraping
# functions (>= 1950) and matches the "from_1950" names of the pickle files.
seasons_list = []
for r in rows:
    year = r.find('th').text
    # rows whose leading cell is not a plain year are skipped
    if year.isdigit() and int(year) >= 1950:
        data = r.find_all('td')
        season_stats = [year, 'SFG'] + [d.text for d in data]
        seasons_list.append(season_stats)

In [None]:
# quit() ends the whole WebDriver session; close() would only close the current window
# and can leave the driver process running in the background.
driver.quit()

In [None]:
# Quick look at the scraped seasons as a DataFrame.
# It looks good, so let's make this a function.
pd.DataFrame(seasons_list)

# Take what we did above and put it in a function and grab all teams

In [None]:
def scrape_team_batting_data(team, driver, min_year=1950):
    """Scrape one team's year-by-year batting table.

    Parameters
    ----------
    team : str
        Team code that is substituted into the page URL (e.g. 'SFG').
    driver : selenium WebDriver
        An already started driver. It is used to load the page and is NOT
        closed by this function.
    min_year : int, optional
        Earliest season to keep (inclusive). Defaults to 1950.

    Returns
    -------
    list of list
        One list per kept season: ``[year, team, <text of each <td> in the row>...]``.
        ``year`` is the text of the row's header cell.

    Raises
    ------
    ValueError
        If the expected table (or its body) is not found in the loaded page.
    """
    # have the driver grab the url
    url = "https://www.baseball-reference.com/teams/{}/batteam.shtml".format(team)
    driver.get(url)

    # Name the parser explicitly so bs4 does not guess one (and warn about it).
    soup = BeautifulSoup(driver.page_source, "lxml")

    # grab the table we want (stats per game by year)
    batter_stats_table = soup.find(id='div_yby_team_bat_per_game')
    table_body = batter_stats_table.find('tbody') if batter_stats_table is not None else None
    if table_body is None:
        # fail with a message that says which page was at fault
        raise ValueError(
            "table 'div_yby_team_bat_per_game' not found at {}".format(url)
        )

    # grab all the rows and put the data in a season list
    seasons_list = []
    for r in table_body.find_all('tr'):
        year_cell = r.find('th')
        if year_cell is None:
            # a row without a header cell carries no year, so there is nothing to keep
            continue
        year = year_cell.text
        # rows whose header cell is not a plain year are skipped
        if year.isdigit() and int(year) >= min_year:
            seasons_list.append([year, team] + [d.text for d in r.find_all('td')])

    return seasons_list

In [None]:
def scrape_team_pitching_data(team, driver, min_year=1950):
    """Scrape one team's year-by-year pitching table.

    Parameters
    ----------
    team : str
        Team code that is substituted into the page URL (e.g. 'SFG').
    driver : selenium WebDriver
        An already started driver. It is used to load the page and is NOT
        closed by this function.
    min_year : int, optional
        Earliest season to keep (inclusive). Defaults to 1950.

    Returns
    -------
    list of list
        One list per kept season: ``[year, team, <text of each <td> in the row>...]``.
        ``year`` is the text of the row's header cell.

    Raises
    ------
    ValueError
        If the expected table (or its body) is not found in the loaded page.
    """
    # have the driver grab the url
    url = "https://www.baseball-reference.com/teams/{}/pitchteam.shtml".format(team)
    driver.get(url)

    # Name the parser explicitly so bs4 does not guess one (and warn about it).
    soup = BeautifulSoup(driver.page_source, "lxml")

    # grab the table we want (stats per game by year)
    pitcher_stats_table = soup.find(id='div_yby_team_pitch_per_game')
    table_body = pitcher_stats_table.find('tbody') if pitcher_stats_table is not None else None
    if table_body is None:
        # fail with a message that says which page was at fault
        raise ValueError(
            "table 'div_yby_team_pitch_per_game' not found at {}".format(url)
        )

    # grab all the rows and put the data in a season list
    seasons_list = []
    for r in table_body.find_all('tr'):
        year_cell = r.find('th')
        if year_cell is None:
            # a row without a header cell carries no year, so there is nothing to keep
            continue
        year = year_cell.text
        # rows whose header cell is not a plain year are skipped
        if year.isdigit() and int(year) >= min_year:
            seasons_list.append([year, team] + [d.text for d in r.find_all('td')])

    return seasons_list

In [None]:
# Disabled one-off run of scrape_team_batting_data on the SFG page. It is kept as a
# string literal so it never executes; the trailing semicolon keeps the notebook from
# echoing the string as the cell's output.
"""
# run it on the SFG page
PATH="/Applications/chromedriver"
driver = webdriver.Chrome(PATH)

url = "SFG"

team_batting_data = scrape_team_batting_data(url, driver)

driver.close()

# it works
""";

In [None]:
# team codes to scrape, in the order they were written in the abbreviation dict
teams = [code for code in bball_ref_team_abbrv]

In [None]:
# Scrape the batting and the pitching table of every team with one shared driver.
PATH="/Applications/chromedriver"
driver = webdriver.Chrome(PATH)
all_batting_data = []
all_pitching_data = []
try:
    for t in teams:
        print(t)  # progress: which team is being scraped

        # extend() adds the new rows in place instead of rebuilding the list each time
        team_batting_data = scrape_team_batting_data(t, driver)
        all_batting_data.extend(team_batting_data)

        team_pitching_data = scrape_team_pitching_data(t, driver)
        all_pitching_data.extend(team_pitching_data)
finally:
    # always end the browser session, even when one of the pages fails to scrape
    driver.quit()

In [None]:
# Add the header in front of the rows. The guard makes the cell safe to re-run:
# without it, every additional run would insert one more header row, and the extra
# ones would end up as data rows in the DataFrame built from this list.
if not all_batting_data or all_batting_data[0] != offensive_stat_header:
    all_batting_data.insert(0, offensive_stat_header)

# show the header and the first few rows only, not the entire list
all_batting_data[:5]

In [None]:
# Add the header in front of the rows. The guard makes the cell safe to re-run:
# without it, every additional run would insert one more header row, and the extra
# ones would end up as data rows in the DataFrame built from this list.
if not all_pitching_data or all_pitching_data[0] != pitching_stat_header:
    all_pitching_data.insert(0, pitching_stat_header)

# show the header and the first few rows only, not the entire list
all_pitching_data[:5]

In [None]:
# Build the two data frames: in each list the first element is the header row and
# every element after it is one season of data.
all_batting_df = pd.DataFrame(
    data=all_batting_data[1:],
    columns=all_batting_data[0],
)
all_pitching_df = pd.DataFrame(
    data=all_pitching_data[1:],
    columns=all_pitching_data[0],
)

In [None]:
# number of rows in each frame: (pitching, batting)
all_pitching_df.shape[0], all_batting_df.shape[0]

In [None]:
# Convert every batting column that can be parsed as numbers to a numeric dtype.
# Columns that cannot be parsed are left exactly as they were scraped.
for c in all_batting_df.columns:
    try:
        all_batting_df[c] = pd.to_numeric(all_batting_df[c])
    except (ValueError, TypeError):
        # Only the errors to_numeric raises for unparseable data are ignored; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        continue

In [None]:
# Convert every pitching column that can be parsed as numbers to a numeric dtype.
# Columns that cannot be parsed are left exactly as they were scraped.
for c in all_pitching_df.columns:
    try:
        all_pitching_df[c] = pd.to_numeric(all_pitching_df[c])
    except (ValueError, TypeError):
        # Only the errors to_numeric raises for unparseable data are ignored; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        continue

In [None]:
# Set the pitching column names from the header list. The frame was already created
# with this header as its columns, so this restates the same names.
all_pitching_df.columns = pitching_stat_header

In [None]:
# pickle this so we dont need to redo the scraping
# Each frame needs its own file name: writing the pitching frame to the batting file
# name would overwrite the batting pickle saved on the line before.
all_batting_df.to_pickle("full_batting_from_1950_df.pkl")
all_pitching_df.to_pickle("full_pitching_from_1950_df.pkl")

In [None]:
# Join batting and pitching on season and team. The columns dropped from the pitching
# frame have the same names as columns of the batting frame, so only the batting copies
# remain in the merged result.
full_baseball_df = all_batting_df.merge(
    all_pitching_df.drop(columns=["Lg", "W", "L", "Finish", "G"]),
    on=['Year', 'Team'],
)

In [None]:
# preview the first rows of the merged frame
full_baseball_df.head()

In [None]:
# save the merged batting + pitching frame so it can be loaded without re-scraping
full_baseball_df.to_pickle("full_baseball_from_1950_df.pkl")

# Grab opponents' batting stats against each team

In [None]:
# Example of a league-wide page for a single season (2020). The scraping function
# defined next builds this same URL pattern itself from the year it is given.
url = "https://www.baseball-reference.com/leagues/MLB/2020-batting-pitching.shtml"

In [None]:
def scrape_opponent_batting(year, driver, delay=0.5):
    """Scrape the league-wide 'teams_batting_pitching' table for one season.

    Parameters
    ----------
    year : int or str
        Season that is substituted into the page URL.
    driver : selenium WebDriver
        An already started driver. It is used to load the page and is NOT
        closed by this function.
    delay : float, optional
        Seconds to wait after loading the page before its source is read.
        Defaults to 0.5.

    Returns
    -------
    list of list
        One list per team row: ``[year, team, <text of each <td> in the row>...]``.
        ``year`` is stored exactly as passed in; the 'LgAvg' row is left out.

    Raises
    ------
    ValueError
        If the expected table (or its body) is not found in the loaded page.
    """
    # have the driver grab the url
    print(year)  # progress: which season is being scraped
    url = "https://www.baseball-reference.com/leagues/MLB/{}-batting-pitching.shtml".format(str(year))
    driver.get(url)
    # short pause before reading the page source
    # NOTE(review): presumably this gives the page time to finish rendering -- confirm.
    time.sleep(delay)

    # Name the parser explicitly so bs4 does not guess one (and warn about it).
    soup = BeautifulSoup(driver.page_source, "lxml")

    # now the table has data
    stats_table = soup.find(id='teams_batting_pitching')
    table_body = stats_table.find('tbody') if stats_table is not None else None
    if table_body is None:
        # fail with a message that says which page was at fault
        raise ValueError(
            "table 'teams_batting_pitching' not found at {}".format(url)
        )

    all_teams = []
    for r in table_body.find_all('tr'):
        team_cell = r.find('th')
        if team_cell is None:
            # a row without a header cell names no team, so there is nothing to keep
            continue
        team = team_cell.text
        # the league-average summary row is not a team
        if team != 'LgAvg':
            all_teams.append([year, team] + [d.text for d in r.find_all('td')])
    return all_teams
   

In [None]:
# Disabled one-off run of scrape_opponent_batting for the 2020 season. It is kept as a
# string literal so it never executes; the trailing semicolon suppresses the cell output.
"""
# set the path and the driver 
PATH="/Applications/chromedriver"
driver = webdriver.Chrome(PATH)
opponents_2020 = scrape_opponent_batting(2020, driver)
driver.close()
""";

In [None]:
# Scrape the opponent table of every season from 1950 through 2020 with one driver.
PATH="/Applications/chromedriver"
driver = webdriver.Chrome(PATH)
full_list = []
try:
    for i in range(1950, 2021):
        opponents_yearly = scrape_opponent_batting(i, driver)
        # extend() adds the new rows in place instead of rebuilding the list each year
        full_list.extend(opponents_yearly)
finally:
    # always end the browser session, even when one of the pages fails to scrape
    driver.quit()

In [None]:
# Column names for the opponent rows: the two values the scraper puts first,
# followed by the names given to the table's data cells.
opponent_headers = ['Year', 'Team'] + [
    'RA/G', 'PAu', 'G', 'PA', 'AB', 'R', 'H',
    '2B', '3B', 'HR', 'SB', 'CS', 'BB', 'SO',
    'BA', 'OBP', 'SLG', 'OPS', 'BAbip',
    'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'ROE',
]


In [None]:
# build the opponents frame from the scraped rows and display it
opponents_df = pd.DataFrame(full_list,columns=opponent_headers)
opponents_df

In [None]:
# inspect which team codes occur in the scraped rows
opponents_df.Team.unique()

In [None]:
# Conversion table applied to the Team column of the opponents frame: every value is a
# code that is also a key of bball_ref_team_abbrv.
# NOTE(review): the first group looks like older or alternate codes of the same
# franchises -- confirm the pairings against the source site.
bball_ref_team_abbrv_convert = {
    "MLN": "ATL",
    "BRO": "LAD",
    "BSN": "ATL",
    "NYG": "SFG",
    "PHA": "OAK",
    "SLB": "BAL",
    "KCA": "OAK",
    "WSA": "TEX",
    "CAL": "ANA",
    "MON": "WSN",
    "SEP": "MIL",
    "LAA": "ANA",
    "WSH": "MIN",
    "TBR": "TBD",
    "MIA": "FLA",
    # codes that need no conversion map to themselves
    **{
        code: code
        for code in (
            "ARI", "ATL", "BAL", "BOS", "CHC", "CHW", "CIN", "CLE", "COL", "DET",
            "HOU", "KCR", "ANA", "LAD", "FLA", "MIL", "MIN", "NYM", "NYY", "OAK",
            "PHI", "PIT", "SDP", "SFG", "SEA", "STL", "TBD", "TEX", "TOR", "WSN",
        )
    },
}

In [None]:
# Replace each team code by its entry in the conversion table; a code that is not in
# the table comes out as NaN.
opponents_df["Team"] = opponents_df["Team"].map(
    bball_ref_team_abbrv_convert,
    na_action='ignore',
)

In [None]:
# check the team codes again after the conversion
opponents_df.Team.unique()

In [None]:
# Convert every opponents column that can be parsed as numbers to a numeric dtype.
# Columns that cannot be parsed are left exactly as they were scraped.
for c in opponents_df.columns:
    try:
        opponents_df[c] = pd.to_numeric(opponents_df[c])
    except (ValueError, TypeError):
        # Only the errors to_numeric raises for unparseable data are ignored; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        continue

In [None]:
# save the opponents frame so it can be loaded without re-scraping
opponents_df.to_pickle("opponents_from_1950_df.pkl")