# Scraping Tables From The Web

## Scraping Tables using requests and BeautifulSoup

In [15]:
# Imported in this cell as well: the notebook's shared import cell appears further
# down, so without this line the cell raises NameError on a fresh kernel.
import pandas as pd

url = 'https://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-net-rankings'

# read_html downloads the page and returns a list with one DataFrame per <table> found.
test = pd.read_html(url)

In [19]:
# Display the first DataFrame in the list returned by pd.read_html.
test[0]

Unnamed: 0,Rank,Previous,School,Conference,Record,Road,Neutral,Home,Quad 1,Quad 2,Quad 3,Quad 4
0,1,1,Houston,AAC,31-3,11-0,4-1,16-2,7-2,8-0,7-1,9-0
1,2,2,Alabama,SEC,29-5,9-3,5-2,15-0,13-5,6-0,7-0,3-0
2,3,3,UCLA,Pac-12,29-5,9-2,3-3,17-0,8-5,9-0,5-0,7-0
3,4,4,Tennessee,SEC,23-10,4-6,5-2,14-2,7-7,3-3,6-0,7-0
4,5,5,Purdue,Big Ten,29-5,8-3,7-0,14-2,10-4,9-1,5-0,5-0
...,...,...,...,...,...,...,...,...,...,...,...,...
358,359,359,Florida A&M,SWAC,5-22,3-15,0-0,2-7,0-5,0-0,0-4,5-13
359,360,360,IUPUI,Horizon,3-27,0-15,1-3,2-9,0-2,0-1,0-8,3-16
360,361,361,Green Bay,Horizon,3-29,1-17,0-2,2-10,0-0,0-5,1-10,2-14
361,362,362,LIU,NEC,1-26,1-14,0-2,0-10,0-2,0-3,0-2,1-19


In [1]:
import bs4
import requests
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Show up to 100 rows before pandas truncates the displayed frame.
pd.set_option('display.max_rows', 100)

In [2]:
url = 'https://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-net-rankings'

# HTTP request headers sent with the request (here just a User-Agent string).
headers = {
    'user-agent': 'myuseragent',
}

# Pass the dict as `headers=`, not `params=`: `params` would append
# "?user-agent=myuseragent" to the URL as a query string and the User-Agent
# header itself would never be set. The timeout stops the cell from hanging
# forever if the site does not answer.
r = requests.get(url, headers=headers, timeout=30)
print(f'Status code for NCAA Site - {r.status_code}')

Status code for NCAA Site - 200


The website's HTML as text:

In [3]:
# Parse the response body text into a BeautifulSoup tree using the "html.parser" backend.
soup = bs4.BeautifulSoup(r.text, "html.parser")

Parsing to just the table

In [4]:
# find() returns the first <table> element in the document (None if there is none).
table = soup.find('table')

Parsing to just the table body

In [5]:
# Narrow to the <tbody> element of that table.
data = table.find('tbody')

Parsing to just the table rows

In [6]:
# Collect every <tr> element inside the table body as a list.
rows = data.find_all('tr')

First row of the table

In [7]:
# Display the first <tr> element to inspect its <td> cells.
rows[0]

<tr>
<td>1</td>
<td>1</td>
<td>Houston</td>
<td>AAC</td>
<td>31-3</td>
<td>11-0</td>
<td>4-1</td>
<td>16-2</td>
<td>7-2</td>
<td>8-0</td>
<td>7-1</td>
<td>9-0</td>
</tr>

Grabbing the data from the first row

In [8]:
# Text of the third <td> cell (index 2) of the first row.
rows[0].find_all('td')[2].text

'Houston'

Looping through each row and grabbing the data we wanted

In [9]:
# Output column name -> index of the <td> cell that holds it in each table row.
# Cell indices 1, 3 and 4 are not collected. Defining the mapping once replaces
# the three parallel lists (dict keys, cell lookups, appends) that had to be
# kept in sync by hand.
net_columns = {
    'rank': 0,
    'team': 2,
    'road': 5,
    'neutral': 6,
    'home': 7,
    'quad1': 8,
    'quad2': 9,
    'quad3': 10,
    'quad4': 11,
}

# One empty list per output column, in the same order as the mapping above.
net = {name: [] for name in net_columns}

for row in rows:
    # Look the cells up once per row instead of re-running find_all for every column.
    cells = row.find_all('td')
    for name, cell_index in net_columns.items():
        net[name].append(cells[cell_index].text)

df = pd.DataFrame.from_dict(net)

Table from the website is now a pandas dataframe

In [10]:
# Display the DataFrame built from the scraped rows.
df

Unnamed: 0,rank,team,road,neutral,home,quad1,quad2,quad3,quad4
0,1,Houston,11-0,4-1,16-2,7-2,8-0,7-1,9-0
1,2,Alabama,9-3,5-2,15-0,13-5,6-0,7-0,3-0
2,3,UCLA,9-2,3-3,17-0,8-5,9-0,5-0,7-0
3,4,Tennessee,4-6,5-2,14-2,7-7,3-3,6-0,7-0
4,5,Purdue,8-3,7-0,14-2,10-4,9-1,5-0,5-0
...,...,...,...,...,...,...,...,...,...
358,359,Florida A&M,3-15,0-0,2-7,0-5,0-0,0-4,5-13
359,360,IUPUI,0-15,1-3,2-9,0-2,0-1,0-8,3-16
360,361,Green Bay,1-17,0-2,2-10,0-0,0-5,1-10,2-14
361,362,LIU,1-14,0-2,0-10,0-2,0-3,0-2,1-19


## Scraping Tables using Selenium and BeautifulSoup

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

Setting up the browser 

In [12]:
# Location of the chromedriver executable on this machine.
path = r'C:\chromedriver/chromedriver'
service = Service(executable_path = path)
# Hand the Service object to Chrome. Passing executable_path= directly to
# webdriver.Chrome() is deprecated in Selenium 4 (it emitted the warning seen in
# this cell's output) and is rejected by newer Selenium releases.
browser = webdriver.Chrome(service = service)
browser.get('https://www.espn.com/mens-college-basketball/bpi')

  browser = webdriver.Chrome(executable_path = path)


Initial tables from the website's HTML

In [None]:
# Parse the HTML currently rendered in the Selenium-controlled browser.
soup = bs4.BeautifulSoup(browser.page_source, "html.parser")
# find_all returns every <table> element on the page as a list.
table = soup.find_all('table')
# Display the list so the tables can be inspected.
table

We only have the first 50 rows of the table. We can use Selenium to load the whole table.

In [None]:
# `time` is not imported anywhere earlier in the notebook. Without this import
# time.sleep() raised NameError after the first click, and the old broad
# `except Exception` swallowed it and reported "End of Page" too early.
import time

from selenium.common.exceptions import NoSuchElementException, WebDriverException

# Number of times the "load more" link has been clicked so far.
page_num = 0

try:
    # find_element raises NoSuchElementException once the "loadMore" element is
    # no longer on the page; that exception is what ends this loop.
    while browser.find_element(By.CLASS_NAME, "loadMore"):
        browser.find_element(By.CLASS_NAME, "loadMore__link").click()
        page_num += 1
        print("getting page number "+str(page_num))
        # Short pause between clicks before looking for the link again.
        time.sleep(1)
except NoSuchElementException:
    print("\nEnd of Page")
except WebDriverException as e:
    # Any other browser-side failure still stops the loop without crashing the
    # notebook, but is reported instead of being mistaken for the end of the page.
    print(f"\nStopped after {page_num} page(s): {e}")

Website tables as html text (this time with all rows)

In [None]:
# Re-parse the browser's current HTML after the "load more" clicks above.
soup = bs4.BeautifulSoup(browser.page_source, "html.parser")
# All <table> elements on the page, as a list.
table = soup.find_all('table')

Tables 1 (school names) and 2 (school data)

In [None]:
# First <table> on the page -- per the note above, the school names.
table_names = table[0]
# Second <table> on the page -- per the note above, the school data.
table_data = table[1]

Parsing to just the tables' bodies

In [None]:
# <tbody> of the names table.
names = table_names.find('tbody')
# <tbody> of the data table (rebinds `data`, which held the NCAA table body earlier).
data = table_data.find('tbody')

Parsing to just the tables' rows

In [None]:
# Every <tr> of the names table body.
names_rows = names.find_all('tr')
# Every <tr> of the data table body (rebinds `rows` from the NCAA section).
rows = data.find_all('tr')

First row of school names table

In [None]:
# Display the first <tr> of the names table.
names_rows[0]

Grabbing data from the first row

In [None]:
# Text of the first <td> cell in the first row of the names table.
names_rows[0].find_all('td')[0].text

First row of school data table

In [None]:
# Display the first <tr> of the data table.
rows[0]

Grabbing data from the first row

In [None]:
# Text of the first <td> cell in the first row of the data table.
rows[0].find_all('td')[0].text

Looping through each row and grabbing the data we wanted

In [None]:
# Output column name -> index of the <td> cell holding it in each data-table row.
bpi_columns = {
    'BPI': 1,
    'rank': 2,
    'off_BPI': 4,
    'def_BPI': 5,
}

# 'team' comes from the names table; the remaining columns come from the data table.
BPI_data = {'team': []}
BPI_data.update({name: [] for name in bpi_columns})

# The two tables are combined purely by row position, so they must have the same
# number of rows. Fail early with a clear message instead of letting the
# DataFrame constructor complain about array lengths.
if len(names_rows) != len(rows):
    raise ValueError(
        f'names table has {len(names_rows)} rows but data table has {len(rows)}'
    )

for row in names_rows:
    # The team name is the text of the first cell of each names-table row.
    BPI_data['team'].append(row.find_all('td')[0].text)

for row in rows:
    # Look the cells up once per row instead of once per column.
    cells = row.find_all('td')
    for name, cell_index in bpi_columns.items():
        BPI_data[name].append(cells[cell_index].text)

df = pd.DataFrame.from_dict(BPI_data)

Table from the website is now a pandas dataframe

In [None]:
# Display the DataFrame built from the two scraped tables.
df

## ScrapingBee

Copy of the created code from ScrapingBee's Scraping API

Example scraping the list of Super Bowl champions

In [13]:
#  Install the Python Requests library:
# `pip install requests`
import requests

def send_request():
    """Request the Wikipedia "List of Super Bowl champions" table via ScrapingBee.

    The API key is read from the ``SCRAPINGBEE_API_KEY`` environment variable so
    the secret is not stored in the notebook. A key that was previously committed
    in a notebook should be treated as leaked and rotated.

    Returns:
        bytes: the raw response body (``response.content``).

    Raises:
        RuntimeError: if ``SCRAPINGBEE_API_KEY`` is not set.
    """
    import os  # local import so this cell does not depend on another cell's imports

    api_key = os.environ.get('SCRAPINGBEE_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the SCRAPINGBEE_API_KEY environment variable before running this cell'
        )

    response = requests.get(
        url='https://app.scrapingbee.com/api/v1/',
        params={
            'api_key': api_key,
            'url': 'https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions',
            # JSON extraction rule: return elements matching the CSS selector as table JSON.
            'extract_rules': '{"table_json":{"selector":".jquery-tablesorter","output":"table_json"}}',
        },
        # Do not let the cell hang indefinitely if the API does not answer.
        timeout=60,
    )
    return response.content
data = send_request()

Table from the website is now a pandas dataframe

In [14]:
import json
# Decode only while `data` is still bytes, so re-running this cell does not fail
# with "'str' object has no attribute 'decode'".
if isinstance(data, bytes):
    data = data.decode('utf-8')
json_string = json.loads(data)
df = pd.DataFrame(json_string['table_json'])

# The scraped header names carry newline characters ("Game\n", "Score\n", ...).
# Strip them from every column name instead of listing each header by hand.
df = df.rename(columns=lambda name: name.replace('\n', ''))

# Remove newline characters from the cell values as well (literal match, not a regex).
for col in df.columns:
    df[col] = df[col].str.replace('\n', '', regex=False)

df.tail()

Unnamed: 0,Game,Date/Season,Winning team,Score,Losing team,Venue,City,Attendance,Referee,Ref
52,LIII,"February 3, 2019 (2018)","New England PatriotsA(11, 6–5)",13–3,"Los Angeles RamsN(4, 1–3)",Mercedes-Benz Stadium,"Atlanta, Georgia (3)",70081,John Parry,[78][79][80]
53,LIV,"February 2, 2020 (2019)","Kansas City ChiefsA(3, 2–1)",31–20,"San Francisco 49ersN(7, 5–2)",Hard Rock Stadium (6)[sb 12],"Miami Gardens, Florida (11)[sb 3]",62417,Bill Vinovich,[79][80]
54,LV,"February 7, 2021 (2020)","Tampa Bay BuccaneersN(2, 2–0) [W]",31–9,"Kansas City ChiefsA(4, 2–2)",Raymond James Stadium (3),"Tampa, Florida (5)",24835,Carl Cheffers,[79][80]
55,LVI,"February 13, 2022 (2021)","Los Angeles RamsN(5, 2–3)",23–20,"Cincinnati BengalsA(3, 0–3)",SoFi Stadium,"Inglewood, California (8)[sb 2]",70048,Ron Torbert,[79][80]
56,LVII,"February 12, 2023 (2022)","Kansas City ChiefsA(5, 3–2)",38–35,"Philadelphia EaglesN(4, 1–3)",State Farm Stadium(3)[sb 15],"Glendale, Arizona (4)[sb 13]",67827,Carl Cheffers,


In [None]:
import requests

def send_request():
    """Request the image list of the rolltide.com baseball schedule page via ScrapingBee.

    This redefines the ``send_request`` from the earlier cell with a different
    target URL and extraction rule. The API key is read from the
    ``SCRAPINGBEE_API_KEY`` environment variable so the secret is not stored in
    the notebook.

    Returns:
        bytes: the raw response body (``response.content``).

    Raises:
        RuntimeError: if ``SCRAPINGBEE_API_KEY`` is not set.
    """
    import json  # local imports keep this cell self-contained
    import os

    api_key = os.environ.get('SCRAPINGBEE_API_KEY')
    if not api_key:
        raise RuntimeError(
            'Set the SCRAPINGBEE_API_KEY environment variable before running this cell'
        )

    # Build the rule as a dict and serialise it. The previous hand-written string
    # had a trailing comma before the closing brace, which is not valid JSON.
    extract_rules = {
        'images': {
            'selector': 'img',
            'type': 'list',
            'output': {'src': 'img@src', 'alt': 'img@alt'},
        }
    }

    response = requests.get(
        url='https://app.scrapingbee.com/api/v1/',
        params={
            'api_key': api_key,
            'url': 'https://rolltide.com/sports/baseball/schedule',
            'extract_rules': json.dumps(extract_rules),
        },
        # Do not let the cell hang indefinitely if the API does not answer.
        timeout=60,
    )

    return response.content
data = send_request()

In [None]:
from PIL import Image
import urllib.request
import json

# Decode only while `data` is still bytes, so re-running this cell does not fail
# with "'str' object has no attribute 'decode'".
if isinstance(data, bytes):
    data = data.decode('utf-8')
# Parse the JSON text returned by the scraping API into Python objects.
json_string = json.loads(data)

In [None]:
# `src` value of the entry at index 6 of the scraped 'images' list.
link = json_string['images'][6]['src']
# Download that URL to test.jpg in the current working directory.
urllib.request.urlretrieve(link, "test.jpg")
# Open the downloaded file with PIL; as the last expression it becomes the cell's output.
Image.open('test.jpg')