In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Using the requests library

In [2]:
# Sample page that the find()/find_all() cells below are run against.
url = 'https://rldaggie.github.io/sample-html/'
# Bound the request with a timeout: without one, requests.get() can wait
# indefinitely on an unresponsive server and hang the notebook kernel.
res = requests.get(url, timeout=10)

### Status Codes

In [3]:
# HTTP status code of the response fetched above (the recorded run shows 200).
res.status_code

200

### Creating a `BeautifulSoup` object

In [4]:
# Parse the raw response bytes into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=res.content, features='lxml')

# `soup.find()`

Returns either:

1. A `Tag` object for the first match
2. `None` if nothing matches

In [5]:
# First <h1> in the document; find() gives None when nothing matches.
h1 = soup.find('h1')

In [6]:
# Check what find() handed back (a bs4 Tag in the recorded output).
type(h1)

bs4.element.Tag

In [7]:
# Text content of the tag, without the surrounding markup.
h1.text

'This is an h1'

In [8]:
# The tag's HTML attributes as a dict; in the recorded output 'class' comes
# back as a list of class names while 'id' is a plain string.
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [9]:
# find() may have returned None, so check before touching .text.
if h1 is not None:
    print(h1.text)

This is an h1


# `soup.find_all()`

Returns a **_LIST_** of `Tag` objects that match your query (an empty list if nothing matches)

In [10]:
# Every <h1> on the page, not just the first one.
h1_tags = soup.find_all('h1')

In [11]:
# Text of each matched heading, in document order.
[heading.text for heading in h1_tags]

['This is an h1', 'This is yet another heading.']

In [12]:
# Attribute dict of each matched heading, in document order.
[heading.attrs for heading in h1_tags]

[{'class': ['foobar'], 'id': 'title'}, {'class': ['foobar']}]

# Creating a `pandas` DataFrame from a scrape

In [13]:
# A list of dicts maps directly onto a DataFrame: one dict per row, with the
# dict keys becoming the column labels.
people = [
    dict(name=name, market=market)
    for name, market in [('Bethany', 'BOS'), ('Tucker', 'NYC')]
]

pd.DataFrame(people)

Unnamed: 0,market,name
0,BOS,Bethany
1,NYC,Tucker


### Todo List

In [14]:
# Grab the <ol class="done"> element and echo it to inspect its markup.
ol = soup.find('ol', class_='done')
ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [15]:
# One record per <li>; the 'task' key becomes the single DataFrame column.
todos = [{'task': item.text} for item in ol.find_all('li')]
pd.DataFrame(todos)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


### GA Directory

In [16]:
# Locate the directory table by its id attribute.
table = soup.find('table', id='directory')

In [17]:
# Build one record per body row of the directory table.
people = []
for tr in table.find('tbody').find_all('tr'):
    link = tr.find('a')  # anchor holding the display name and a mailto: href
    people.append({
        'name': link.text.strip(),
        # Drop the URL scheme so only the address itself is kept.
        'email': link.attrs['href'].replace('mailto:', ''),
        # The first <td> of the row carries the role.
        'role': tr.find('td').text.strip(),
    })
pd.DataFrame(people)

Unnamed: 0,email,name,role
0,praveen@ga.co,Praveen,Student
1,fred@ga.co,Fred,Student
2,homer@ga.co,Homer,Student
3,kyle@ga.co,Kyle,Student
4,sam@ga.co,Sam,Student
5,javier@ga.co,Javier,Student
6,nengkuan@ga.co,Nengkuan,Student
7,kieth@ga.co,Kieth,Student
8,bola@ga.co,Bola,Student
9,steve@ga.co,Steve,Student


### Basketball Reference

In [18]:
# Switch to a live site. This rebinds url/res/soup, so from here on `soup`
# refers to the Basketball Reference page, not the sample page.
url = 'https://www.basketball-reference.com/'
# A timeout keeps an unresponsive server from hanging the notebook kernel.
res = requests.get(url, timeout=10)
# A bare `res.status_code` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is displayed), so an error response
# would go unnoticed. Fail loudly on a 4xx/5xx response instead.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

In [19]:
# Build one record per team from the two conference standings tables, whose
# ids are 'confs_standings_E' and 'confs_standings_W'.
teams = []
for conf in ['E', 'W']:
    table = soup.find('table', {'id': 'confs_standings_' + conf})
    for row in table.find('tbody').find_all('tr'):
        link = row.find('a')  # looked up once; carries both slug and full name
        team = {}
        team['slug'] = link.text
        team['name'] = link.attrs['title']
        # Select cells by their data-stat attribute rather than by position.
        # A positional find_all('td')[2] assignment to 'wins' used to sit here
        # but was overwritten immediately, so it only added a way to raise
        # IndexError on short rows.
        team['wins'] = row.find('td', {'data-stat': 'wins'}).text
        team['losses'] = row.find('td', {'data-stat': 'losses'}).text
        # [1:-1] drops the first and last character of the stripped span text
        # (presumably surrounding parentheses — verify against the page).
        team['rank'] = row.find('span').text.strip()[1:-1]
        team['conference'] = conf

        teams.append(team)
df = pd.DataFrame(teams)
df

Unnamed: 0,conference,losses,name,rank,slug,wins
0,E,6,Milwaukee Bucks,1,MIL,41
1,E,14,Toronto Raptors,2,TOR,33
2,E,15,Miami Heat,3,MIA,32
3,E,15,Boston Celtics,4,BOS,31
4,E,17,Philadelphia 76ers,5,PHI,31
5,E,17,Indiana Pacers,6,IND,30
6,E,27,Orlando Magic,7,ORL,21
7,E,26,Brooklyn Nets,8,BRK,19
8,E,30,Chicago Bulls,9,CHI,19
9,E,31,Detroit Pistons,10,DET,17


In [20]:
# Everything scraped via .text is a string, so every column shows up as dtype
# object in the recorded output; wins/losses/rank need casting before any
# numeric work.
df.dtypes

conference    object
losses        object
name          object
rank          object
slug          object
wins          object
dtype: object