In [1]:
import pandas as pd
import requests # use this to create a connection to a url
from bs4 import BeautifulSoup # parse HTML

## Using the requests library

In [2]:
# Sample page scraped in the cells below.
url = 'https://rldaggie.github.io/sample-html/'

# Pass a timeout: without one, requests can wait indefinitely on a stalled server.
res = requests.get(url, timeout=10)

### Status Codes

In [3]:
# HTTP status code of the response fetched above (displayed as the cell output).
res.status_code

200

### Creating a *BeautifulSoup* Object

In [4]:
# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(res.content, features='lxml')

## soup.find()

Returns either:

1. A `Tag` object for the first element that matches
2. `None` if nothing matches

In [5]:
# First <h1> element in the document.
h1 = soup.find(name='h1')

In [6]:
# Show what kind of object find() handed back.
type(h1)

bs4.element.Tag

In [7]:
# Text content of the tag.
h1.text

'This is an h1'

In [8]:
# The tag's HTML attributes as a dict.
h1.attrs

{'class': ['foobar'], 'id': 'title'}

In [9]:
# find() yields None when there is no match, so guard before touching .text.
if h1 is not None:
    print(h1.text)

This is an h1


## soup.find_all()

Returns a **LIST** of `Tag` objects that match your query (an empty list if nothing matches)

In [10]:
# Every <h1> element in the document.
h1_tags = soup.find_all(name='h1')

In [11]:
# Text of each matched heading.
[heading.text for heading in h1_tags]

['This is an h1', 'This is yet another heading.']

In [12]:
# Attribute dict of each matched heading.
[heading.attrs for heading in h1_tags]

[{'class': ['foobar'], 'id': 'title'}, {'class': ['foobar']}]

## Creating a Pandas DataFrame from a Scrape

In [13]:
# A list of dicts maps straight onto a DataFrame: one dict per row, keys become columns.
people = [
    dict(name='Bethany', market='BOS'),
    dict(name='Tucker', market='NYC'),
]

pd.DataFrame(people)

Unnamed: 0,name,market
0,Bethany,BOS
1,Tucker,NYC


### Todo List

In [14]:
# First <ol> whose class is "done"; the bare name below displays it.
ol = soup.find('ol', attrs={'class': 'done'})

ol

<ol class="done">
<li>Mow lawn</li>
<li class="foobar"><span>Take out compost</span></li>
<li><span>Create scraping lecture</span></li>
</ol>

In [15]:
# One record per <li> inside the list, holding that item's text.
todos = [{'task': li.text} for li in ol.find_all('li')]

pd.DataFrame(todos)

Unnamed: 0,task
0,Mow lawn
1,Take out compost
2,Create scraping lecture


## GA Directory

In [16]:
# The table whose id is "directory".
table = soup.find('table', attrs={'id': 'directory'})

In [18]:
# Build one record per row in the directory table's <tbody>.
people = []

for row in table.find('tbody').find_all('tr'):
    # First anchor in the row: its text is used as the name, its href as the email.
    link = row.find('a')

    person = {
        'name': link.text.strip(),
        # Drop the 'mailto:' prefix so only the address remains.
        'email': link.attrs['href'].replace('mailto:', ''),
        # Text of the first <td> in the row.
        'role': row.find('td').text.strip(),
    }
    people.append(person)

people

[{'name': 'Praveen', 'email': 'praveen@ga.co', 'role': 'Student'},
 {'name': 'Fred', 'email': 'fred@ga.co', 'role': 'Student'},
 {'name': 'Homer', 'email': 'homer@ga.co', 'role': 'Student'},
 {'name': 'Kyle', 'email': 'kyle@ga.co', 'role': 'Student'},
 {'name': 'Sam', 'email': 'sam@ga.co', 'role': 'Student'},
 {'name': 'Javier', 'email': 'javier@ga.co', 'role': 'Student'},
 {'name': 'Nengkuan', 'email': 'nengkuan@ga.co', 'role': 'Student'},
 {'name': 'Kieth', 'email': 'kieth@ga.co', 'role': 'Student'},
 {'name': 'Bola', 'email': 'bola@ga.co', 'role': 'Student'},
 {'name': 'Steve', 'email': 'steve@ga.co', 'role': 'Student'},
 {'name': 'Nichole', 'email': 'nichole@ga.co', 'role': 'Instructor'},
 {'name': 'Riley', 'email': 'riley@ga.co', 'role': 'Instructor'}]

In [19]:
# Turn the scraped list of dicts into a DataFrame (keys become the columns).
pd.DataFrame(people)

Unnamed: 0,name,email,role
0,Praveen,praveen@ga.co,Student
1,Fred,fred@ga.co,Student
2,Homer,homer@ga.co,Student
3,Kyle,kyle@ga.co,Student
4,Sam,sam@ga.co,Student
5,Javier,javier@ga.co,Student
6,Nengkuan,nengkuan@ga.co,Student
7,Kieth,kieth@ga.co,Student
8,Bola,bola@ga.co,Student
9,Steve,steve@ga.co,Student


## Basketball Reference

In [21]:
url = 'https://www.basketball-reference.com/'

# Pass a timeout: without one, requests can wait indefinitely on a stalled server.
res = requests.get(url, timeout=10)

# Show the status code before parsing the body.
print(res.status_code)

soup = BeautifulSoup(res.content, 'lxml')

200


In [22]:
teams = []

# One table per conference code; the table ids are 'confs_standings_E' and 'confs_standings_W'.
for conf in ['E', 'W']:
    table = soup.find('table', {'id': 'confs_standings_'+conf})

    for row in table.find('tbody').find_all('tr'):
        # First anchor in the row: its text is the slug, its title attribute the team name.
        team_link = row.find('a')

        team = {}
        team['slug'] = team_link.text
        team['name'] = team_link.attrs['title']
        # Look cells up by their data-stat attribute rather than by column position;
        # the old positional lookup (find_all('td')[2]) was a dead store overwritten here.
        team['wins'] = row.find('td', {'data-stat': 'wins'}).text
        team['losses'] = row.find('td', {'data-stat': 'losses'}).text
        # [1:-1] drops the first and last characters of the span text
        # (presumably surrounding parentheses - confirm against the page).
        team['rank'] = row.find('span').text.strip()[1:-1]
        team['conference'] = conf

        teams.append(team)

df = pd.DataFrame(teams)

df

Unnamed: 0,slug,name,wins,losses,rank,conference
0,BRK,Brooklyn Nets,14,6,1,E
1,CHI,Chicago Bulls,14,8,2,E
2,MIA,Miami Heat,13,8,3,E
3,WAS,Washington Wizards,13,8,4,E
4,MIL,Milwaukee Bucks,13,8,5,E
5,CHO,Charlotte Hornets,13,10,6,E
6,NYK,New York Knicks,11,9,7,E
7,PHI,Philadelphia 76ers,11,10,8,E
8,CLE,Cleveland Cavaliers,11,10,9,E
9,ATL,Atlanta Hawks,11,10,10,E
