
# Beautiful Soup (Keith Galli)

---




In [None]:
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4
# load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")
# convert to a Beautiful Soup object; the parser is named explicitly so the
# parse tree does not depend on which optional parsers are installed
soup = bs(r.content, 'html.parser')
# print out our html
# print(soup.prettify())

# find and find_all
# find() gives the first matching tag, find_all() gives every match in a list
first_header = soup.find('h2')
headers = soup.find_all('h2')
print(first_header, headers, '\n')

# a list of tag names matches any of the listed tags
first_header = soup.find(['h1', 'h2'])
headers = soup.find_all(['h1', 'h2'])
print(first_header, headers, '\n')

# pass attributes to the find/find_all function
# attrs maps attribute names to the values a tag must carry
paragraph = soup.find_all('p', attrs={"id": "paragraph-id"})
print(paragraph, '\n')

# nest find/find_all
# each call searches only inside the tag returned by the previous call
body = soup.find('body')
div = body.find('div')
header = div.find(['h1', 'h2'])
print(header, '\n')

# search for specific strings in find/find_all function
import re
# `string=` is the current keyword for filtering on a tag's text;
# `text=` is its deprecated alias
paragraphs = soup.find_all('p', string=re.compile('Some'))
print(paragraphs, '\n')

headers = soup.find_all('h2', string=re.compile('(H|h)eader'))
print(headers, '\n')

# select(css selector)
content = soup.select('div p') # only those paragraphs which are inside div
print(content, '\n')

content = soup.select('h2 ~ p') # all the <p> siblings that come after an <h2>
print(content, '\n')

content = soup.select('p#paragraph-id b') # bold text inside the paragraph with id paragraph-id
print(content, '\n') # show this result before `content` is reused below

content = soup.select('body > p') # direct child of <body>
print(content, '\n')
for p in content:
  print(p.select('i'))

content = soup.select('[align=middle]') # grab an element with a specific attribute value
print(content, '\n')

# get different properties of html
# Only the last bare expression of a cell is echoed, so every value that
# should be visible is printed explicitly.

header = soup.find('h2')
print(header.text, '\n')
div = soup.find('div')
print(div.prettify(), '\n')
print(div.get_text(), '\n')

# get a specific attribute from an element
link = soup.find('a')
print(link['href'], '\n')
paragraph = soup.select('p#paragraph-id')
print(paragraph[0]['id'])

# Path Syntax
print(soup.body.div.h1.string, '\n')

# Know the terms : Parent, Sibling, Child
print(soup.body.find('div').find_next_siblings(), '\n')

# A tag's children are available in a list called '.contents'
# A string does not have '.contents', because it can't contain anything:

div = soup.div
# iterating over a tag's children
for child in div.children:
  print(child)

# iterating over a tag's descendants:
for descendant in div.descendants:
  print(descendant)

# other methods for searching the tree
# (written as comments: a bare string literal at the end of a cell is echoed
# back as the cell's output)
#   find_parents(name, attrs, string, limit, **kwargs)
#   find_parent(name, attrs, string, **kwargs)
#   find_next_sibling(name, attrs, string, **kwargs)
#   find_next_siblings(name, attrs, string, limit, **kwargs)
#   find_previous_sibling(name, attrs, string, **kwargs)
#   find_previous_siblings(name, attrs, string, limit, **kwargs)
#   find_next(name, attrs, string, **kwargs)
#   find_all_next(name, attrs, string, limit, **kwargs)
#   find_previous(name, attrs, string, **kwargs)
#   find_all_previous(name, attrs, string, limit, **kwargs)

<h2>A Header</h2> [<h2>A Header</h2>, <h2>Another header</h2>] 

<h1>HTML Webpage</h1> [<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>] 

[<p id="paragraph-id"><b>Some bold text</b></p>] 

<h1>HTML Webpage</h1> 

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>] 

[<h2>A Header</h2>, <h2>Another header</h2>] 

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>] 

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>] 

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>] 

[<i>Some italicized text</i>]
[]
[<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>] 

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
 

' \nfind_parents(name, attrs, string, limit, **kwargs)\nfind_parent(name, attrs, string, **kwargs) \nfind_next_sibling(name, attrs, string, **kwargs)\nfind_next_siblings(name, attrs, string, limit, **kwargs)\nfind_previous_sibling(name, attrs, string, **kwargs)\nfind_previous_siblings(name, attrs, string, limit, **kwargs)\nfind_next(name, attrs, string, **kwargs)\nfind_all_next(name, attrs, string, limit, **kwargs)\nfind_previous(name, attrs, string, **kwargs)\nfind_all_previous(name, attrs, string, limit, **kwargs)\n'

## Exercises!

In [None]:
import re
import requests
from bs4 import BeautifulSoup as bs
# Fetch the exercise page and parse it with an explicitly named parser, so the
# tree is the same regardless of which optional parsers are installed.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

Task 1: grab all of the social links from the webpage

In [None]:
# Descendant selector: anchors below an element with class "social" that is
# itself somewhere inside a <ul>.
link = soup.select('ul .social a')
for anchor in link:
  print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [None]:
# [class~=social] matches elements whose class list contains the word
# "social"; the anchors nested inside them are collected.
link = soup.select("[class~=social] a")
for anchor in link:
  print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [None]:
# Two steps: locate the <ul class="socials"> container first, then gather the
# anchors nested inside it.
link = soup.find('ul', class_='socials')
l1 = link.select('a')
for anchor in l1:
  print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


Task 2: scrape the table

In [None]:
import pandas as pd

# Locate the stats table once; header and body cells are read relative to it.
table_info = soup.find('table', attrs={'class':'hockey-stats'})

# Column names: text of every <th> in the table head, with surrounding
# whitespace removed so the names match how the body cells are cleaned.
table_headers = table_info.select('thead th')
th = [header_cell.text.strip() for header_cell in table_headers]

# One list of stripped cell strings per <tr> of the table body.
table_data = table_info.select('tbody tr')
rows = [
  [cell.text.strip() for cell in table_row.find_all('td')]
  for table_row in table_data
]

df = pd.DataFrame(rows, columns = th) # defining the dataframe
df


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


Task 3 : grab all the fun facts that use the word 'is'

In [None]:
items = soup.find('ul', attrs={'class':'fun-facts'}).select('li')
# \b word boundaries restrict the match to the standalone word "is"; a bare
# 'is' would also match inside longer words such as "this" or "list".
word = re.compile(r'\bis\b')
for x in items:
  if word.search(x.text) is not None:
    print(x.text)

Middle name is Ronald
Dunkin Donuts coffee is better than Starbucks
A favorite book series of mine is Ender's Game
Current video game of choice is Rocket League
The band that I've seen the most times live is the Zac Brown Band


In [None]:
facts = soup.select('ul.fun-facts li')
# \b word boundaries restrict the match to the standalone word "is".
word_is = re.compile(r'\bis\b')
# Keep the text of the whole <li> whenever any string inside it matches, so a
# fact whose matching text sits in a nested tag is still printed in full.
facts_with_is = [fact.get_text() for fact in facts if fact.find(string = word_is)]
for x in facts_with_is:
  print(x)

Middle name is Ronald
Dunkin Donuts coffee is better than Starbucks
A favorite book series of mine is Ender's Game
Current video game of choice is Rocket League
The band that I've seen the most times live is the Zac Brown Band


Task 4: download all images

In [None]:
import urllib.request

url = 'https://keithgalli.github.io/web-scraping/'
def download_image(url, file_path, file_name):
  """Download `url` and store it as `file_path + file_name + '.jpg'`.

  Args:
    url: address of the resource to fetch.
    file_path: prefix of the target path; it is concatenated as-is, so it
      must already end with a path separator when it names a folder.
    file_name: target file name without extension ('.jpg' is appended).

  Returns:
    The path the file was written to.
  """
  full_path = file_path + file_name + '.jpg'
  # urlretrieve is a member of the urllib.request submodule; the top-level
  # urllib package has no attribute of that name in Python 3.
  urllib.request.urlretrieve(url, full_path)
  return full_path
img = soup.find_all('img')

for x in img:
  full_url = url + x['src']
  # Name each file after the last path segment of its src so the images do
  # not all overwrite one shared file.
  file_name = x['src'].split('/')[-1]
  response = requests.get(full_url)
  # Stop on a failed request instead of saving an error page as an image.
  response.raise_for_status()
  with open(file_name, 'wb') as handler:
    handler.write(response.content)

Task 5: solving the mystery challenge!

In [None]:
import re
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
soup = bs(r.content, 'html.parser')

main_url = 'https://keithgalli.github.io/web-scraping/'
secret_words = []
# The attribute selector only yields anchors that have an href beginning with
# "challenge", so anchors without an href are skipped instead of raising
# KeyError on x['href'].
for x in soup.select('a[href^="challenge"]'):
    full_url = main_url + x['href']
    r1 = requests.get(full_url)
    soup1 = bs(r1.content, 'html.parser')
    # each challenge page keeps its word in <p id="secret-word">
    secret_words.append(soup1.find('p', attrs={'id':'secret-word'}).text)
# join once at the end: no repeated string concatenation, no trailing space
secret_msg = ' '.join(secret_words)
print(secret_msg)

Make sure to smash that like button and subscribe !!! 


# Python Requests Tutorial (Corey Schafer)

---



In [None]:
import requests

In [None]:
r = requests.get('https://xkcd.com/353/')

In [None]:
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [None]:
help(r)

Help on Response in module requests.models object:

class Response(builtins.object)
 |  The :class:`Response <Response>` object, which contains a
 |  server's response to an HTTP request.
 |  
 |  Methods defined here:
 |  
 |  __bool__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if the status code of the response is between
 |      400 and 600 to see if there was a client error or a server error. If
 |      the status code, is between 200 and 400, this will return True. This
 |      is **not** a check to see if the response code is ``200 OK``.
 |  
 |  __enter__(self)
 |  
 |  __exit__(self, *args)
 |  
 |  __getstate__(self)
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self)
 |      Allows you to use a response as an iterator.
 |  
 |  __nonzero__(self)
 |      Returns True if :attr:`status_code` is less than 400.
 |      
 |      This attribute checks if

## Downloading image

In [None]:
# `r` holds the response for the comic's web page (HTML), so request the image
# file itself before writing bytes to a .png.
comic = requests.get('https://imgs.xkcd.com/comics/python.png')
comic.raise_for_status()
with open('comic.png', 'wb') as file:
  file.write(comic.content)

In [None]:
ls

comic.png  [0m[01;34msample_data[0m/


## Check whether we got a good response or not

In [None]:
r.status_code

200

In [None]:
r.ok #returns true for anything that is less than a 400 response

True

In [None]:
r.headers #all headers come back with response

{'Connection': 'keep-alive', 'Content-Length': '3436', 'Server': 'nginx', 'Content-Type': 'text/html; charset=UTF-8', 'Last-Modified': 'Wed, 22 Jun 2022 22:31:58 GMT', 'ETag': 'W/"62b3985e-2111"', 'Expires': 'Thu, 23 Jun 2022 15:07:42 GMT', 'Cache-Control': 'max-age=300', 'Content-Encoding': 'gzip', 'Via': '1.1 varnish, 1.1 varnish', 'Accept-Ranges': 'bytes', 'Date': 'Fri, 24 Jun 2022 01:42:01 GMT', 'Age': '288', 'X-Served-By': 'cache-dfw18630-DFW, cache-fty21348-FTY', 'X-Cache': 'HIT, HIT', 'X-Cache-Hits': '1, 1', 'X-Timer': 'S1656034922.648346,VS0,VE62', 'Vary': 'Accept-Encoding'}

In [None]:
# Query-string parameters used for the GET example.
payload_g = dict(page=2, count=25)
# Form fields used for the POST examples.
payload_p = dict(username='riddhi', password='test')

In [None]:
r = requests.get('http://httpbin.org/get', params = payload_g)

In [None]:
r.url

'http://httpbin.org/get?page=2&count=25'

In [None]:
r.text

'{\n  "args": {\n    "count": "25", \n    "page": "2"\n  }, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.23.0", \n    "X-Amzn-Trace-Id": "Root=1-62b51c41-14532f726ae6c399013b1e52"\n  }, \n  "origin": "35.231.250.49", \n  "url": "http://httpbin.org/get?page=2&count=25"\n}\n'

In [None]:
r = requests.post('http://httpbin.org/post', params = payload_p)

In [None]:
r.url

'http://httpbin.org/post?username=riddhi&password=text'

In [None]:
r.text

'{\n  "args": {\n    "password": "text", \n    "username": "riddhi"\n  }, \n  "data": "", \n  "files": {}, \n  "form": {}, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate", \n    "Content-Length": "0", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.23.0", \n    "X-Amzn-Trace-Id": "Root=1-62b51cc6-7b76dcb270977be106662ddf"\n  }, \n  "json": null, \n  "origin": "35.231.250.49", \n  "url": "http://httpbin.org/post?username=riddhi&password=text"\n}\n'

In [None]:
r = requests.post('http://httpbin.org/post', data = payload_p)

In [None]:
r.url

'http://httpbin.org/post'

In [None]:
r.text

'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "password": "text", \n    "username": "riddhi"\n  }, \n  "headers": {\n    "Accept": "*/*", \n    "Accept-Encoding": "gzip, deflate", \n    "Content-Length": "29", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "python-requests/2.23.0", \n    "X-Amzn-Trace-Id": "Root=1-62b51ccf-1cc2de4653b7daaf4a4bb3c0"\n  }, \n  "json": null, \n  "origin": "35.231.250.49", \n  "url": "http://httpbin.org/post"\n}\n'

In [None]:
r.json()

{'args': {},
 'data': '',
 'files': {},
 'form': {'password': 'text', 'username': 'riddhi'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '29',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.23.0',
  'X-Amzn-Trace-Id': 'Root=1-62b51ccf-1cc2de4653b7daaf4a4bb3c0'},
 'json': None,
 'origin': '35.231.250.49',
 'url': 'http://httpbin.org/post'}

In [None]:
r = requests.post('http://httpbin.org/post', params = payload_p)

# tabula

In [None]:
!pip install tabula-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tabula-py
  Downloading tabula_py-2.4.0-py3-none-any.whl (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 26.9 MB/s 
Collecting distro
  Downloading distro-1.7.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.7.0 tabula-py-2.4.0


In [None]:
import tabula

In [None]:
import os
import pandas as pd
import requests
import urllib.request
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup as bs

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
url = 'https://www.premierleague.com/publications'

folder_location = r'/content/drive/MyDrive/Colab Notebooks/primere_league'
# makedirs creates any missing parent folders as well and does nothing when
# the folder already exists.
os.makedirs(folder_location, exist_ok=True)

In [None]:
r = requests.get(url)
soup = bs(r.text, 'html.parser')
# Only follow anchors whose href ends in ".pdf". Iterating over every <a>
# raises KeyError for anchors without an href, and an href ending in "/"
# yields an empty file name, so open() is pointed at the folder itself and
# fails with an OSError.
link = soup.select("a[href$='.pdf']")
for l in link:
  filename = os.path.join(folder_location, l['href'].split('/')[-1])
  with open(filename, 'wb') as f:
    f.write(requests.get(urljoin(url, l['href'])).content)

OSError: ignored

# practice

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

from google.colab import drive
drive.mount('/content/drive')

url = 'https://www.premierleague.com/publications'
folder_location = r'/content/drive/MyDrive/Colab Notebooks/premier_league'
# creates missing parent folders too; no error when the folder already exists
os.makedirs(folder_location, exist_ok=True)

response = requests.get(url)
# fail loudly if the listing page itself could not be fetched
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# every anchor whose href ends in ".pdf" is one publication to download
for link in soup.select("a[href$='.pdf']"):
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    pdf_response = requests.get(urljoin(url, link['href']))
    # skip failed downloads rather than saving an error page under a .pdf name
    if not pdf_response.ok:
        print('skipped', link['href'], pdf_response.status_code)
        continue
    with open(filename, 'wb') as f:
        f.write(pdf_response.content)

Mounted at /content/drive


In [None]:
# number of anchors whose href ends in ".pdf"
c = len(soup.select("a[href$='.pdf']"))
print(c)

106


In [None]:
import requests
from bs4 import BeautifulSoup as bs 
# Fetch one Senate roll-call vote page and parse it.
r = requests.get("https://www.senate.gov/legislative/LIS/roll_call_votes/vote1171/vote_117_1_00205.htm#name")
# NOTE(review): no parser is named here, unlike the cells that pass
# 'html.parser' — confirm which parser this page is meant to be read with.
soup = bs(r.content)
print(soup.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- [if lt IE 7]> <html class="ie6 oldie"> <![endif] -->
<!-- [if IE 7]>    <html class="ie7 oldie"> <![endif] -->
<!-- [if IE 8]>    <html class="ie8 oldie"> <![endif] -->
<!-- [if gt IE 8]> <! -->
<html class="">
 <!-- <![endif] -->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="vote_117_1_00205.xml" name="object"/>
  <meta content="" name="version"/>
  <meta content="/Company Home/Sites/senategov/documentLibrary/Senate.gov/legislative/LIS/roll_call_votes/vote1171" name="path"/>
  <meta content="Tuesday, May 25, 2021" name="date"/>
  <meta content="6:30:09 PM EDT" name="time"/>
  <meta content="" name="keywords"/>
  <meta content="legislative" name="bucket"/>
  <meta content="" name="description"/>
  <title>
   U.S. Se

In [None]:

# Child combinator: <b> tags that are direct children of a <div> with class
# "contenttext".
content = soup.select('div.contenttext > b')
print(content)

[<b>Vote Number:   </b>, <b> Vote Date: </b>, <b> Required For Majority:   </b>, <b> Vote Result:   </b>, <b>Amendment Number: </b>, <b>Statement of Purpose: </b>, <b>Vote Counts:</b>, <b>YEAs ---</b>, <b>53</b>, <b>NAYs ---</b>, <b>46</b>, <b>Not Voting -
			 1</b>, <b>Alabama:</b>, <b>Yea</b>, <b>Yea</b>, <b>Alaska:</b>, <b>Yea</b>, <b>Yea</b>, <b>Arizona:</b>, <b>Yea</b>, <b>Yea</b>, <b>Arkansas:</b>, <b>Yea</b>, <b>Yea</b>, <b>California:</b>, <b>Nay</b>, <b>Nay</b>, <b>Colorado:</b>, <b>Nay</b>, <b>Nay</b>, <b>Connecticut:</b>, <b>Nay</b>, <b>Nay</b>, <b>Delaware:</b>, <b>Nay</b>, <b>Nay</b>, <b>Florida:</b>, <b>Yea</b>, <b>Yea</b>, <b>Georgia:</b>, <b>Nay</b>, <b>Nay</b>, <b>Hawaii:</b>, <b>Nay</b>, <b>Nay</b>, <b>Idaho:</b>, <b>Yea</b>, <b>Yea</b>, <b>Illinois:</b>, <b>Nay</b>, <b>Nay</b>, <b>Indiana:</b>, <b>Yea</b>, <b>Yea</b>, <b>Iowa:</b>, <b>Yea</b>, <b>Yea</b>, <b>Kansas:</b>, <b>Yea</b>, <b>Yea</b>, <b>Kentucky:</b>, <b>Yea</b>, <b>Yea</b>, <b>Louisiana:</b>, <b>Yea</b>, 