## HTML Parsing using BeautifulSoup

In [106]:
from bs4 import BeautifulSoup

In [145]:
# Triple quotes (""" ... """) delimit a multi-line string literal.
# html_doc holds a small sample page: one <h1> and three <a> links that
# share the class "links" and each carry a second, colour-named class.
html_doc = """
<html>
  <head>
  </head>
  <body>
    <h1 class="title">My awesome website</h1>
    <a id="one" class="links red" href="/one">One</a>
    <a id="two" class="links blue" href="/two">Two</a>
    <a id="three" class="links yellow" href="/three">Three</a>
  </body>
</html>
"""

In [112]:
type(html_doc)

str

In [148]:
# Parse the HTML string into a BeautifulSoup object. The second argument
# names the parser to use; "lxml" is a third-party parser and has to be
# installed separately from bs4.
soup = BeautifulSoup(html_doc, "lxml")
type(soup)

bs4.BeautifulSoup

常用BeautifulSoup methods
---
1. find_all()
2. select()

In [119]:
# find_all() can filter by:
# 1. tag name
# 2. class
# 3. id
# `soup` is the parsed form of html_doc.
# Here: collect every <a> tag in the document.
soup.find_all('a')

[<a class="links" href="/one" id="one">One</a>,
 <a class="links" href="/two" id="two">Two</a>,
 <a class="links" href="/three" id="three">Three</a>]

In [120]:
soup.find_all('h1')

[<h1 class="title">My awesome website</h1>]

In [123]:
# Filter by CSS class. The keyword is spelled class_ (trailing underscore)
# because `class` is a reserved word in Python.
soup.find_all(class_='links')

[<a class="links" href="/one" id="one">One</a>,
 <a class="links" href="/two" id="two">Two</a>,
 <a class="links" href="/three" id="three">Three</a>]

In [124]:
soup.find_all(id='one')

[<a class="links" href="/one" id="one">One</a>]

In [149]:
soup.find_all('a', class_='links')

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [150]:
# A class_ value containing several words is compared against the whole
# class attribute string, so only the tag whose attribute is exactly
# "links blue" is returned.
soup.find_all('a', class_='links blue')

[<a class="links blue" href="/two" id="two">Two</a>]

In [126]:
soup.find_all(string="One")

[u'One']

In [130]:
# Combine a tag filter with a text filter: <a> tags whose string is "One".
one = soup.find_all("a", string="One")
one

[<a class="links" href="/one" id="one">One</a>]

In [131]:
# find_all() hands back a ResultSet, which is list-like: it can be
# indexed and iterated the same way a list can.
type(one)

bs4.element.ResultSet

In [132]:
dir(one)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__delslice__',
 '__dict__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getslice__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__setslice__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'append',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort',
 'source']

In [134]:
tag_one = one[0]

In [135]:
# A Tag is a dictionary-like object: its HTML attributes are read with
# subscript syntax, e.g. tag_one['href'].
type(tag_one)

bs4.element.Tag

In [136]:
dir(tag_one)

['HTML_FORMATTERS',
 'XML_FORMATTERS',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__doc__',
 '__eq__',
 '__format__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__len__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'can_be_empty_element',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decompose',
 'descendants',
 'encode',
 'encode_contents',
 'extract',
 'fetchNex

In [138]:
# <a class="links" href="/one" id="one">One</a>
tag_one['href']

'/one'

In [139]:
tag_one['id']

'one'

In [140]:
# 'class' comes back as a list rather than a plain string, because an
# element may carry several classes.
tag_one['class']

['links']

In [141]:
tag_one.string

u'One'

In [142]:
type(tag_one.string)

bs4.element.NavigableString

### CSS Selector

In [154]:
# Triple quotes (""" ... """) delimit a multi-line string literal.
# Same sample page as the earlier html_doc, now with a <title> element
# inside <head> so that the selector examples have a head child to match.
html_doc = """
<html>
  <head>
  <title>My awesome website</title>
  </head>
  <body>
    <h1 class="title">My awesome website</h1>
    <a id="one" class="links red" href="/one">One</a>
    <a id="two" class="links blue" href="/two">Two</a>
    <a id="three" class="links yellow" href="/three">Three</a>
  </body>
</html>
"""

In [155]:
soup = BeautifulSoup(html_doc, 'lxml')

In [156]:
soup.select('h1')

[<h1 class="title">My awesome website</h1>]

In [157]:
soup.select("a")

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [158]:
soup.select("body a")

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [159]:
# ">" is the CSS child combinator: match <title> elements that are direct
# children of <head>.
soup.select("head > title")

[<title>My awesome website</title>]

In [160]:
soup.select("#one")

[<a class="links red" href="/one" id="one">One</a>]

In [163]:
links = soup.select(".links")
links

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [164]:
type(links)

list

In [167]:
first_link = links[0]
first_link

<a class="links red" href="/one" id="one">One</a>

In [168]:
type(first_link)

bs4.element.Tag

In [169]:
# Triple quotes (""" ... """) delimit a multi-line string literal.
# NOTE(review): this text is the same as the previous html_doc, and no cell
# visible after this one calls BeautifulSoup(html_doc, ...) again, so the
# following cells appear to keep using the `soup` parsed earlier -- confirm
# whether this re-assignment is still needed.
html_doc = """
<html>
  <head>
  <title>My awesome website</title>
  </head>
  <body>
    <h1 class="title">My awesome website</h1>
    <a id="one" class="links red" href="/one">One</a>
    <a id="two" class="links blue" href="/two">Two</a>
    <a id="three" class="links yellow" href="/three">Three</a>
  </body>
</html>
"""

In [172]:
body_list = soup.select('body')
body_list

[<body>\n<h1 class="title">My awesome website</h1>\n<a class="links red" href="/one" id="one">One</a>\n<a class="links blue" href="/two" id="two">Two</a>\n<a class="links yellow" href="/three" id="three">Three</a>\n</body>]

In [183]:
len(body_list)

1

In [173]:
type(body_list)

list

In [174]:
body_tag = body_list[0]

In [175]:
type(body_tag)

bs4.element.Tag

In [176]:
body_tag.find_all('a')

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [182]:
# The body-then-links lookup written as a single expression: take the first
# <body> match, then select the <a> tags beneath it.
soup.select('body')[0].select('a')

[<a class="links red" href="/one" id="one">One</a>,
 <a class="links blue" href="/two" id="two">Two</a>,
 <a class="links yellow" href="/three" id="three">Three</a>]

In [184]:
len(soup.select('body')[0].select('a'))

3

In [190]:
# Extract all URLs.
# select() returns a list of Tag objects. A Tag is a dictionary-like object,
# so link['href'] reads its href attribute ('/one', '/two', '/three', ...).
links_list = soup.select('body')[0].select('a')
article_links_list = [link['href'] for link in links_list]

# print(...) with one argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(article_links_list)

['/one', '/two', '/three']
