In [2]:
# Fetch the Wikipedia portal page; the cells below extract its links to the other wiki projects.
import requests
resp = requests.get('https://wikipedia.org/', timeout=10)  # requests never times out by default
resp.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
html = resp.text

In [4]:
# Regex approach: capture the href of every <a> tag whose attributes mention
# "other-project-link". The pattern only matches when that marker appears
# before href inside the same tag.
import re
re.findall(r'<a[^>]*other-project-link[^>]*href="([^"]*)', html)

['//commons.wikimedia.org/',
 '//www.wikivoyage.org/',
 '//www.wiktionary.org/',
 '//www.wikibooks.org/',
 '//www.wikinews.org/',
 '//www.wikidata.org/',
 '//www.wikiversity.org/',
 '//www.wikiquote.org/',
 '//www.mediawiki.org/',
 '//www.wikisource.org/',
 '//species.wikimedia.org/',
 '//meta.wikimedia.org/']

In [8]:
# Same extraction with an HTML parser instead of a regex:
# select the <a> tags by CSS class and read their href attribute.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
[anchor['href'] for anchor in soup.find_all('a', class_='other-project-link')]

['//commons.wikimedia.org/',
 '//www.wikivoyage.org/',
 '//www.wiktionary.org/',
 '//www.wikibooks.org/',
 '//www.wikinews.org/',
 '//www.wikidata.org/',
 '//www.wikiversity.org/',
 '//www.wikiquote.org/',
 '//www.mediawiki.org/',
 '//www.wikisource.org/',
 '//species.wikimedia.org/',
 '//meta.wikimedia.org/']

In [14]:
# A small, self-contained HTML document for exploring the basic BeautifulSoup API.
html = """<!DOCTYPE html>
<html lang='en'>
    <head>
        <title>test page</title>
    </head>
    <body class='mybody' id='js-body'>
        <p class='text odd'>first <b>bold</b> paragraph</p>
        <p class='text even'>second <a href='https://mail.ru'>link</a></p>
        <p class='list odd'>third <a id='paragraph'><b>bold link</b></a></p>
    </body>
</html>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

In [16]:
# Print the parsed tree with one tag or string per line, indented by nesting depth.
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   test page
  </title>
 </head>
 <body class="mybody" id="js-body">
  <p class="text odd">
   first
   <b>
    bold
   </b>
   paragraph
  </p>
  <p class="text even">
   second
   <a href="https://mail.ru">
    link
   </a>
  </p>
  <p class="list odd">
   third
   <a id="paragraph">
    <b>
     bold link
    </b>
   </a>
  </p>
 </body>
</html>


In [17]:
# Attribute access by tag name returns the first matching tag: here the first <p>.
soup.p

<p class="text odd">first <b>bold</b> paragraph</p>

In [18]:
# Elements of the tree are bs4.element.Tag objects.
type(soup.p)

bs4.element.Tag

In [19]:
# Tag-name access can be chained; a nested tag is a Tag as well.
type(soup.p.b)

bs4.element.Tag

In [20]:
# The text inside a tag is exposed via .string as a NavigableString, not a Tag.
type(soup.p.b.string)

bs4.element.NavigableString

In [21]:
# The text content of the first <b> inside the first <p>.
soup.p.b.string

'bold'

In [22]:
# .name holds the tag's name.
soup.b.name

'b'

In [23]:
# Attributes are read with dict-style indexing; class comes back as a list of its values.
soup.p['class']

['text', 'odd']

In [25]:
# id comes back as a plain string.
soup.body['id']

'js-body'

In [26]:
# .parent is the tag that directly encloses this one.
soup.p.b.parent

<p class="text odd">first <b>bold</b> paragraph</p>

In [27]:
# .parents walks every ancestor, nearest first, ending at the document root ('[document]').
[tag.name for tag in soup.p.b.parents]

['p', 'body', 'html', '[document]']

In [28]:
# .next moves to the following element in document order: here the paragraph's first text node.
soup.p.next

'first '

In [29]:
# One step further in document order reaches the <b> tag that follows the text.
soup.p.next.next

<b>bold</b>

In [30]:
# Siblings include text nodes: the newline between the two <p> tags is the next sibling.
soup.p.next_sibling

'\n'

In [31]:
# Step over the whitespace text node to reach the next <p> tag.
soup.p.next_sibling.next_sibling

<p class="text even">second <a href="https://mail.ru">link</a></p>

In [32]:
# .contents is a list of the tag's direct children (strings and tags).
soup.p.contents

['first ', <b>bold</b>, ' paragraph']

In [36]:
list(soup.p.children)  # .children is an iterator, not a list; list() materializes it

['first ', <b>bold</b>, ' paragraph']

In [44]:
# Re-create the sample document to explore the more advanced BeautifulSoup search methods.
html = """<!DOCTYPE html>
<html lang='en'>
    <head>
        <title>test page</title>
    </head>
    <body class='mybody' id='js-body'>
        <p class='text odd'>first <b>bold</b> paragraph</p>
        <p class='text even'>second <a href='https://mail.ru'>link</a></p>
        <p class='list odd'>third <a id='paragraph'><b>bold link</b></a></p>
    </body>
</html>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

In [2]:
# Name of the tag that directly encloses the first <b>.
soup.b.parent.name

'p'

In [3]:
# find_parent searches the ancestors; here it selects the one with id='js-body'.
soup.p.b.find_parent(id='js-body').name

'body'

In [6]:
# The ancestor can also be selected by tag name.
soup.p.b.find_parent('body')['id']

'js-body'

In [7]:
# The immediate next sibling is the whitespace text node between the <p> tags.
soup.p.next_sibling

'\n'

In [8]:
# Two steps are needed to reach the next <p> when navigating siblings manually.
soup.p.next_sibling.next_sibling

<p class="text even">second <a href="https://mail.ru">link</a></p>

In [9]:
# find_next_sibling returns the first following sibling that matches the filter (class 'odd').
soup.p.find_next_sibling(class_='odd')

<p class="list odd">third <a id="paragraph"><b>bold link</b></a></p>

In [10]:
# .next_siblings yields every following sibling, whitespace text nodes included.
list(soup.p.next_siblings)

['\n',
 <p class="text even">second <a href="https://mail.ru">link</a></p>,
 '\n',
 <p class="list odd">third <a id="paragraph"><b>bold link</b></a></p>,
 '\n']

In [11]:
# With no filter, find_next_siblings returns the following sibling tags only;
# the whitespace strings seen in .next_siblings are not included.
soup.p.find_next_siblings()

[<p class="text even">second <a href="https://mail.ru">link</a></p>,
 <p class="list odd">third <a id="paragraph"><b>bold link</b></a></p>]

In [12]:
# Direct children of the first <p>, as a list.
soup.p.contents

['first ', <b>bold</b>, ' paragraph']

In [15]:
# .children exposes the same children as an iterator rather than a list.
soup.p.children

<list_iterator at 0x105406e50>

In [16]:
# Materialize the iterator to see its items.
list(soup.p.children)

['first ', <b>bold</b>, ' paragraph']

In [17]:
# find returns the first descendant matching the given tag name.
soup.p.find('b')

<b>bold</b>

In [18]:
# Search by attribute value, then read another attribute of the tag that was found.
soup.find(id='js-body')['class']

['mybody']

In [19]:
# Match on tag name and on the tag's text. `string=` is the current name of this
# filter; the older `text=` keyword is deprecated in recent BeautifulSoup releases.
soup.find('b', string='bold')

<b>bold</b>

In [22]:
# find_all returns every matching tag, in document order.
soup.find_all('p')

[<p class="text odd">first <b>bold</b> paragraph</p>,
 <p class="text even">second <a href="https://mail.ru">link</a></p>,
 <p class="list odd">third <a id="paragraph"><b>bold link</b></a></p>]

In [23]:
# A multi-word class_ value is matched against the class attribute as written,
# so it matches only when the classes appear in this exact order.
soup.find_all('p', class_='text odd')

[<p class="text odd">first <b>bold</b> paragraph</p>]

In [24]:
# Same classes in a different order: no match, because the markup says 'text odd'.
soup.find_all('p', class_='odd text')

[]

In [27]:
# CSS selectors test for each class independently, so the order in the selector does not matter.
print(soup.select('p.text.odd'))
print(soup.select('p.odd.text'))

[<p class="text odd">first <b>bold</b> paragraph</p>]
[<p class="text odd">first <b>bold</b> paragraph</p>]


In [28]:
# Positional CSS selector: the third <p> among its siblings.
soup.select('p:nth-of-type(3)')

[<p class="list odd">third <a id="paragraph"><b>bold link</b></a></p>]

In [29]:
# Child combinator: <b> tags that are direct children of an <a>.
soup.select('a > b')

[<b>bold link</b>]

In [30]:
# BeautifulSoup and Regular Expressions
# A compiled pattern can be used as the tag-name filter: keep tags whose name starts with "b".
import re
[element.name for element in soup.find_all(re.compile('^b'))]

['body', 'b', 'b']

In [31]:
# A list of names matches any of them: every <a> and <b> tag, in document order.
list(soup.find_all(['a', 'b']))

[<b>bold</b>,
 <a href="https://mail.ru">link</a>,
 <a id="paragraph"><b>bold link</b></a>,
 <b>bold link</b>]

In [45]:
# Keep a reference to the first <b> tag so it can be modified in a later cell.
tag = soup.b
tag

<b>bold</b>

In [46]:
# Tags are mutable: rename the tag, add an attribute and replace its text in place.
# The change shows up through soup.p because `tag` refers to a node inside the tree.
print(soup.p)
tag.name = 'i'
tag['id'] = 'myid'
tag.string = 'italic'
print(soup.p)

<p class="text odd">first <b>bold</b> paragraph</p>
<p class="text odd">first <i id="myid">italic</i> paragraph</p>


In [47]:
# Fetch the news.mail.ru front page and parse it for the extraction in the next cell.
# NOTE: this rebinds `html` and `soup`, replacing the sample document used above.
import requests
result = requests.get('https://news.mail.ru/', timeout=10)  # requests never times out by default
result.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
html = result.text
soup = BeautifulSoup(html, 'lxml')

In [64]:
# Build (section title, [headline, ...]) pairs.
# Each section title is a span.hdr__inner; its fifth ancestor is searched for the
# headline spans (span.link__text). The depth of 5 is tied to the page's markup at
# the time of writing -- presumably that ancestor wraps the whole section block.
[
    (
        header.string,
        [
            headline.string
            for headline in list(header.parents)[4].find_all('span', class_='link__text')
        ],
    )
    for header in soup.find_all('span', class_='hdr__inner')
]

[('Новости Санкт-Петербурга',
  ['Берегите зонтики: в\xa0Петербурге февральская «жара», дождь и\xa0ветер',
   'Двое покинувших карантин по\xa0коронавирусу вернулись в\xa0больницу',
   'Мединский рассказал об\xa0открытии памятника Александру III',
   'Девушка устроила скандал на борту самолета Петербург - Сочи']),
 ('Политика',
  ['Посол России в Турции прокомментировал угрозы в свой адрес',
   'Глава МИД Польши назвал РФ серьезной угрозой, которой ЕС должен противостоять вместе с США',
   'Переводчики Путина перечислили его самые затруднительные фразы']),
 ('Экономика',
  ['Банки получили инструкцию по борьбе с мошенниками',
   'Мишустин поручил снять ограничения на закупки иностранных лекарств',
   'Сбербанк опроверг сообщения об утечке данных клиентов']),
 ('Общество',
  ['Тысячи японцев собрались на «пикантный» фестиваль (фото)',
   'В РПЦ призвали не воспринимать буквально слова протоиерея Смирнова о гражданских женах',
   '«Триумвират тепла». Синоптик предрёк апрельскую погоду в М