# Set-up and Workflow

### Importing the packages

In [1]:
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
base_site = "https://en.wikipedia.org/wiki/Music"

response = requests.get(base_site)
response.status_code

200

In [3]:
html = response.content
# Checking that the reply is indeed an HTML code by inspecting the first 100 symbols
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la'

### Making the soup

In [4]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using the default parser as it is included in Python
soup = BeautifulSoup(html, "html.parser")

In [5]:
# Exporting the HTML to a file
with open('Wiki_response.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

# Searching and navigating the HTML tree

## Searching - find() and find_all()

In [6]:
# The soup variable (BeautifulSoup object) we defined earlier can be seen as representing the whole document
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-l

In [7]:
# We can search by tag name
# This returns as the element with all its contents and all nested elements inside
soup.find('head')

<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTab

In [8]:
# If there is no result it returns None value
# Note: None is not displayed in IPython unless print() or repr() is used
soup.find('video')

In [9]:
# Display the None value
print(soup.find('video'))

None


In [10]:
# verify the type of output
type(soup.find('video'))

NoneType

In [11]:
# .find() returns only the first such result
soup.find('a')

<a class="mw-jump-link" href="#bodyContent">Jump to content</a>

In [12]:
# If we want all the results we use find_all() 
links = soup.find_all('a')
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a>,
 <a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a>,
 <a href="/wiki/Wikipedia:Community_portal" title="The

##### So `.find()` returns the first tag matching the search & `.find_all()` returns a list of all tags

In [13]:
isinstance(links, list)

True

In [14]:
# If no result is found it returns an empty list
soup.find_all('video')

[]

In [15]:
# How many links are on the page?
len(links)

2500

##### So far we have searched the whole document for elements, but it is possible to search only a specific part of it, This is achived by applying the methods to an element instead of the Beautiful Soup object

In [16]:
# Usually, we prefer to store the result in a variable
# Let's store the body of a table in a table variable
table = soup.find('tbody')

In [17]:
# Inspect the value of the variable
table

<tbody><tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr><tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr><tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
<li><a class="mw-selflink selflink

In [18]:
# Inspect the type of the variable
type(table)

bs4.element.Tag

That `tag` type can be treated and searched in the same way we search the whole document object

In [19]:
table.find_all('td')

[<td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td>,
 <td class="sidebar-content hlist">
 <ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
 <li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
 <li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
 <li><a href="/wiki/Clown" title="Clown">Clown</a></li>
 <li><a href="/wiki/Dance" title="Dance">Dance</a></li>
 <li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
 <li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
 <li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
 <li><a class="mw-selflink selflink">Music</a></li>
 <li><a href="/wiki/Opera" title="Opera">Opera</a></li>
 <li><a href="/wiki/Professional_wrestling" title="Professional wrestling">Professional wrestling</a></li>
 <li><a href="/wiki/Pupp

In [20]:
len(table.find_all('td'))

3

## Navigating the tree

In [21]:
# Navigating down
# A tag's children (the tages inside) are stored in a list, accessed with .contents
table.contents

[<tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr>,
 <tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>,
 <tr><td class="sidebar-content hlist">
 <ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
 <li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
 <li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
 <li><a href="/wiki/Clown" title="Clown">Clown</a></li>
 <li><a href="/wiki/Dance" title="Dance">Dance</a></li>
 <li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
 <li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
 <li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
 <li><a class="mw-selflink

In [22]:
len(table.contents)

4

In [23]:
table.contents[1]

<tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>

In [24]:
# We can also go up the tree with .parent
table.parent

<table class="sidebar nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr><tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr><tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime

In [25]:
# table.parent is also a tag
# Thus, we can use .parent on it as well
table.parent.parent

<div class="mw-content-ltr mw-parser-output" dir="ltr" lang="en"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Form of art using sound</div>
<style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<figure class="mw-default-size" typeof="mw:File/Thumb"><a class="mw-file-description" href="/wiki/File:The_Sounds_of_Earth_-_GPN-2000-001976.jpg"><img class="mw-file-element" data-file-height="2388" data-file-width="2389" deco

In [26]:
# We use .parent to go up the tree
# But what about .children?
table.children

<list_iterator at 0x22282d44790>

#### If we want a list of an element's children, we need to use  `table.contents`  as shown before <br> `.children` is an iterator over that list, which means we can use it in a for loop to iterate over all the children

In [27]:
for child in table.children:
    print(child)

<tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr>
<tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>
<tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
<li><a class="mw-selflink selflink">Mus

## Searching by attributes

#### Search for tags with specific attributes value, in addition to their name

In [28]:
soup.find('div', id = 'siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

## There are two ways in which we can do that:

### Passing attributes as function parameters

In [29]:
# By writing them as function parameters
# Notice that since class is a reserved word, we write class_
soup.find_all('a', class_ = 'mw-jump-link')

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>]

In [30]:
# We can filter against multiple attributes at once
soup.find('a', class_ = 'mw-jump-link', href = '#bodyContent')

<a class="mw-jump-link" href="#bodyContent">Jump to content</a>

### Placing the attributes in a dictionary

By writting the attributes in a dictionary

less elegent but more robust and can deal with user defined attributes

In [31]:
soup.find('a', attrs={ 'class':'mw-jump-link', 'href':'#bodyContent' })

<a class="mw-jump-link" href="#bodyContent">Jump to content</a>

In [32]:
soup.find('div', {'id' : 'bodyContent'})

<div aria-labelledby="firstHeading" class="vector-body" data-mw-ve-target-container="" id="bodyContent">
<div class="vector-body-before-content">
<div class="mw-indicators">
<div class="mw-indicator" id="mw-indicator-pp-default"><div class="mw-parser-output"><span typeof="mw:File"><a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" class="mw-file-element" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a></span></div></div>
</div>
<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>
</div>
<div id="con

# Extracting data from the HTML tree

In [33]:
# Let's use some placeholder object to manipulate in the examples below
a = soup.find('a', class_ = 'mw-jump-link')
a

<a class="mw-jump-link" href="#bodyContent">Jump to content</a>

In [34]:
# We can obtain the name of the tag with the .name attribute
a.name

'a'

## Getting the attribute value

#### We can access a tag’s attributes by treating the tag just like a dictionary

Retuens the value as String

In [35]:
# First way
a['href']

'#bodyContent'

In [36]:
# Notice how multi-valued attributes, such as class, return as a list
a['class']

['mw-jump-link']

In [37]:
# Second way
a.get('href')

'#bodyContent'

In [38]:
# Again, class returns a list
a.get('class')

['mw-jump-link']

#### Differences between these methods manifest when the key is missing

 ` tag['missing-key'] ` returns an error, ex. `a['id']`

While in the second method, `tag.get('missing-key')` returns a default value None

In [39]:
a.get('id')

`repr()` shows official string representation of an object, display all special characters and combinations (None, \n...)

In [40]:
repr(a.get('id'))

'None'

### if you do not have any prior knowledge about the attributes

In [41]:
# get all attribute name-value pairs in a dictionary
a.attrs

{'class': ['mw-jump-link'], 'href': '#bodyContent'}

## Extracting the text

### .string vs .text

In [42]:
# We can access the raw string of an element by using .string
a.string

'Jump to content'

In [43]:
# Alternativelly we can use .text
a.text

'Jump to content'

#### They exhibit different behaviour when the element contains more than one distinct string

In [44]:
# This paragraph has many nested elements, with lots of different fragments of text
p = soup.find_all('p')[1]
p

<p><b>Music</b> is the arrangement of <a href="/wiki/Sound" title="Sound">sound</a> to create some combination of <a href="/wiki/Musical_form" title="Musical form">form</a>, <a href="/wiki/Harmony" title="Harmony">harmony</a>, <a href="/wiki/Melody" title="Melody">melody</a>, <a href="/wiki/Rhythm" title="Rhythm">rhythm</a>, or otherwise <a href="/wiki/Musical_expression" title="Musical expression">expressive content</a>.<sup class="reference" id="cite_ref-FOOTNOTE''OED''§&amp;nbsp;1_1-0"><a href="#cite_note-FOOTNOTE''OED''§&amp;nbsp;1-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-FOOTNOTE''AHD''§&amp;nbsp;1_2-0"><a href="#cite_note-FOOTNOTE''AHD''§&amp;nbsp;1-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-FOOTNOTEEpperson2022§_para._1_3-0"><a href="#cite_note-FOOTNOTEEpperson2022§_para._1-3"><span class="cite-bracket">[</span>3<span class="

In [45]:
# .text returns everything inside the element
p.text

'Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] Music is generally agreed to be a cultural universal that is present in all human societies.[4] Definitions of music vary widely in substance and approach.[5] While scholars agree that music is defined by a small number of specific elements, there is no consensus as to what these necessary elements are.[6] Music is often characterized as a highly versatile medium for expressing human creativity.[7] Diverse activities are involved in the creation of music, and are often divided into categories of composition, improvisation, and performance.[8] Music may be performed using a wide variety of musical instruments, including the human voice. It can also be composed, sequenced, or otherwise produced to be indirectly played mechanically or electronically, such as via a music box, barrel organ, or digital audio workstation software on a computer.\n'

In [46]:
# .string returns None when there is more than 1 string
p.string

In [47]:
repr(p.string)

'None'

In [48]:
p.parent

<div class="mw-content-ltr mw-parser-output" dir="ltr" lang="en"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Form of art using sound</div>
<style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<figure class="mw-default-size" typeof="mw:File/Thumb"><a class="mw-file-description" href="/wiki/File:The_Sounds_of_Earth_-_GPN-2000-001976.jpg"><img class="mw-file-element" data-file-height="2388" data-file-width="2389" deco

In [49]:
# We can stack different operations one after the other
p.parent.text

'Form of art using sound\nFor other uses, see Music (disambiguation).\n\n\nGrooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world\nPart of a series onPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nStand-up comedy\nStreet performance\nTheatre\nVentriloquism\nvte\nMusic is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] Music is generally agreed to be a cultural universal that is present in all human societies.[4] Definitions of music vary widely in substance and approach.[5] While scholars agree that music is defined by a small number of specific elements, there is no consensus as to what these necessary elements are.[6] Music is often characterized as a highly versatile medium for expressing human creativity.[7] Diverse activities are in

In [50]:
# semi-properly displayed text
print(p.parent.text)

Form of art using sound
For other uses, see Music (disambiguation).


Grooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world
Part of a series onPerforming arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Street performance
Theatre
Ventriloquism
vte
Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] Music is generally agreed to be a cultural universal that is present in all human societies.[4] Definitions of music vary widely in substance and approach.[5] While scholars agree that music is defined by a small number of specific elements, there is no consensus as to what these necessary elements are.[6] Music is often characterized as a highly versatile medium for expressing human creativity.[7] Diverse activities are involved in the creation of

In [51]:
# We can also use .get_text() instead of .text
p.parent.get_text()

'Form of art using sound\nFor other uses, see Music (disambiguation).\n\n\nGrooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world\nPart of a series onPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nStand-up comedy\nStreet performance\nTheatre\nVentriloquism\nvte\nMusic is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] Music is generally agreed to be a cultural universal that is present in all human societies.[4] Definitions of music vary widely in substance and approach.[5] While scholars agree that music is defined by a small number of specific elements, there is no consensus as to what these necessary elements are.[6] Music is often characterized as a highly versatile medium for expressing human creativity.[7] Diverse activities are in

In [52]:
print(p.parent.get_text())

Form of art using sound
For other uses, see Music (disambiguation).


Grooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world
Part of a series onPerforming arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Street performance
Theatre
Ventriloquism
vte
Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] Music is generally agreed to be a cultural universal that is present in all human societies.[4] Definitions of music vary widely in substance and approach.[5] While scholars agree that music is defined by a small number of specific elements, there is no consensus as to what these necessary elements are.[6] Music is often characterized as a highly versatile medium for expressing human creativity.[7] Diverse activities are involved in the creation of

In [53]:
# We can also extract the whole text of the webpage
# CAUTION: This includes JavaScript text, CSS and other not directly displayed text
print(soup.text)





Music - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Etymology and terminology








2
History




Toggle History subsection





2.1
Origins and prehistory








2.2
Antiquity








2.3
Asian cultures








2.4
Western classical






2.4.1
Early music








2.4.2
Common practice period






2.4.2.1
Baroque








2.4.2.2
Classicism








2.4.2.3
Romanticism












2.5
20th and 21st century










3
Creat

### .strings and .stripped_strings

In [54]:
# All strings inside an element can be accessed separatelly by using the .strings iterator

In [55]:
for s in p.strings:
    print(repr(s))

'Music'
' is the arrangement of '
'sound'
' to create some combination of '
'form'
', '
'harmony'
', '
'melody'
', '
'rhythm'
', or otherwise '
'expressive content'
'.'
'['
'1'
']'
'['
'2'
']'
'['
'3'
']'
' Music is generally agreed to be a '
'cultural universal'
' that is present in all human societies.'
'['
'4'
']'
' '
'Definitions of music'
' vary widely in substance and approach.'
'['
'5'
']'
' While scholars agree that music is defined by a small number of '
'specific elements'
', there is no consensus as to what these necessary elements are.'
'['
'6'
']'
' Music is often characterized as a highly versatile medium for expressing human '
'creativity'
'.'
'['
'7'
']'
' Diverse activities are involved in the creation of music, and are often divided into categories of '
'composition'
', '
'improvisation'
', and '
'performance'
'.'
'['
'8'
']'
' Music may be performed using a wide variety of '
'musical instruments'
', including the '
'human voice'
'. It can also be composed, sequenced,

In [56]:
# The extra whitespace can be removed by using the .stripped_strings iterator instead
for s in p.stripped_strings:
    print(repr(s))

'Music'
'is the arrangement of'
'sound'
'to create some combination of'
'form'
','
'harmony'
','
'melody'
','
'rhythm'
', or otherwise'
'expressive content'
'.'
'['
'1'
']'
'['
'2'
']'
'['
'3'
']'
'Music is generally agreed to be a'
'cultural universal'
'that is present in all human societies.'
'['
'4'
']'
'Definitions of music'
'vary widely in substance and approach.'
'['
'5'
']'
'While scholars agree that music is defined by a small number of'
'specific elements'
', there is no consensus as to what these necessary elements are.'
'['
'6'
']'
'Music is often characterized as a highly versatile medium for expressing human'
'creativity'
'.'
'['
'7'
']'
'Diverse activities are involved in the creation of music, and are often divided into categories of'
'composition'
','
'improvisation'
', and'
'performance'
'.'
'['
'8'
']'
'Music may be performed using a wide variety of'
'musical instruments'
', including the'
'human voice'
'. It can also be composed, sequenced, or otherwise produced to b

## Exercise

### 1- Extract a list of the title of all links on the page. Only links that have the ‘href’ attribute are considered. You can also clean the list from ‘None’ values.

In [57]:
# Finding all links on the page 
links = soup.find_all('a')
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a>,
 <a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a>,
 <a href="/wiki/Wikipedia:Community_portal" title="The

In [58]:
# Dropping the links without href attribute
clean_links = [l for l in links if l.get('href') != None]
clean_links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a>,
 <a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a>,
 <a href="/wiki/Wikipedia:Community_portal" title="The

In [59]:
# Getting all titles of links using a list comprehension
titles = [l.get('title') for l in clean_links]
titles

[None,
 'Visit the main page [z]',
 'Guides to browsing Wikipedia',
 'Articles related to current events',
 'Visit a randomly selected article [x]',
 'Learn about Wikipedia and how it works',
 'How to contact Wikipedia',
 'Guidance on how to use and edit Wikipedia',
 'Learn how to edit Wikipedia',
 'The hub for editors',
 'A list of recent changes to Wikipedia [r]',
 'Add images or other media for use on Wikipedia',
 None,
 'Search Wikipedia [f]',
 None,
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 None,
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 None,
 'A list of edits made from this IP address [y]',
 'Discussion about edits from this IP address [n]',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 Non

In [60]:
# Removing the 'None' titles
clean_titles = [t for t in titles if t != None]
clean_titles

['Visit the main page [z]',
 'Guides to browsing Wikipedia',
 'Articles related to current events',
 'Visit a randomly selected article [x]',
 'Learn about Wikipedia and how it works',
 'How to contact Wikipedia',
 'Guidance on how to use and edit Wikipedia',
 'Learn how to edit Wikipedia',
 'The hub for editors',
 'A list of recent changes to Wikipedia [r]',
 'Add images or other media for use on Wikipedia',
 'Search Wikipedia [f]',
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'A list of edits made from this IP address [y]',
 'Discussion about edits from this IP address [n]',
 'Musiek – Afrikaans',
 'Musik – Alemannic',
 'ሙዚቃ – Amharic',
 'Muusik – Inari Sami',
 'Drēam – Old English',
 'موسيقى – Arabic',
 'Musica – Aragonese',
 'ܙܡ

### 2- Extract the text of all ‘h2’ tags.

In [61]:
# Inspect all h2 tags
soup.find_all('h2')

[<h2 class="vector-pinnable-header-label">Contents</h2>,
 <h2 id="Etymology_and_terminology">Etymology and terminology</h2>,
 <h2 id="History">History</h2>,
 <h2 id="Creation">Creation</h2>,
 <h2 id="Art_and_entertainment">Art and entertainment</h2>,
 <h2 id="Elements">Elements</h2>,
 <h2 id="Philosophy">Philosophy</h2>,
 <h2 id="Psychology">Psychology</h2>,
 <h2 id="Sociological_aspects">Sociological aspects</h2>,
 <h2 id="Media_and_technology">Media and technology</h2>,
 <h2 id="Education">Education</h2>,
 <h2 id="Academic_study">Academic study</h2>,
 <h2 id="Therapy">Therapy</h2>,
 <h2 id="See_also">See also</h2>,
 <h2 id="References">References</h2>,
 <h2 id="Further_reading">Further reading</h2>,
 <h2 id="External_links">External links</h2>]

In [62]:
# Get the text
h2_strings = [h2.string for h2 in soup.find_all('h2')]
h2_strings

['Contents',
 'Etymology and terminology',
 'History',
 'Creation',
 'Art and entertainment',
 'Elements',
 'Philosophy',
 'Psychology',
 'Sociological aspects',
 'Media and technology',
 'Education',
 'Academic study',
 'Therapy',
 'See also',
 'References',
 'Further reading',
 'External links']

### 3- Find, extract and print the text of the footer of the page.

In [63]:
# By inspection: we see that the footer is contained inside a 'footer' tag with id set to 'footer'
print(soup.find('footer', id = 'footer').text)



 This page was last edited on 19 November 2024, at 17:29 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike 4.0 License;
additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Code of Conduct
Developers
Statistics
Cookie statement
Mobile view








# Practical examples

## Links - absolute path URL

In [64]:
# Let's use the variable links we defined a couple of lectures ago for this example
# It contains all the 'a' tags on this page
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a>,
 <a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a>,
 <a href="/wiki/Wikipedia:Community_portal" title="The

In [65]:
# Let's choose one link to manipulate
link = links[21]
link

<a accesskey="y" href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]"><span>Contributions</span></a>

In [66]:
# Get the link's text
link.string

'Contributions'

In [67]:
# Extract the link's URL
link['href']

'/wiki/Special:MyContributions'

This is a relative URL

To obtain the absolute URL address we will use `urljoin`

In [68]:
from urllib.parse import urljoin

need the address of the current page + the relative URL to compute the full-path URL

In [69]:
base_site

'https://en.wikipedia.org/wiki/Music'

In [70]:
relative_url = link['href']
relative_url

'/wiki/Special:MyContributions'

In [71]:
full_url = urljoin(base_site, relative_url)
full_url

'https://en.wikipedia.org/wiki/Special:MyContributions'

## Processing multiple links at once

#### we want to devicse a method to obtain all the links in bulk

In [72]:
# We will work with:
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a>,
 <a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a>,
 <a href="/wiki/Wikipedia:Community_portal" title="The

### First we need to get the 'H ref' value for every link in the list

In [73]:
[l.get('href') for l in links]  
# Note that if l['href'] was written instead of l.get('href'), this would produce an error

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wi

Notice that some links don't have URL (None appears)

In [74]:
# Dropping the links without href attribute
clean_links = [l for l in links if l.get('href') != None]

In [75]:
# Obtaining the relative URLs
relative_urls = [link.get('href') for link in clean_links]
relative_urls

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wi

In [76]:
# Transforming to absolute path URLs
full_urls = [urljoin(base_site, url) for url in relative_urls]
full_urls

['https://en.wikipedia.org/wiki/Music#bodyContent',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Wikipedia:Contents',
 'https://en.wikipedia.org/wiki/Portal:Current_events',
 'https://en.wikipedia.org/wiki/Special:Random',
 'https://en.wikipedia.org/wiki/Wikipedia:About',
 'https://en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://en.wikipedia.org/wiki/Help:Contents',
 'https://en.wikipedia.org/wiki/Help:Introduction',
 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal',
 'https://en.wikipedia.org/wiki/Special:RecentChanges',
 'https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Special:Search',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Music',
 'https://en.wikipedia.org/w/index.

In [77]:
# Extracting only URLs pointing to Wikipedia (internal URLs)
internal_links = [url for url in full_urls if 'wikipedia.org' in url]
internal_links

['https://en.wikipedia.org/wiki/Music#bodyContent',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Wikipedia:Contents',
 'https://en.wikipedia.org/wiki/Portal:Current_events',
 'https://en.wikipedia.org/wiki/Special:Random',
 'https://en.wikipedia.org/wiki/Wikipedia:About',
 'https://en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://en.wikipedia.org/wiki/Help:Contents',
 'https://en.wikipedia.org/wiki/Help:Introduction',
 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal',
 'https://en.wikipedia.org/wiki/Special:RecentChanges',
 'https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Special:Search',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Music',
 'https://en.wikipedia.org/w/index.

# Extracting data from nested tags

#### Our objective now is to extract all links that can be found under a section heading Marked as 'Main article:' or 'See also:' <br>By quick inspection, we see that these are contained in `div` tags with attribute 'role' set to 'note'

In [78]:
div_notes = soup.find_all("div", {"role": "note"})
div_notes

[<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/History_of_music" title="History of music">History of music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Further information: <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a> and <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">

In [79]:
div_notes[0]

<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>

In [80]:
# We can apply find() and find_all() to a tag in the same way we do it to the whole document
div_notes[0].find('a')

<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>

In [81]:
# A naive approach to get all links would be to use find
div_links = [div.find('a') for div in div_notes]
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/History_of_music" title="History of music">History of music</a>,
 <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a>,
 <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a>,
 <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a>,
 <a href="/wiki/Classical_music" title="Classical music">Classical music</a>,
 <a href="/wiki/Baroque_music" title="Baroque music">Baroque music</a>,
 <a href="/wiki/Classical_period_(music)" title="Classical period (music)">Classical period (music)</a>,
 <a href="/wiki/Romantic_music" title="Romantic music">Romantic music</a>,
 <a href="/wiki/20th-century_music" title="20th-century music">20th-century music</a>,
 <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a>,
 <a href="/wiki/Performance" title="Performance"

In [82]:
len(div_links)

39

### But some divs can have more than 1 link, so with `.find()` we ignored the rest and pulled just the first one

### and we can not solve that with `.find_all()` as it returns a list so the result will be list of lists 2D and we dont not want that

In [83]:
div_notes[38]

<div class="hatnote navigation-not-searchable" role="note">Main articles: <a href="/wiki/Outline_of_music" title="Outline of music">Outline of music</a> and <a href="/wiki/Index_of_music_articles" title="Index of music articles">Index of music articles</a></div>

In [84]:
div_notes[38].find_all('a')

[<a href="/wiki/Outline_of_music" title="Outline of music">Outline of music</a>,
 <a href="/wiki/Index_of_music_articles" title="Index of music articles">Index of music articles</a>]

#### that method can get us just a smiple list as we wanted

In [85]:
# Define initially empty list of links
div_links = []

for div in div_notes:
    anchors = div.find_all('a')
    
    # Need to add every link from anchors to div_links
    for a in anchors:
        div_links.append(a)
    
    # anither way: div_links.extend(anchors) instead of the for loop
    

In [86]:
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/History_of_music" title="History of music">History of music</a>,
 <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a>,
 <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a>,
 <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a>,
 <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a>,
 <a href="/wiki/Classical_music" title="Classical music">Classical music</a>,
 <a href="/wiki/Baroque_music" title="Baroque music">Baroque music</a>,
 <a href="/wiki/Classical_period_(music)" title="Classical period (music)">Classical period (music)</a>,
 <a href="/wiki/Romantic_music" title="Romantic music">Romantic music</a>,
 <a href="/wiki/20th-century_music" title="20th-century music">20th-century music</a>,
 <a href="/wiki/Musical_composition" title="Musical co

In [87]:
# We now have a complete list
len(div_links)

44

In [88]:
# Let's get the URLs
note_urls = [urljoin(base_site, l.get('href')) for l in div_links]
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Origins_of_music',
 'https://en.wikipedia.org/wiki/Prehistoric_music',
 'https://en.wikipedia.org/wiki/Ancient_music',
 'https://en.wikipedia.org/wiki/Music_of_Asia',
 'https://en.wikipedia.org/wiki/Classical_music',
 'https://en.wikipedia.org/wiki/Baroque_music',
 'https://en.wikipedia.org/wiki/Classical_period_(music)',
 'https://en.wikipedia.org/wiki/Romantic_music',
 'https://en.wikipedia.org/wiki/20th-century_music',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Performance',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Elements_of_music',
 'https://en.wikipedia.org/wiki/Pitch_(music)',
 'https://en.wikipedia.org/wiki/Melody',
 'https://en.wikipedia.org/wiki/Harmony',
 'https://en.wikipedia.org/wiki/Rhythm',
 'h

In [89]:
len(note_urls)

44

# Scraping multiple pages automatically - Extracting all the text from the note URLs

#### Now we will try to collect the main text of every page in the links

In [90]:
# We will use the links we obtained above
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Origins_of_music',
 'https://en.wikipedia.org/wiki/Prehistoric_music',
 'https://en.wikipedia.org/wiki/Ancient_music',
 'https://en.wikipedia.org/wiki/Music_of_Asia',
 'https://en.wikipedia.org/wiki/Classical_music',
 'https://en.wikipedia.org/wiki/Baroque_music',
 'https://en.wikipedia.org/wiki/Classical_period_(music)',
 'https://en.wikipedia.org/wiki/Romantic_music',
 'https://en.wikipedia.org/wiki/20th-century_music',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Performance',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Elements_of_music',
 'https://en.wikipedia.org/wiki/Pitch_(music)',
 'https://en.wikipedia.org/wiki/Melody',
 'https://en.wikipedia.org/wiki/Harmony',
 'https://en.wikipedia.org/wiki/Rhythm',
 'h

### The objective is to get all the useful text from those wikipedia pages
<ul>
<li> We will do that by extracting all text contained in a paragraph element,
<li> for all paragraphs on a page,
<li> for all pages (in note_urls)

you need to connect alot of pages in a small amount of time and then extract only the information you need

First what method will eploy to extract the main text?<br>
`.text` for divs? this would extract not only the main text but also the content, image captions, tables and others and of course we do not want that in our data

In [None]:
# initialize list to store paragraph text for each webpage
par_text = []


# creating a loop counter
i = 0

# Loop through each URL in note_urls
for url in note_urls:
    
    # connect to every webpage
    note_resp = requests.get(url)
    
    # checking if the request is successful
    if note_resp.status_code == 200:            # Everything is OK!
        print('URL #{0}: {1}'.format(i+1,url))    # print out the number of iteration and the URL to keep track of place in loop
    
    else:                                       # Something is wrong!
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, i+1, url))
        i = i+1
        continue
        
    
    # get HTML from webpage
    note_html = note_resp.content
    
    # convert HTML to BeautifulSoup object
    note_soup = BeautifulSoup(note_html, 'lxml')
    
    # find all "p" tags on the webpage
    note_pars = note_soup.find_all("p")
    
    # Get the text from each "p" tag
    text = [p.text for p in note_pars]
    
    # Append text from each "p" tag to our list, par_text
    par_text.append(text)
    
    # Incrementing the loop counter
    i = i+1


In [None]:
# Inspecting the result for the first page
par_text[0]

['Music is an art form consisting of sound and silence, expressed through time.\n',
 'Music may also refer to:\n']

In [None]:
# We see that we have a list of all paragraph strings
# It would be more useful to have all the text as one string, not as a list of strings

# Merging all paragraphs of the first page into one long string
page_text = "".join(par_text[0])
page_text

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [None]:
# Let's do that for all pages

# Merging all paragraphs for all pages
page_text = ["".join(text) for text in par_text]

# Inspect the result for some webpage
page_text[0]

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [None]:
# Inspect result
print(page_text[4])

In [None]:
# Creating a dictionary with the (key,value) pairs being (url,text)
url_to_text = dict(zip(note_urls, page_text))  # You don't need to know the specifics of these functions

In [None]:
print(url_to_text['https://en.wikipedia.org/wiki/Music_theory'])

In [None]:
# A word of caution:
# We have not extracted all of the main content's text,
# as some text may be contained in lists and tables, outside of paragraphs we scraped