### Web scraping
Web scraping is the practice of gathering data through any means other
than a program interacting with an API (or, obviously, through a human using a web
browser). 

This is most commonly accomplished by writing an automated program
that queries a web server, requests data (usually in the form of the HTML and other
files that comprise web pages), and then parses that data to extract needed
information.

In [1]:
## Read the html.html file
import os

# Join the path from its parts so the separator is correct on every OS;
# a hard-coded backslash only resolves on Windows.
path = os.path.join("required files", "html.html")

# The context manager closes the file even if read() raises, so the
# handle can no longer leak. `con` stays bound (closed) after the block.
with open(path, "r") as con:
    data = con.read()

In [2]:
print(data)

<!DOCTYPE html>

<html>
    <head>
        <title>GETTING STARTED WITH bs4</title>
    </head>
    <body>
        <div class="para 1">
            <p>
                This is paragraph one
            </p>
            <p>
                This is paragraph two
            </p>
        </div>
        <div class='para 2'>
            <p>
                This is para 1 in div 2
            </p>
        </div>
        <div class="para 1">
            <p>
                This is paragraph three of div with class para 1
            </p>
            <p>
                This is paragraph four of div with class para 1
            </p>
        </div>
    </body>
</html>


In [3]:
type(data)

str

#### Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work.

In [4]:
from bs4 import BeautifulSoup

In [5]:
# Turn the raw HTML string into a navigable parse tree. The second argument
# names the parser Beautiful Soup should use ('html.parser' is the one that
# ships with Python's standard library).
parsed_markup=BeautifulSoup(data,'html.parser')

In [6]:
type(parsed_markup)

bs4.BeautifulSoup

In [7]:
parsed_markup

<!DOCTYPE html>

<html>
<head>
<title>GETTING STARTED WITH bs4</title>
</head>
<body>
<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>
<div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>
<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>
</body>
</html>

In [8]:
parsed_markup.title

<title>GETTING STARTED WITH bs4</title>

In [9]:
type(parsed_markup.title)

bs4.element.Tag

In [10]:
# .text gives the text inside the <title> tag as a plain Python str.
parsed_markup.title.text

'GETTING STARTED WITH bs4'

In [11]:
type(parsed_markup.title.text)

str

In [12]:
# .string shows the same characters as .text here, but the object is a
# bs4 NavigableString rather than a built-in str.
parsed_markup.title.string

'GETTING STARTED WITH bs4'

In [13]:
type(parsed_markup.title.string)

bs4.element.NavigableString

In [14]:
parsed_markup.title.text.split(" ")[-1]

'bs4'

In [15]:
# Accessing a tag by name as an attribute returns only the FIRST <div> in
# the document, even though the page contains three.
parsed_markup.div

<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>

In [16]:
# find_all collects every <div> in the document (all three here), not just
# the first one.
parsed_markup.find_all("div")

[<div class="para 1">
 <p>
                 This is paragraph one
             </p>
 <p>
                 This is paragraph two
             </p>
 </div>,
 <div class="para 2">
 <p>
                 This is para 1 in div 2
             </p>
 </div>,
 <div class="para 1">
 <p>
                 This is paragraph three of div with class para 1
             </p>
 <p>
                 This is paragraph four of div with class para 1
             </p>
 </div>]

In [17]:
# The keyword is spelled class_ because `class` is a reserved word in Python.
# The string 'para 2' is compared with the full value of the class attribute,
# so only the single div written as class='para 2' is returned.
parsed_markup.find_all("div",class_='para 2')

[<div class="para 2">
 <p>
                 This is para 1 in div 2
             </p>
 </div>]

In [18]:
# Keep the divs whose class attribute is "para 1" (two of them in this page)
# so they can be indexed individually in the cells that follow.
divs=parsed_markup.find_all("div",class_='para 1')

In [19]:
divs

[<div class="para 1">
 <p>
                 This is paragraph one
             </p>
 <p>
                 This is paragraph two
             </p>
 </div>,
 <div class="para 1">
 <p>
                 This is paragraph three of div with class para 1
             </p>
 <p>
                 This is paragraph four of div with class para 1
             </p>
 </div>]

In [20]:
# find_all also works on an individual tag: the search is then limited to
# that tag's own contents, so only the two <p> tags of the first div come back.
divs[0].find_all("p")

[<p>
                 This is paragraph one
             </p>,
 <p>
                 This is paragraph two
             </p>]

In [21]:
divs[1]

<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>

In [22]:
# Print the stripped text of the second <p> inside every div whose class
# attribute is "para 1".
for section in parsed_markup.find_all("div", class_="para 1"):
    second_paragraph = section.find_all("p")[1]
    print(second_paragraph.text.strip())

This is paragraph two
This is paragraph four of div with class para 1


In [23]:
## Extracting data:
# Gather the stripped text of the second <p> of each "para 1" div into a list
# instead of printing it.
para_data = [
    section.find_all("p")[1].text.strip()
    for section in parsed_markup.find_all("div", class_="para 1")
]

In [24]:
para_data

['This is paragraph two', 'This is paragraph four of div with class para 1']

In [25]:
mydiv=parsed_markup.div

In [26]:
type(mydiv)

bs4.element.Tag

In [27]:
mydiv

<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>

In [28]:
# Indexing a tag like a dict reads one of its attributes. The class attribute
# comes back as a list of its space-separated tokens, not as the single
# string "para 1".
print(mydiv['class'])

['para', '1']


In [29]:
body=parsed_markup.body

In [30]:
body

<body>
<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>
<div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>
<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>
</body>

In [31]:
# Walk the direct children of <body> and end each one with a '***' marker.
# The bare newline strings between the tags show up as children too.
for node in body.contents:
    shown = '' if node is None else node
    print(shown, end='***\n')


***
<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>***

***
<div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>***

***
<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>***

***


In [32]:
# Iterate the direct children of <body> through .children; the printed
# result matches what iterating body.contents produces.
for node in body.children:
    print('' if node is None else node, end='***\n')


***
<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>***

***
<div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>***

***
<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>***

***


In [33]:
# Keep only the real child nodes of <body>, dropping the bare "\n" strings
# that sit between the tags.
children = [node for node in body.contents if node != '\n']
print(children)

[<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>, <div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>, <div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>]


In [34]:
# Walk every node nested anywhere under <body> (tags and text strings alike),
# labelling each with its position in the traversal. Bare "\n" strings are
# skipped, which is why the printed indices have gaps.
for index, child in enumerate(body.descendants):
    if child != '\n':
        print(index, '-', end=':')
        # No second newline test is needed here: the guard above already
        # excludes '\n', so a fallback to '\\n' could never be reached.
        print(child)

1 -:<div class="para 1">
<p>
                This is paragraph one
            </p>
<p>
                This is paragraph two
            </p>
</div>
3 -:<p>
                This is paragraph one
            </p>
4 -:
                This is paragraph one
            
6 -:<p>
                This is paragraph two
            </p>
7 -:
                This is paragraph two
            
10 -:<div class="para 2">
<p>
                This is para 1 in div 2
            </p>
</div>
12 -:<p>
                This is para 1 in div 2
            </p>
13 -:
                This is para 1 in div 2
            
16 -:<div class="para 1">
<p>
                This is paragraph three of div with class para 1
            </p>
<p>
                This is paragraph four of div with class para 1
            </p>
</div>
18 -:<p>
                This is paragraph three of div with class para 1
            </p>
19 -:
                This is paragraph three of div with class para 1
            
21 -:<p>
     