### If You want to scrape a website:
1. use the Api
2. HTML Web Scrapping using some tools
    

# Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://scikit-learn.org/stable/"

# Step 1: Get the HTML

In [3]:
r = requests.get(url)
htmlContent = r.content

In [4]:
htmlContent



# Step 2: Parse the HTML

In [5]:
soup = BeautifulSoup(htmlContent, 'html.parser')

In [6]:
soup


<!DOCTYPE html>

<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="scikit-learn: machine learning in Python" name="Description"/>
<title>scikit-learn: machine learning in Python — scikit-learn 0.22 documentation</title>
<link href="http://scikit-learn.org/stable/index.html" rel="canonical"/>
<link href="_static/favicon.ico" rel="shortcut icon"/>
<link href="_static/css/vendor/bootstrap.min.css" rel="stylesheet" type="text/css"/>
<link href="_static/gallery.css" rel="stylesheet" type="text/css"/>
<link href="_static/css/theme.css" rel="stylesheet" type="text/css"/>
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
</head>
<body>
<nav class="sk-landing-navbar navbar navbar-expand-md navbar-li

In [7]:
soup.prettify

<bound method Tag.prettify of 
<!DOCTYPE html>

<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="scikit-learn: machine learning in Python" name="Description"/>
<title>scikit-learn: machine learning in Python — scikit-learn 0.22 documentation</title>
<link href="http://scikit-learn.org/stable/index.html" rel="canonical"/>
<link href="_static/favicon.ico" rel="shortcut icon"/>
<link href="_static/css/vendor/bootstrap.min.css" rel="stylesheet" type="text/css"/>
<link href="_static/gallery.css" rel="stylesheet" type="text/css"/>
<link href="_static/css/theme.css" rel="stylesheet" type="text/css"/>
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
<script src="_static/jquery.js"></script>
</head>
<body>
<nav class="sk-landing-navbar nav

# step 3: HTML Tree traversal

### Commonly used types of object..
1. Tag
2. NavigableString
3. BeautifulSoup
4. Comment

In [8]:
# 1. Tag
title = soup.title
type(title)

bs4.element.Tag

In [9]:
# 2. NavigableString
type(title.string)

bs4.element.NavigableString

In [10]:
# 3. BeautifulSoup
type(soup)

bs4.BeautifulSoup

### Get the title of HTML page

In [11]:
title = soup.title
title

<title>scikit-learn: machine learning in Python — scikit-learn 0.22 documentation</title>

### Get all the paragraphs from the page

In [12]:
paras = soup.find_all('p')
paras

[<p class="card-text">Identifying which category an object belongs to.</p>,
 <p class="card-text"><strong>Applications:</strong> Spam detection, image recognition.</p>,
 <p class="card-text">Predicting a continuous-valued attribute associated with an object.</p>,
 <p class="card-text"><strong>Applications:</strong> Drug response, Stock prices.</p>,
 <p class="card-text">Automatic grouping of similar objects into sets.</p>,
 <p class="card-text"><strong>Applications:</strong> Customer segmentation, Grouping experiment outcomes</p>,
 <p class="card-text">Reducing the number of random variables to consider.</p>,
 <p class="card-text"><strong>Applications:</strong> Visualization, Increased efficiency</p>,
 <p class="card-text">Comparing, validating and choosing parameters and models.</p>,
 <p class="card-text"><strong>Applications:</strong> Improved accuracy via parameter tuning</p>,
 <p class="card-text">Feature extraction and normalization.</p>,
 <p class="card-text"><strong>Applications

### Get all the anchor tags from the page

In [13]:
anchors = soup.find_all('a')
anchors

[<a class="navbar-brand py-0" href="#">
 <img alt="logo" class="sk-brand-img" src="_static/scikit-learn-logo-small.png"/>
 </a>,
 <a class="sk-nav-link nav-link" href="install.html">Install</a>,
 <a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>,
 <a class="sk-nav-link nav-link" href="modules/classes.html">API</a>,
 <a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="getting_started.html">Getting Started</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="tutorial/index.html">Tutorial</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="glossary.html">Glossary</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="developers/index.html">Development</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="faq.html">FAQ</a>,
 <a class="sk-nav-link nav-link nav-more-item-mobile-items" href="related_projects.html"

### Get first element in the  HTML page

In [14]:
soup.find('p')

<p class="card-text">Identifying which category an object belongs to.</p>

### Get the classes of any elements int he HTML page

In [15]:
soup.find('p')['class']

['card-text']

### Find all the elements with the class lead

In [16]:
print(soup.find_all('p', class_ = "card-text"))

[<p class="card-text">Identifying which category an object belongs to.</p>, <p class="card-text"><strong>Applications:</strong> Spam detection, image recognition.</p>, <p class="card-text">Predicting a continuous-valued attribute associated with an object.</p>, <p class="card-text"><strong>Applications:</strong> Drug response, Stock prices.</p>, <p class="card-text">Automatic grouping of similar objects into sets.</p>, <p class="card-text"><strong>Applications:</strong> Customer segmentation, Grouping experiment outcomes</p>, <p class="card-text">Reducing the number of random variables to consider.</p>, <p class="card-text"><strong>Applications:</strong> Visualization, Increased efficiency</p>, <p class="card-text">Comparing, validating and choosing parameters and models.</p>, <p class="card-text"><strong>Applications:</strong> Improved accuracy via parameter tuning</p>, <p class="card-text">Feature extraction and normalization.</p>, <p class="card-text"><strong>Applications:</strong> 

### Get text from the tags/soup

In [17]:
print(soup.find('p').get_text())

Identifying which category an object belongs to.


In [18]:
print(soup.get_text())




  




scikit-learn: machine learning in Python — scikit-learn 0.22 documentation




















Install


User Guide


API


Examples


Getting Started


Tutorial


Glossary


Development


FAQ


Related packages


Roadmap


About us


GitHub


Other Versions


More

Getting Started
Tutorial
Glossary
Development
FAQ
Related packages
Roadmap
About us
GitHub
Other Versions


















scikit-learn
Machine Learning in Python
Getting Started
What's New in 0.22
GitHub



Simple and efficient tools for predictive data analysis
Accessible to everybody, and reusable in various contexts
Built on NumPy, SciPy, and matplotlib
Open source, commercially usable - BSD license










Classification
Identifying which category an object belongs to.
Applications: Spam detection, image recognition.
Algorithms:
SVM,
          nearest neighbors,
          random forest,
          and more...






Examples





Regression
Predicting a continuous-valued attribute associated with an object.


### Get all the links on the page

In [19]:
all_links = set()
for link in anchors:
    if(link.get('href') != '#'):
#         linkText = "https://scikit-learn.org/stable/" + link.get('href')
        all_links.add(link)
        print(all_links)

{<a class="sk-nav-link nav-link" href="install.html">Install</a>}
{<a class="sk-nav-link nav-link" href="install.html">Install</a>, <a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>}
{<a class="sk-nav-link nav-link" href="install.html">Install</a>, <a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>, <a class="sk-nav-link nav-link" href="modules/classes.html">API</a>}
{<a class="sk-nav-link nav-link" href="install.html">Install</a>, <a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>, <a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>, <a class="sk-nav-link nav-link" href="modules/classes.html">API</a>}
{<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="getting_started.html">Getting Started</a>, <a class="sk-nav-link nav-link" href="install.html">Install</a>, <a class="sk-nav-link nav-link" href="modules/classes.html">API</a>, <a class="sk-nav-link nav-link" href="user_guide.html">User Guid

### 4. comment

In [20]:
markup = "<p><!-- this is a comment --></p>"
soup2 = BeautifulSoup(markup)
soup2

<html><body><p><!-- this is a comment --></p></body></html>

In [21]:
print(soup2.p)

<p><!-- this is a comment --></p>


In [22]:
print(soup2.p.string)

 this is a comment 


In [23]:
print(type(soup2.p.string))

<class 'bs4.element.Comment'>


.contents -> A tag's children are available as a list<br>
.children -> A tag's children are available as a genrator

In [24]:
navbarSupportedContent = soup.find(id = 'navbarSupportedContent')

navbarSupportedContent.children

<list_iterator at 0x233fdc9df60>

In [25]:
print(navbarSupportedContent.contents)

['\n', <ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="sk-nav-link nav-link" href="install.html">Install</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="modules/classes.html">API</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="getting_started.html">Getting Started</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="tutorial/index.html">Tutorial</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="glossary.html">Glossary</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="developers/index.html">Development</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link na

In [26]:
for elem in navbarSupportedContent.contents:
    print(elem)



<ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="sk-nav-link nav-link" href="install.html">Install</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="modules/classes.html">API</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="getting_started.html">Getting Started</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="tutorial/index.html">Tutorial</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="glossary.html">Glossary</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href="developers/index.html">Development</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-mor

In [27]:
for item in navbarSupportedContent.strings:
    print(item)







Install






User Guide






API






Examples






Getting Started






Tutorial






Glossary






Development






FAQ






Related packages






Roadmap






About us






GitHub






Other Versions






More




Getting Started


Tutorial


Glossary


Development


FAQ


Related packages


Roadmap


About us


GitHub


Other Versions


























In [28]:
for item in navbarSupportedContent.stripped_strings:
    print(item)

Install
User Guide
API
Examples
Getting Started
Tutorial
Glossary
Development
FAQ
Related packages
Roadmap
About us
GitHub
Other Versions
More
Getting Started
Tutorial
Glossary
Development
FAQ
Related packages
Roadmap
About us
GitHub
Other Versions


In [29]:
print(navbarSupportedContent.parent)

<div class="container-fluid sk-landing-container px-0">
<a class="navbar-brand py-0" href="#">
<img alt="logo" class="sk-brand-img" src="_static/scikit-learn-logo-small.png"/>
</a>
<button aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarSupportedContent" data-toggle="collapse" id="sk-navbar-toggler" type="button">
<span class="navbar-toggler-icon"></span>
</button>
<div class="sk-navbar-collapse collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="sk-nav-link nav-link" href="install.html">Install</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="modules/classes.html">API</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>
</li>
<li class="nav-item">
<a class="sk-n

In [30]:
for par in navbarSupportedContent.parent:
    print(par)



<a class="navbar-brand py-0" href="#">
<img alt="logo" class="sk-brand-img" src="_static/scikit-learn-logo-small.png"/>
</a>


<button aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarSupportedContent" data-toggle="collapse" id="sk-navbar-toggler" type="button">
<span class="navbar-toggler-icon"></span>
</button>


<div class="sk-navbar-collapse collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav mr-auto">
<li class="nav-item">
<a class="sk-nav-link nav-link" href="install.html">Install</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="user_guide.html">User Guide</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="modules/classes.html">API</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link" href="auto_examples/index.html">Examples</a>
</li>
<li class="nav-item">
<a class="sk-nav-link nav-link nav-more-item-mobile-items" href=

In [31]:
for par in navbarSupportedContent.parents:
    print(par.name)

div
nav
body
html
[document]


In [32]:
print(navbarSupportedContent.next_sibling.next_sibling)

None


In [33]:
print(navbarSupportedContent.previous_sibling.previous_sibling)

<button aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation" class="navbar-toggler" data-target="#navbarSupportedContent" data-toggle="collapse" id="sk-navbar-toggler" type="button">
<span class="navbar-toggler-icon"></span>
</button>


In [34]:
elem = soup.select('.d-flex')
print(elem)

[<div class="col-md-6 d-flex">
<ul class="sk-landing-header-body">
<li>Simple and efficient tools for predictive data analysis</li>
<li>Accessible to everybody, and reusable in various contexts</li>
<li>Built on NumPy, SciPy, and matplotlib</li>
<li>Open source, commercially usable - BSD license</li>
</ul>
</div>]


In [35]:
elem = soup.select('.col-md-6')
print(elem)

[<div class="col-md-6 mb-3 mb-md-0">
<h1 class="sk-landing-header text-white text-monospace">scikit-learn</h1>
<h4 class="sk-landing-subheader text-white font-italic mb-3">Machine Learning in Python</h4>
<a class="btn sk-landing-btn mb-1" href="getting_started.html" role="button">Getting Started</a>
<a class="btn sk-landing-btn mb-1" href="whats_new/v0.22.html" role="button">What's New in 0.22</a>
<a class="btn sk-landing-btn mb-1" href="https://github.com/scikit-learn/scikit-learn" role="button">GitHub</a>
</div>, <div class="col-md-6 d-flex">
<ul class="sk-landing-header-body">
<li>Simple and efficient tools for predictive data analysis</li>
<li>Accessible to everybody, and reusable in various contexts</li>
<li>Built on NumPy, SciPy, and matplotlib</li>
<li>Open source, commercially usable - BSD license</li>
</ul>
</div>]
