# Pandas CSS Scrape

In [1]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [2]:
# Save the provided HTML code as a Python string.
# The page holds three <div class="section"> blocks (ids: start, documentation,
# community). Each block has an <h3 class="section-header"> title and a
# <ul class="pandas-ul"> list whose <a href="..."> links wrap the <li> items.
html = """
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>

<body>
    <h1>pandas</h1>
    <blockquote>pandas is a powerful Python data analysis library. </blockquote>

    <div id="start" class="section">
        <h3 class="section-header">Getting Started</h3>
        <ul class="pandas-ul">
            <a href="https://pandas.pydata.org/getting_started.html">
                <li>Install pandas</li>
            </a>
            <a href="https://pandas.pydata.org/docs/getting_started/index.html">
                <li>Getting started</li>
            </a>
        </ul>
    </div>

    <div id="documentation" class="section">
        <h3 class="section-header">Documentation</h3>
        <ul class="pandas-ul">
            <a href="https://pandas.pydata.org/docs/user_guide/index.html">
                <li>User guide</li>
            </a>
            <a href="https://pandas.pydata.org/docs/reference/index.html">
                <li>API reference</li>
            </a>
            <a href="https://pandas.pydata.org/docs/development/index.html">
                <li>How to contribute to pandas</li>
            </a>
        </ul>
    </div>

    <div id="community" class="section">
        <h3 class="section-header">The pandas Community</h3>
        <ul class="pandas-ul">
            <a href="https://pandas.pydata.org/about/index.html">
                <li>More about pandas</li>
            </a>
            <a href="https://stackoverflow.com/questions/tagged/pandas">
                <li>Have questions?</li>
            </a>
            <a href="https://pandas.pydata.org/community/ecosystem.html">
                <li>The pandas ecosystem</li>
            </a>
        </ul>
    </div>
</body>

</html>
"""

In [3]:
# Parse the HTML string with Python's built-in parser to get a searchable tree
soup = BeautifulSoup(markup=html, features='html.parser')

In [4]:
# Use the find_all function to retrieve all of the h3-level headers
soup.find_all('h3')

[<h3 class="section-header">Getting Started</h3>,
 <h3 class="section-header">Documentation</h3>,
 <h3 class="section-header">The pandas Community</h3>]

In [5]:
# Display the text of all h3-level headers
[header.text for header in soup.find_all('h3')]

['Getting Started', 'Documentation', 'The pandas Community']

In [6]:
# Select only the first section ("Getting Started"). Use the id of this section to scrape the data
soup.find('div', id='start')

<div class="section" id="start">
<h3 class="section-header">Getting Started</h3>
<ul class="pandas-ul">
<a href="https://pandas.pydata.org/getting_started.html">
<li>Install pandas</li>
</a>
<a href="https://pandas.pydata.org/docs/getting_started/index.html">
<li>Getting started</li>
</a>
</ul>
</div>

In [7]:
# Select and display the text of the h3-level header of this section
soup.find('div', id='start').find('h3').text

'Getting Started'

In [8]:
# Select the link elements of this section
soup.find('div', id='start').find_all('a')

[<a href="https://pandas.pydata.org/getting_started.html">
 <li>Install pandas</li>
 </a>,
 <a href="https://pandas.pydata.org/docs/getting_started/index.html">
 <li>Getting started</li>
 </a>]

In [9]:
# Select and display the URLs for the link elements of this section
[link['href'] for link in soup.find('div', id='start').find_all('a')]

['https://pandas.pydata.org/getting_started.html',
 'https://pandas.pydata.org/docs/getting_started/index.html']

In [10]:
# Select and display all URLs for all link elements on the page
[link['href'] for link in soup.find_all('a')]

['https://pandas.pydata.org/getting_started.html',
 'https://pandas.pydata.org/docs/getting_started/index.html',
 'https://pandas.pydata.org/docs/user_guide/index.html',
 'https://pandas.pydata.org/docs/reference/index.html',
 'https://pandas.pydata.org/docs/development/index.html',
 'https://pandas.pydata.org/about/index.html',
 'https://stackoverflow.com/questions/tagged/pandas',
 'https://pandas.pydata.org/community/ecosystem.html']

In [11]:
# BONUS: Build a mapping from each section title to that section's URLs
pandas_info_dict = dict()

In [12]:
# Select all section div elements on the page
sections = soup.find_all('div', class_='section')

# For each section
for section in sections:
    # Extract the text of the h3-level header
    section_title = section.find('h3').text

    # Extract all URLs for all link elements of the section
    section_urls = [link['href'] for link in section.find_all('a')]

    # Add the information to the dictionary
    pandas_info_dict[section_title] = section_urls


In [13]:
# Display the dictionary of section titles and their URLs
pandas_info_dict

{'Getting Started': ['https://pandas.pydata.org/getting_started.html',
  'https://pandas.pydata.org/docs/getting_started/index.html'],
 'Documentation': ['https://pandas.pydata.org/docs/user_guide/index.html',
  'https://pandas.pydata.org/docs/reference/index.html',
  'https://pandas.pydata.org/docs/development/index.html'],
 'The pandas Community': ['https://pandas.pydata.org/about/index.html',
  'https://stackoverflow.com/questions/tagged/pandas',
  'https://pandas.pydata.org/community/ecosystem.html']}

In [14]:
# Alternatively, use a nested for loop to extract the URLs
alternate_dict = {}

# For each section
for section in soup.find_all('div', class_='section'):
    # Extract the text of the h3-level header
    section_title = section.find('h3').text

    # Select all link elements of the section
    section_links = section.find_all('a')

    # Create an empty list to store the URLs
    section_urls = []

    # For each link, extract and save the URL
    for link in section_links:
        section_urls.append(link['href'])

    # Add the information to the dictionary
    alternate_dict[section_title] = section_urls


In [15]:
# Display the alternate dictionary of section titles and their URLs
alternate_dict

{'Getting Started': ['https://pandas.pydata.org/getting_started.html',
  'https://pandas.pydata.org/docs/getting_started/index.html'],
 'Documentation': ['https://pandas.pydata.org/docs/user_guide/index.html',
  'https://pandas.pydata.org/docs/reference/index.html',
  'https://pandas.pydata.org/docs/development/index.html'],
 'The pandas Community': ['https://pandas.pydata.org/about/index.html',
  'https://stackoverflow.com/questions/tagged/pandas',
  'https://pandas.pydata.org/community/ecosystem.html']}