In [1]:
import requests
# Fetch the page. Always pass a timeout: without one, requests waits
# indefinitely on an unresponsive server and the notebook kernel hangs.
url = 'https://www.codingdojo.com'
response = requests.get(url, timeout=10)  # seconds
# Show the type of object returned by requests.get
type(response)

requests.models.Response

In [2]:
# Obtain the HTTP status code of the response (200 means the request succeeded)
response.status_code

200

In [3]:
# Report whether the server answered with HTTP 200 (OK)
print("Success!" if response.status_code == 200 else "An error occurred.")

Success!


In [4]:
# Obtain the headers from the response; convert the headers mapping to a
# string and keep only the first 100 characters so the output stays short
str(response.headers)[:100]

"{'Date': 'Fri, 19 Apr 2024 04:57:38 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Enco"

In [5]:
# Obtain the final URL of the response; it can differ from the URL that was
# requested (here a trailing slash was added)
response.url

'https://www.codingdojo.com/'

In [6]:
# Obtain the text encoding associated with the response
response.encoding

'UTF-8'

In [7]:
# Obtain the raw body of the response as bytes; slice to preview only the
# first 500 bytes instead of dumping the whole page
response.content[:500]

b'<!DOCTYPE html>\n<html lang="en">\n    <head>\n        <title>Coding Dojo - Software Development Coding Bootcamp</title>\n        <link rel="shortcut icon" href="https://cutecdn.codingdojo.com/images/global/cd_profile_icon_favicon.ico" type="image/x-icon">\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n        <meta name="viewport" content="width=device-width, initial-scale=1 minimum-scale=1.0">\n        <meta name="description" property="og:description" '

In [8]:
# Creating a Soup Object

from bs4 import BeautifulSoup
# Use BeautifulSoup to parse the HTML content. Name the parser explicitly:
# when it is omitted, bs4 picks whichever parser happens to be installed
# (lxml, html5lib, or the built-in one), so results can differ between
# machines. 'html.parser' ships with Python and needs no extra install.
soup = BeautifulSoup(response.content, 'html.parser')
# Using .prettify() to view the parsed HTML structure in a readable format
# (only the first 2000 characters, to keep the output manageable)
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Coding Dojo - Software Development Coding Bootcamp
  </title>
  <link href="https://cutecdn.codingdojo.com/images/global/cd_profile_icon_favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1 minimum-scale=1.0" name="viewport"/>
  <meta content="Interested in starting your new career in tech? Learn to code at one of the top coding bootcamps. Build your first website and app in a matter of weeks. We teach Python, Java, .NET, iOS, Javascript and more. Online learning options and scholarships available. Learn more today!" name="description" property="og:description"/>
  <meta content="Coding Dojo - Software Development Coding Bootcamp" name="title" property="og:title"/>
  <meta content="" name="author"/>
  <link href="https://www.codingdojo.com/" rel="canonical"/>
  <link href="https://bat.bing" rel="d

In [9]:
## Navigating the Parsed Tree
# 1. Accessing tags: using a tag name as an attribute of the soup
#    returns the first tag with that name in the document

soup.title

<title>Coding Dojo - Software Development Coding Bootcamp</title>

In [10]:
# Access the <head> tag, including everything nested inside it
soup.head

<head>
<title>Coding Dojo - Software Development Coding Bootcamp</title>
<link href="https://cutecdn.codingdojo.com/images/global/cd_profile_icon_favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1 minimum-scale=1.0" name="viewport"/>
<meta content="Interested in starting your new career in tech? Learn to code at one of the top coding bootcamps. Build your first website and app in a matter of weeks. We teach Python, Java, .NET, iOS, Javascript and more. Online learning options and scholarships available. Learn more today!" name="description" property="og:description"/>
<meta content="Coding Dojo - Software Development Coding Bootcamp" name="title" property="og:title"/>
<meta content="" name="author"/>
<link href="https://www.codingdojo.com/" rel="canonical"/>
<link href="https://bat.bing" rel="dns-prefetch"/>
<link href="https://a.omappapi.com" rel="dns-p

In [11]:
# Preview the first 1000 characters of the pretty-printed <body> tag
print(soup.body.prettify()[:1000])

<body class="home main-content-wrap">
 <!-- Google Tag Manager (noscript) -->
 <noscript>
  <iframe height="0" src="https://metrics.codingdojo.com/ns.html?id=GTM-PPFN88D" style="display:none;visibility:hidden" width="0">
  </iframe>
 </noscript>
 <!-- End Google Tag Manager (noscript) -->
 <!-- New Admission Process 2.0 -->
 <div class="header_block home" id="header_box">
  <div class="" id="main_nav_container">
   <ul id="user_type_list">
    <li>
     <span class="dropdown_title enterprise">
      For Enterprise
     </span>
     <div class="dropdown_menu" id="enterprise_list">
      <ul>
       <li>
        <h4>
         FOR BUSINESS
        </h4>
        <div>
         <a href="/corporate-training">
          Corporate Training
         </a>
        </div>
       </li>
      </ul>
     </div>
    </li>
    <li>
     <a href="https://login.codingdojo.com/" target="_blank">
      Student Login
     </a>
    </li>
   </ul>
   <div class="main_nav_block">
    <a class="cd_logo uses_spr

In [12]:
# 2. Accessing tag content: .text gives the text inside the tag,
#    without the surrounding markup

print(soup.title.text)

Coding Dojo - Software Development Coding Bootcamp


In [14]:
# 3. Accessing a tag attribute: index the tag like a dictionary.
#    soup.a is the first <a> tag in the document.

soup.a['href']

'/corporate-training'

In [15]:
# 4. Searching the tree

# find_all collects every matching tag (here, every <h2>) into a list-like result
found_h2_tags = soup.find_all('h2')
# Number of <h2> tags found on the page
len(found_h2_tags)

8

In [16]:
# Display the <h2> tags collected in the previous cell
found_h2_tags

[<h2>Learn digital skills to enhance your career path</h2>,
 <h2>Redefine Possible</h2>,
 <h2>Discover our Bootcamps</h2>,
 <h2>Have questions?</h2>,
 <h2>Frequently Asked Questions</h2>,
 <h2>Download Our Master Course Packet</h2>,
 <h2>Redefine Your Possible</h2>,
 <h2>Reservation complete</h2>]

**Task**

Now that we have introduced the basics of `requests` and `BeautifulSoup`, we will demonstrate how to extract all of the titles (the `h1` and `h2` headings) from a web page. We will use a simple website that was designed for practicing web scraping; it contains fake job listings.

In [18]:
# Make request. Pass a timeout so an unresponsive server cannot hang the
# notebook: without one, requests waits indefinitely.
url = 'https://realpython.github.io/fake-jobs/'
response = requests.get(url, timeout=10)  # seconds
# Check if the request was successful
if response.status_code == 200:
    print("Success!")
else:
    print("An error occurred.")

Success!


In [19]:
# Create soup object with the response content. The parser is named
# explicitly so the parsed tree does not depend on which optional parsers
# are installed; 'html.parser' is the one bundled with Python.
soup = BeautifulSoup(response.content, 'html.parser')
# Preview content (first 500 characters of the pretty-printed <body>)
print(soup.body.prettify()[:500])

<body>
 <section class="section">
  <div class="container mb-5">
   <h1 class="title is-1">
    Fake Python
   </h1>
   <p class="subtitle is-3">
    Fake Jobs for Your Web Scraping Journey
   </p>
  </div>
  <div class="container">
   <div class="columns is-multiline" id="ResultsContainer">
    <div class="column is-half">
     <div class="card">
      <div class="card-content">
       <div class="media">
        <div class="media-left">
         <figure class="image is-48x48">
          <img a


In [20]:
# Extract and print all titles (every <h1> and <h2> on the page).
# The page's markup puts newlines and indentation around some heading text,
# so strip the extracted text to print each title cleanly on one line.
for title in soup.find_all(['h1', 'h2']):
    print(title.get_text().strip())


        Fake Python
      
Senior Python Developer
Energy engineer
Legal executive
Fitness centre manager
Product manager
Medical technical officer
Physiological scientist
Textile designer
Television floor manager
Waste management officer
Software Engineer (Python)
Interpreter
Architect
Meteorologist
Audiological scientist
English as a second language teacher
Surgeon
Equities trader
Newspaper journalist
Materials engineer
Python Programmer (Entry-Level)
Product/process development scientist
Scientist, research (maths)
Ecologist
Materials engineer
Historic buildings inspector/conservation officer
Data scientist
Psychiatrist
Structural engineer
Immigration officer
Python Programmer (Entry-Level)
Neurosurgeon
Broadcast engineer
Make
Nurse, adult
Air broker
Editor, film/video
Production assistant, radio
Engineer, communications
Sales executive
Software Developer (Python)
Futures trader
Tour manager
Cytogeneticist
Designer, multimedia
Trade union research officer
Chemist, analytical
Progra