In [1]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML

In [2]:
# Get the html from the page
url = "https://realpython.github.io/fake-jobs/"
# requests has no default timeout, so without one a stalled server would
# block this cell forever.
response = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()
print(response.status_code)
response.ok

200


True

In [3]:
# Make a beautiful soup
# Name the parser explicitly: when it is omitted, Beautiful Soup picks the
# "best" parser installed on the machine (and warns about it), so the parsed
# tree can differ from one environment to another.
soup = BeautifulSoup(response.content, "html.parser")
# Preview only the first 500 characters of the indented markup
print(soup.prettify()[:500])

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="c


In [4]:
# Collect every <div class="card-content"> on the page (one per job listing)
found_job_divs = soup.find_all("div", class_="card-content")
# How many listings were found
len(found_job_divs)

100

In [5]:
# Take the first job card as a sample to work out the extraction code on;
# the bare name on the last line displays its HTML.
job_div = found_job_divs[0]
job_div

<div class="card-content">
<div class="media">
<div class="media-left">
<figure class="image is-48x48">
<img alt="Real Python Logo" src="https://files.realpython.com/media/real-python-logo-thumbnail.7f0db70c2ed2.jpg?__no_cf_polish=1"/>
</figure>
</div>
<div class="media-content">
<h2 class="title is-5">Senior Python Developer</h2>
<h3 class="subtitle is-6 company">Payne, Roberts and Davis</h3>
</div>
</div>
<div class="content">
<p class="location">
        Stewartbury, AA
      </p>
<p class="is-small has-text-grey">
<time datetime="2021-04-08">2021-04-08</time>
</p>
</div>
<footer class="card-footer">
<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>
<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>
</footer>
</div>

In [6]:
# Render the sample card's markup as HTML in the notebook to preview it
# (HTML is imported with the other imports in the first cell).
HTML(str(job_div))

In [7]:
# Locate the first <h2> carrying the "title" class
title_tag = job_div.find('h2', class_='title')
title_tag

<h2 class="title is-5">Senior Python Developer</h2>

In [8]:
# Pull the job title string out of the tag
title = title_tag.get_text()
title

'Senior Python Developer'

In [9]:
# Company name: text of the <h3> that has class="company"
company_tag_text = None  # placeholder removed below; kept out of the namespace
del company_tag_text
company_name = job_div.find('h3', class_='company').get_text()
company_name

'Payne, Roberts and Davis'

In [10]:
# Match on class alone (any tag name) to find the location element
location = job_div.find(class_='location')
# Raw text still carries the surrounding newlines and indentation
location.get_text()

'\n        Stewartbury, AA\n      '

In [11]:
# Trim the leading/trailing whitespace from the location text
location_name = location.get_text().strip()
location_name

'Stewartbury, AA'

In [12]:
# The posting date lives in the card's <time> tag
time_tag = job_div.find(name='time')
time_tag

<time datetime="2021-04-08">2021-04-08</time>

In [13]:
# Date posted = the text inside the <time> tag
date_posted = time_tag.get_text()
date_posted

'2021-04-08'

In [14]:
# Every <a> in the card that actually has an href attribute
links = job_div.find_all('a', attrs={'href': True})
links

[<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>,
 <a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>]

In [17]:
# Attribute dictionary of the first link found in the card
links[0].attrs

{'href': 'https://www.realpython.com',
 'target': '_blank',
 'class': ['card-footer-item']}

In [16]:
# Attribute dictionary of the second link found in the card
links[1].attrs

{'href': 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html',
 'target': '_blank',
 'class': ['card-footer-item']}

In [18]:
# Show the visible label of each of the two links, one per line
print(links[0].get_text(), links[1].get_text(), sep="\n")

Learn
Apply


In [19]:
# Select the <a> whose string is exactly "Learn"
learn_tag = job_div.find(name="a", string='Learn')
learn_tag

<a class="card-footer-item" href="https://www.realpython.com" target="_blank">Learn</a>

In [20]:
# Keep the link target (href) from the "Learn" a-tag
learn_url = learn_tag['href']
learn_url

'https://www.realpython.com'

In [21]:
# Select the <a> whose string is exactly "Apply"
apply_tag = job_div.find(name="a", string='Apply')
apply_tag

<a class="card-footer-item" href="https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html" target="_blank">Apply</a>

In [23]:
# Keep the link target (href) from the "Apply" a-tag
apply_url = apply_tag['href']
apply_url

'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

In [25]:
# Everything needed to turn ONE job card (job_div) into a dictionary

# --- Text fields -----------------------------------------------------------
# Job title: <h2 class="title ...">
title = job_div.find('h2', class_='title').get_text()

# Company: <h3 class="... company">
company_name = job_div.find('h3', class_='company').get_text()

# Location: whichever tag has class="location", whitespace trimmed
location_name = job_div.find(class_='location').get_text().strip()

# Date posted: text of the <time> tag
date_posted = job_div.find(name='time').get_text()

# --- Links -----------------------------------------------------------------
# The "Learn" a-tag and the URL it points to
learn_tag = job_div.find(name="a", string='Learn')
learn_url = learn_tag['href']

# The "Apply" a-tag and the URL it points to
apply_tag = job_div.find(name="a", string='Apply')
apply_url = apply_tag['href']

# --- Assemble the record ---------------------------------------------------
job_info = {
    'Title': title,
    "Company": company_name,
    "Location": location_name,
    "Date Posted": date_posted,
    "Learn Link": learn_url,
    "Apply Link": apply_url,
}
job_info

{'Title': 'Senior Python Developer',
 'Company': 'Payne, Roberts and Davis',
 'Location': 'Stewartbury, AA',
 'Date Posted': '2021-04-08',
 'Learn Link': 'https://www.realpython.com',
 'Apply Link': 'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'}

In [27]:
def extract_job_info(card):
    """Build one record (a dict) from a single job-card tag.

    Parameters
    ----------
    card : bs4 Tag
        One of the ``card-content`` divs collected in ``found_job_divs``.

    Returns
    -------
    dict
        Keys ``Title``, ``Company``, ``Location``, ``Date Posted``,
        ``Learn Link`` and ``Apply Link``. A value is ``None`` when the
        corresponding tag (or its ``href``) is missing from the card, so one
        malformed listing does not abort the whole scrape.
    """

    def text_of(tag):
        # find() returns None when nothing matches; pass that through
        return tag.text if tag is not None else None

    def href_of(tag):
        # Tolerate both a missing <a> and an <a> without an href
        return tag.attrs.get('href') if tag is not None else None

    # The location text is surrounded by newlines/indentation in the markup
    location_text = text_of(card.find(attrs={'class': 'location'}))

    return {
        # h2 that has the title class
        'Title': text_of(card.find('h2', {'class': 'title'})),
        # h3 with class=company
        "Company": text_of(card.find('h3', {'class': 'company'})),
        "Location": location_text.strip() if location_text is not None else None,
        # text from the time tag
        "Date Posted": text_of(card.find('time')),
        # href of the a-tags whose string is Learn / Apply
        "Learn Link": href_of(card.find("a", string='Learn')),
        "Apply Link": href_of(card.find("a", string='Apply')),
    }


# List of dictionaries, one per job card. Doing the extraction inside a
# function keeps the sample `job_div` and the single-job variables defined in
# earlier cells from being overwritten by the last listing.
all_jobs_data = [extract_job_info(card) for card in found_job_divs]

In [28]:
# Convert the list of dictionaries to a dataframe
# (each dictionary becomes a row, each key a column)
jobs_df = pd.DataFrame(all_jobs_data)
# Preview the first rows
jobs_df.head()

Unnamed: 0,Title,Company,Location,Date Posted,Learn Link,Apply Link
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/pr...


In [29]:
# Saving the Results:
output_path = Path("Data/fake-job-listings-scraped.csv")
# to_csv does not create missing folders, so make sure Data/ exists first
# (otherwise a fresh checkout fails on this last cell).
output_path.parent.mkdir(parents=True, exist_ok=True)
jobs_df.to_csv(output_path, index=False)