In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Read the previously scraped listings (point csv_path at your own copy)
csv_path = 'Data/fake-job-listings-scraped.csv'
jobs_df = pd.read_csv(csv_path)
jobs_df.head()

Unnamed: 0,Title,Company,Location,Date Posted,Learn Link,Apply Link
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/se...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/en...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/le...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/fi...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/pr...


In [3]:
# Pull out the column holding each job's detail-page URL
links = jobs_df.loc[:, 'Apply Link']
links.iloc[:5]

0    https://realpython.github.io/fake-jobs/jobs/se...
1    https://realpython.github.io/fake-jobs/jobs/en...
2    https://realpython.github.io/fake-jobs/jobs/le...
3    https://realpython.github.io/fake-jobs/jobs/fi...
4    https://realpython.github.io/fake-jobs/jobs/pr...
Name: Apply Link, dtype: object

In [4]:
# Take the first URL (index label 0) as a page to experiment on
link = links.loc[0]
link

'https://realpython.github.io/fake-jobs/jobs/senior-python-developer-0.html'

In [5]:
# Fetch the test job page. The timeout keeps a stalled connection from
# hanging the kernel indefinitely (requests waits forever by default).
response = requests.get(link, timeout=10)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page later on
response.raise_for_status()
# Short courtesy pause before sending any further requests to the site
time.sleep(2)

In [9]:
# Parse the raw response bytes into a BeautifulSoup tree. Naming the parser
# explicitly avoids bs4's GuessedAtParserWarning and keeps the result
# independent of which optional parsers happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')
# Preview the first 1000 characters of the indented markup
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Fake Python
  </title>
  <link href="https://cdn.jsdelivr.net/npm/bulma@0.9.2/css/bulma.min.css" rel="stylesheet"/>
 </head>
 <body>
  <section class="section">
   <div class="container mb-5">
    <h1 class="title is-1">
     Fake Python
    </h1>
    <p class="subtitle is-3">
     Fake Jobs for Your Web Scraping Journey
    </p>
   </div>
   <div class="container">
    <div class="columns is-multiline" id="ResultsContainer">
     <div class="box">
      <h1 class="title is-2">
       Senior Python Developer
      </h1>
      <h2 class="subtitle is-4 company">
       Payne, Roberts and Davis
      </h2>
      <div class="content">
       <p>
        Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational

In [13]:
from IPython.display import HTML

# Render the parsed page inline so its layout can be inspected visually
page_markup = str(soup)
HTML(page_markup)

In [15]:
# Collect every <p> element on the page and count how many there are
p_tags = soup.find_all(name='p')
len(p_tags)

4

In [16]:
# Show every matched <p> element so their attributes and text can be compared
p_tags

[<p class="subtitle is-3">
         Fake Jobs for Your Web Scraping Journey
       </p>,
 <p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.</p>,
 <p id="location"><strong>Location:</strong> Stewartbury, AA</p>,
 <p id="date"><strong>Posted:</strong> 2021-04-08</p>]

In [18]:
# Narrow the search to <p> tags that carry neither an id nor a class
no_id_no_class = {'id':"",'class':''}
p_tags = soup.find_all('p', attrs=no_id_no_class)
len(p_tags)

1

In [19]:
# Show the first filtered <p> to confirm it contains the job description
p_tags[0]

<p>Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.</p>

In [20]:
# Keep only the text content of the description paragraph (markup stripped)
description_tag = p_tags[0]
job_details = description_tag.get_text()
job_details

'Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.'

In [21]:
# Links to retrieve
links = jobs_df['Apply Link']

# Collected descriptions: exactly one entry per link, in the same order, so the
# list can later be assigned straight onto jobs_df as a new column
job_details_list = []

for link in links:
    # Reset on every iteration so a request that fails before any reply arrives
    # never reports the status code left over from the previous page
    response = None
    try:
        # The timeout keeps one stalled connection from hanging the whole loop
        response = requests.get(link, timeout=10)
        # Treat 4xx/5xx replies as failures instead of parsing an error page
        response.raise_for_status()

        # Make the soup and find the description: the <p> with no id and no class
        soup = BeautifulSoup(response.content, 'html.parser')
        p_tag = soup.find_all('p', {'id': "", 'class': ''})

        # Appending the job details (raises IndexError when no such <p> exists)
        job_details_list.append(p_tag[0].text)

    # Catch only the expected failures; a bare except would also swallow
    # KeyboardInterrupt and make a long scrape impossible to stop
    except (requests.RequestException, IndexError) as err:
        status = response.status_code if response is not None else "no response"
        print(f"Something went wrong with {link}. Status code: {status} ({err!r})")
        # Appending a null value so the list stays aligned with the dataframe rows
        job_details_list.append(pd.NA)

    # 1-sec pause between requests, applied after failures as well
    time.sleep(1)

job_details_list[:5]

['Professional asset web application environmentally friendly detail-oriented asset. Coordinate educational dashboard agile employ growth opportunity. Company programs CSS explore role. Html educational grit web application. Oversea SCRUM talented support. Web Application fast-growing communities inclusive programs job CSS. Css discussions growth opportunity explore open-minded oversee. Css Python environmentally friendly collaborate inclusive role. Django no experience oversee dashboard environmentally friendly willing to learn programs. Programs open-minded programs asset.',
 'Party prevent live. Quickly candidate change although. Together type music hospital. Every speech support time operation wear often.',
 'Administration even relate head color. Staff beyond chair recently and off. Own available buy country store build before. Already against which continue. Look road article quickly. International big employee determine positive go Congress. Level others record hospital employee

In [22]:
# Attach the scraped descriptions as a new column, aligned row-for-row
details_column = 'Details'
jobs_df[details_column] = job_details_list
jobs_df.head(5)

Unnamed: 0,Title,Company,Location,Date Posted,Learn Link,Apply Link,Details
0,Senior Python Developer,"Payne, Roberts and Davis","Stewartbury, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/se...,Professional asset web application environment...
1,Energy engineer,Vasquez-Davidson,"Christopherville, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/en...,Party prevent live. Quickly candidate change a...
2,Legal executive,"Jackson, Chambers and Levy","Port Ericaburgh, AA",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/le...,Administration even relate head color. Staff b...
3,Fitness centre manager,Savage-Bradley,"East Seanview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/fi...,Tv program actually race tonight themselves tr...
4,Product manager,Ramirez Inc,"North Jamieview, AP",2021-04-08,https://www.realpython.com,https://realpython.github.io/fake-jobs/jobs/pr...,Traditional page a although for study anyone. ...


In [27]:
# Write the enriched listings to a new file, leaving the row index out
output_path = "Data/fake-job-listings-scraped-v2.csv"
jobs_df.to_csv(output_path, index=False)