# Load the Libraries and Packages

In [6]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import json
import re
from selenium.webdriver.common.by import By

# Web Scraping

## Initialize Driver and Beautiful Soup

In [19]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Initialize the driver
driver = webdriver.Chrome()

# Initialize base url
base_url = 'https://rbc.wd3.myworkdayjobs.com/en-US/RBCEARLYTALENT1'

# Navigate to the URL
driver.get(base_url)


## Find the nested links in the Job Postings

In [20]:
# Initialize an empty list to store job links
job_links = []

# Initialize page counter
page = 1

while True:  # Loop to navigate through each page
    # Optionally, wait for the page to load completely
    time.sleep(5)
    
    # Fetch the page source
    page_source = driver.page_source
    
    # Initialize Beautiful Soup
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Find job postings
    job_postings = soup.find_all("li", {"class": "css-1q2dra3"})
    for job in job_postings:
        link_tag = job.find("a", {"data-automation-id": "jobTitle"})
        if link_tag:
            # Remove the redundant part from href before appending, or else the website cannot be reached
            cleaned_href = link_tag['href'].replace('/en-US/RBCEARLYTALENT1', '')
            job_links.append(base_url + cleaned_href)

    # Find the next page button and click it to automate extraction
    try:
        next_button = driver.find_element(By.XPATH, f"//button[@aria-label='page {page + 1}']")
        next_button.click()
        page += 1
    # Page reaches limit or that errors occured
    except Exception as e:
        print(f"Reached the last page or an error occurred: {e}")
        break

# Close the driver
# driver.quit()

# Output the job links
for link in job_links:
    print(link)


Reached the last page or an error occurred: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[@aria-label='page 8']"}
  (Session info: chrome=117.0.5938.92); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000102e7ed98 chromedriver + 4337048
1   chromedriver                        0x0000000102e76e14 chromedriver + 4304404
2   chromedriver                        0x0000000102aa3a5c chromedriver + 293468
3   chromedriver                        0x0000000102ae8d50 chromedriver + 576848
4   chromedriver                        0x0000000102b23908 chromedriver + 817416
5   chromedriver                        0x0000000102adca5c chromedriver + 526940
6   chromedriver                        0x0000000102add908 chromedriver + 530696
7   chromedriver                        0x0000000102e44de4 chromedriver + 4

In [None]:
##Check the total number of job postings
len(job_links)

In [None]:
## Presents the soup with nested data structures
print(soup.prettify())

## Sample Extraction Step-by-step

In [11]:

driver.get('https://rbc.wd3.myworkdayjobs.com/en-US/RBCEARLYTALENT1/job/TORONTO-Ontario-Canada/Investigator_R-0000067226-1')
# Optionally, wait for the page to load completely
time.sleep(5)

# Fetch the page source and close the driver
page_source = driver.page_source

job_soup = BeautifulSoup(driver.page_source, 'html.parser')

job_soup
    

Job Title: Investigator
Date Posted: 2023-09-26
Employment Type: FULL_TIME
Job Location Type: TELECOMMUTE
Job Country: Canada
Job Locality: 20 KING ST W:TORONTO
Applicant Location Requirements: Canada
Hiring Organization: 0000050007 Royal Bank of Canada
Identifier Name: Investigator
Identifier Value: R-0000067226


{'jobLocation': {'@type': 'Place',
  'address': {'@type': 'PostalAddress',
   'addressCountry': 'Canada',
   'addressLocality': '20 KING ST W:TORONTO'}},
 'applicantLocationRequirements': {'@type': 'Country', 'name': 'Canada'},
 'jobLocationType': 'TELECOMMUTE',
 'hiringOrganization': {'name': '0000050007 Royal Bank of Canada',
  '@type': 'Organization',
  'sameAs': ''},
 'identifier': {'name': 'Investigator',
  '@type': 'PropertyValue',
  'value': 'R-0000067226'},
 'datePosted': '2023-09-26',
 'employmentType': 'FULL_TIME',
 'title': 'Investigator',
 'description': "Come Work with Us! At RBC, our culture is deeply supportive and rich in opportunity and reward. You will help our clients thrive and our communities prosper, empowered by a spirit of shared purpose. Whether you’re helping clients find new opportunities, developing new technology, or providing expert advice to internal partners, you will be doing work that matters in the world, in an environment built on teamwork, service, 

### Extracting from key value pairs

In [13]:
# Find JSON-LD data
script_tag = job_soup.find('script', type='application/ld+json')
json_data = json.loads(script_tag.string)

# Extracting information
job_title = json_data.get("title", "N/A")
date_posted = json_data.get("datePosted", "N/A")
employment_type = json_data.get("employmentType", "N/A")

job_location_type = json_data.get("jobLocationType", "N/A")
job_country = json_data.get("jobLocation", {}).get("address", {}).get("addressCountry", "N/A")
job_locality = json_data.get("jobLocation", {}).get("address", {}).get("addressLocality", "N/A")

applicant_location_requirements = json_data.get("applicantLocationRequirements", {}).get("name", "N/A")

hiring_organization = json_data.get("hiringOrganization", {}).get("name", "N/A")

identifier_name = json_data.get("identifier", {}).get("name", "N/A")
identifier_value = json_data.get("identifier", {}).get("value", "N/A")
job_description = json_data.get("description")

# Printing the extracted information
print(f"Job Title: {job_title}")
print(f"Date Posted: {date_posted}")
print(f"Employment Type: {employment_type}")
print(f"Job Location Type: {job_location_type}")
print(f"Job Country: {job_country}")
print(f"Job Locality: {job_locality}")
print(f"Applicant Location Requirements: {applicant_location_requirements}")
print(f"Hiring Organization: {hiring_organization}")
print(f"Identifier Name: {identifier_name}")
print(f"Identifier Value: {identifier_value}")
# print(f"Job Description: {job_description}")


Job Title: Investigator
Date Posted: 2023-09-26
Employment Type: FULL_TIME
Job Location Type: TELECOMMUTE
Job Country: Canada
Job Locality: 20 KING ST W:TORONTO
Applicant Location Requirements: Canada
Hiring Organization: 0000050007 Royal Bank of Canada
Identifier Name: Investigator
Identifier Value: R-0000067226


### Extracting from unstructured data from metadata

In [12]:
# Unstructured data as a string
description_str = job_description

# Define regular expressions for each field
regex_dict = {
    'Address': r'Address: ([\w\s,]+)',
    'City': r'City: ([\w\-]+)',
    'Country': r'Country: (\w+)',
    'Work hours/week': r'Work hours/week: ([\d.]+)',
    'Employment Type': r'Employment Type: (\w+)',
    'Platform': r'Platform: ([\w\s]+)',
    'Job Type': r'Job Type: ([\w\s/\(\)]+)',
    'Pay Type': r'Pay Type: (\w+)',
    'Posted Date': r'Posted Date: (\d{4}-\d{2}-\d{2})',
    'Application Deadline': r'Application Deadline: (\d{4}-\d{2}-\d{2})'
}

extracted_data = {}

# Run the regular expressions on the description string
for field, regex in regex_dict.items():
    match = re.search(regex, description_str)
    if match:
        extracted_data[field] = match.group(1)
    else:
        extracted_data[field] = 'N/A'

# Post-process the extracted data
for field, value in extracted_data.items():
    for other_field in regex_dict.keys():
        if other_field == field:
            continue
        position = value.find(other_field)
        if position != -1:
            extracted_data[field] = value[:position].strip()

# Print the extracted information
for field, value in extracted_data.items():
    print(f"{field}: {value}")

Address: 20 KING ST W
City: CAN-ON-TORONTO
Country: Canada
Work hours/week: 37.5
Employment Type: Full
Platform: Group Risk Management
Job Type: Contract (Fixed Term)
Pay Type: Salaried
Posted Date: 2023-09-26
Application Deadline: 2023-10-12


## Streamline extraction for every job posting


In [21]:
import pandas as pd
from tqdm import tqdm
# Create an empty list to store job data dictionaries
job_data_list = []

for job_link in tqdm(job_links):  # Assume job_links contains the URLs you're scraping
    driver.get(job_link)
    time.sleep(5)
    job_soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    job_data = {}  # Initialize a dictionary to store data for each job
    
    # Find JSON-LD data
    script_tag = job_soup.find('script', type='application/ld+json')
    json_data = json.loads(script_tag.string)
    
    # Extract structured data
    job_data['Job Title'] = json_data.get("title", "N/A")
    job_data['Date Posted'] = json_data.get("datePosted", "N/A")
    job_data['Employment Type'] = json_data.get("employmentType", "N/A")
    job_data['Job Location Type'] = json_data.get("jobLocationType", "N/A")
    job_data['Job Country'] = json_data.get("jobLocation", {}).get("address", {}).get("addressCountry", "N/A")
    job_data['Job Locality'] = json_data.get("jobLocation", {}).get("address", {}).get("addressLocality", "N/A")
    job_data['Applicant Location Requirements'] = json_data.get("applicantLocationRequirements", {}).get("name", "N/A")
    job_data['Hiring Organization'] = json_data.get("hiringOrganization", {}).get("name", "N/A")
    job_data['Identifier Name'] = json_data.get("identifier", {}).get("name", "N/A")
    job_data['Identifier Value'] = json_data.get("identifier", {}).get("value", "N/A")
    
    # Extract unstructured data using regular expressions
    description_str = json_data.get("description", "N/A")
    for field, regex in regex_dict.items():
        match = re.search(regex, description_str)
        job_data[field] = match.group(1) if match else 'N/A'
    
    # Append the filled dictionary to the list
    job_data_list.append(job_data)
    
    # Unstructured data as a string
    description_str = job_description

    # Define regular expressions for each field
    regex_dict = {
        'Address': r'Address: ([\w\s,]+)',
        'City': r'City: ([\w\-]+)',
        'Country': r'Country: (\w+)',
        'Work hours/week': r'Work hours/week: ([\d.]+)',
        'Employment Type': r'Employment Type: (\w+)',
        'Platform': r'Platform: ([\w\s]+)',
        'Job Type': r'Job Type: ([\w\s/\(\)]+)',
        'Pay Type': r'Pay Type: (\w+)',
        'Posted Date': r'Posted Date: (\d{4}-\d{2}-\d{2})',
        'Application Deadline': r'Application Deadline: (\d{4}-\d{2}-\d{2})'
    }
    
    
    # Running the regex on the description string
    extracted_data = {}
    for field, regex in regex_dict.items():
        match = re.search(regex, description_str)
        if match:
            extracted_data[field] = match.group(1)
        else:
            extracted_data[field] = 'N/A'
    
    # Post-process the extracted data
    for field, value in extracted_data.items():
        for other_field in regex_dict.keys():
            if other_field == field:
                continue
            position = value.find(other_field)
            if position != -1:
                extracted_data[field] = value[:position].strip()
    
    # Merging structured and unstructured data into job_data
    job_data.update(extracted_data)
    
    # Append to job_data_list
    job_data_list.append(job_data)

# Convert the list of dictionaries to a DataFrame
job_df = pd.DataFrame(job_data_list)

# Optional: Save the DataFrame to a CSV file
job_df.to_csv('job_data.csv', index=False)


 38%|███████████████                         | 51/136 [05:35<09:18,  6.57s/it]


AttributeError: 'NoneType' object has no attribute 'string'

In [22]:
driver.quit()

In [26]:
job_df = pd.DataFrame(job_data_list)
job_df 

Unnamed: 0,Job Title,Date Posted,Employment Type,Job Location Type,Job Country,Job Locality,Applicant Location Requirements,Hiring Organization,Identifier Name,Identifier Value,Address,City,Country,Work hours/week,Platform,Job Type,Pay Type,Posted Date,Application Deadline
0,Investigator,2023-09-26,Full,TELECOMMUTE,Canada,20 KING ST W:TORONTO,Canada,0000050007 Royal Bank of Canada,Investigator,R-0000067226,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
1,Investigator,2023-09-26,Full,TELECOMMUTE,Canada,20 KING ST W:TORONTO,Canada,0000050007 Royal Bank of Canada,Investigator,R-0000067226,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
2,"2024 Wealth Management, GAM Winter Student (8 ...",2023-09-22,Full,TELECOMMUTE,Canada,"RBC CENTRE, 155 WELLINGTON ST W:TORONTO",Canada,0000050353 The Royal Trust Company,"2024 Wealth Management, GAM Winter Student (8 ...",R-0000068482,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
3,"2024 Wealth Management, GAM Winter Student (8 ...",2023-09-22,Full,TELECOMMUTE,Canada,"RBC CENTRE, 155 WELLINGTON ST W:TORONTO",Canada,0000050353 The Royal Trust Company,"2024 Wealth Management, GAM Winter Student (8 ...",R-0000068482,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
4,"Manager Oversight Testing - Sanctions, Financi...",2023-09-21,Full,TELECOMMUTE,Canada,20 KING ST W:TORONTO,Canada,0000050007 Royal Bank of Canada,"Manager Oversight Testing - Sanctions, Financi...",R-0000065222,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,"2024 Capital Markets, Business &amp; Client Se...",2023-09-21,Full,TELECOMMUTE,Canada,120 WESTERN PKY:BEDFORD,Canada,0000050007 Royal Bank of Canada,"2024 Capital Markets, Business &amp; Client Se...",R-0000068568,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
98,2024 Wealth Management IT Summer Intern,2023-09-21,Full,TELECOMMUTE,United States of America,250 NICOLLET MALL:MINNEAPOLIS,United States of America,"0000050176 RBC Capital Markets, LLC",2024 Wealth Management IT Summer Intern,R-0000067976,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
99,2024 Wealth Management IT Summer Intern,2023-09-21,Full,TELECOMMUTE,United States of America,250 NICOLLET MALL:MINNEAPOLIS,United States of America,"0000050176 RBC Capital Markets, LLC",2024 Wealth Management IT Summer Intern,R-0000067976,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
100,"2024 Capital Markets, Global Credit, Winter Bu...",2023-09-21,Full,TELECOMMUTE,Canada,"ROYAL BANK PLAZA, 200 BAY ST:TORONTO",Canada,0000050599 RBC Dominion Securities Inc.,"2024 Capital Markets, Global Credit, Winter Bu...",R-0000068529,20 KING ST W,CAN-ON-TORONTO,Canada,37.5,Group Risk Management,Contract (Fixed Term),Salaried,2023-09-26,2023-10-12
