In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings

In [10]:
warnings.filterwarnings("ignore")

In [11]:


# Load the list of job-posting links from disk; later cells read the
# 'Job Link' column of this frame.
df_urls = pd.read_csv(filepath_or_buffer='job_links.csv')

In [12]:
# Preview the first five rows of the loaded links.
df_urls.head(n=5)

Unnamed: 0,Job Link
0,https://www.jobbank.gc.ca/jobsearch/jobposting...
1,https://www.jobbank.gc.ca/jobsearch/jobposting...
2,https://www.jobbank.gc.ca/jobsearch/jobposting...
3,https://www.jobbank.gc.ca/jobsearch/jobposting...
4,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [13]:
# Scrape the summary list of every job posting in df_urls and collect one row
# per URL in df_extracted_details.

# Output columns in display order; every record carries exactly these keys.
DETAIL_COLUMNS = ['URL', 'Location', 'Salary', 'Employment Type', 'Work Hours',
                  'Start Date', 'Benefits', 'Vacancies', 'Verified', 'Source']

# Timeout (seconds) passed to requests so a stalled server cannot block the
# cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _store_attribute(details, column, li):
    """Copy the text of ``li``'s ``attribute-value`` span into ``details[column]``.

    The text fragments of the span are joined with single spaces. When the
    item has no such span, ``details`` is left untouched so a value found in
    an earlier item is not overwritten.
    """
    value_span = li.find('span', class_='attribute-value')
    if value_span is not None:
        details[column] = ' '.join(value_span.stripped_strings)


def _parse_job_posting(soup):
    """Extract the posting details from a parsed job-posting page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one job-posting page.

    Returns
    -------
    dict
        One entry for every name in ``DETAIL_COLUMNS`` except ``'URL'``.
        Fields that are not found on the page are ``None``.
    """
    details = dict.fromkeys(DETAIL_COLUMNS[1:])  # every field starts as None

    # Match on the single 'job-posting-brief' class: passing the full
    # multi-class string would only match that exact attribute value and
    # break as soon as the layout class changes or is reordered.
    brief_list = soup.find('ul', class_='job-posting-brief')
    if brief_list is None:
        return details

    for li in brief_list.find_all('li'):
        text = li.get_text(strip=True)

        # The first matching rule wins, so the order of the branches matters.
        if "Location" in text:
            parts = [span.get_text(strip=True)
                     for span in li.find_all('span', class_='city')]
            if 'Remote work available' in text:
                parts.append('Remote work available')
            # Single-space join; empty fragments are dropped so no doubled
            # spaces end up in the result.
            details['Location'] = ' '.join(part for part in parts if part)
        elif "Salary" in text:
            _store_attribute(details, 'Salary', li)
        elif "Terms of employment" in text:
            _store_attribute(details, 'Employment Type', li)
        elif li.find(class_='fa-clock-o') is not None:
            # 'fa-clock-o' is a CSS class, not visible text, so it has to be
            # looked up in the markup rather than in get_text() output.
            # NOTE(review): presumably the clock icon marks the work-hours
            # item -- confirm against the live page markup.
            _store_attribute(details, 'Work Hours', li)
        elif "Start date" in text:
            _store_attribute(details, 'Start Date', li)
        elif "Benefits" in text:
            _store_attribute(details, 'Benefits', li)
        elif "vacancies" in text:
            _store_attribute(details, 'Vacancies', li)
        elif "Verified" in text:
            details['Verified'] = "Verified"
        elif "Source" in text:
            _store_attribute(details, 'Source', li)

    return details


# Records are accumulated in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic anyway.
extracted_records = []
try:
    for index, row in df_urls.iterrows():
        url = row['Job Link']
        details = dict.fromkeys(DETAIL_COLUMNS[1:])
        failure = None
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
        except requests.RequestException as error:
            # Keep a row for the URL (all details None) and move on instead
            # of aborting the whole run on one bad link.
            failure = error
        else:
            details = _parse_job_posting(
                BeautifulSoup(response.content, 'html.parser'))

        record = {'URL': url, **details}
        extracted_records.append(record)

        # Print URL and extracted details
        print(f"URL [{index+1}]: {url}")
        if failure is not None:
            print(f"Request failed: {failure}")
        print(f"Extracted: {pd.Series(record, name=len(extracted_records) - 1)}\n")
finally:
    # Built in ``finally`` so that rows gathered before an interruption
    # (e.g. a manual kernel interrupt) are still available afterwards.
    df_extracted_details = pd.DataFrame(extracted_records, columns=DETAIL_COLUMNS)

# Print DataFrame with extracted details
print(df_extracted_details)

URL [1]: https://www.jobbank.gc.ca/jobsearch/jobposting/40630614;jsessionid=5C634C8A45AF3D19F0E6E04F16919FBD.jobsearch75?source=searchresults
Extracted: URL                https://www.jobbank.gc.ca/jobsearch/jobposting...
Location                                                        None
Salary                                                          None
Employment Type                                                 None
Work Hours                                                      None
Start Date                                                      None
Benefits                                                        None
Vacancies                                                       None
Verified                                                        None
Source                                                          None
Name: 0, dtype: object

URL [2]: https://www.jobbank.gc.ca/jobsearch/jobposting/40630117;jsessionid=5C634C8A45AF3D19F0E6E04F16919FBD.jobsearch75?source=searc

KeyboardInterrupt: 