In [36]:
import csv
import datetime
import time
from bs4 import BeautifulSoup
import requests

def scrape_job_details(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    job_details = {}

    # Extracting job details
    title_elem = soup.find('span', property='title')
    job_details['Title'] = title_elem.get_text().strip() if title_elem else ''

    date_elem = soup.find('span', property='datePosted')
    job_details['Date Posted'] = date_elem.get_text().strip() if date_elem else ''

    employer_elem = soup.find('span', property='hiringOrganization')
    job_details['Employer'] = employer_elem.get_text().strip() if employer_elem else ''

    location_elem = soup.find('span', property='addressLocality')
    region_elem = soup.find('span', property='addressRegion')
    job_details['Location'] = f"{location_elem.get_text().strip()}, {region_elem.get_text().strip()}" if location_elem and region_elem else ''

    min_value = soup.find('span', property='minValue').get_text().strip() if soup.find('span', property='minValue') else ''
    unit_text = soup.find('span', class_='hidden', property='unitText').get_text().strip() if soup.find('span', class_='hidden', property='unitText') else ''
    job_details['Salary'] = f"{min_value} {unit_text}" if min_value and unit_text else ''

    workhour_elem = soup.find('span', property='workHours')
    job_details['Work Hours'] = workhour_elem.get_text().strip() if workhour_elem else ''
    
    employment_type_elem = soup.find('span', property='employmentType')
    job_details['Employment Type'] = employment_type_elem.get_text().strip() if employment_type_elem else ''


    start_date_elem = soup.find('span', class_='wb-inv', string='Start date').find_next_sibling() if soup.find('span', class_='wb-inv', string='Start date') else None
    job_details['Start Date'] = start_date_elem.get_text().strip() if start_date_elem else ''
    
#     li_element = soup.find('span', class_='fa fa-gift')
#     job_details['Benefits'] = li_element.get_text(strip=True) if li_element else ''

    
    vacancies_elem = soup.find('span', class_='wb-inv', string='vacancies').find_next_sibling() if soup.find('span', class_='wb-inv', string='vacancies') else None
    job_details['Vacancies'] = vacancies_elem.get_text().strip() if vacancies_elem else ''

    

    languages_elem = soup.find('h4', string='Languages').find_next_sibling() if soup.find('h4', string='Languages') else None
    job_details['Languages'] = languages_elem.get_text().strip() if languages_elem else ''

    education_elem = soup.find('h4', string='Education').find_next_sibling() if soup.find('h4', string='Education') else None
    job_details['Education'] = education_elem.get_text().strip() if education_elem else ''

    experience_elem = soup.find('h4', string='Experience').find_next_sibling() if soup.find('h4', string='Experience') else None
    job_details['Experience'] = experience_elem.get_text().strip() if experience_elem else ''

    # Additional scraping for responsibilities
    responsibilities_elem = soup.find('div', property='responsibilities')
    if responsibilities_elem:
        job_details['Responsibilities'] = ', '.join([li.get_text().strip() for li in responsibilities_elem.find_all('li')])
    else:
        job_details['Responsibilities'] = ''
        
        
    jobbenefits_elem = soup.find('div', property='jobBenefits')
    if jobbenefits_elem:
        job_details['Benefits'] = ', '.join([li.get_text().strip() for li in jobbenefits_elem.find_all('li')])
    else:
        job_details['Benefits'] = ''
        
    valid_through_elem = soup.find('p', property='validThrough')
    job_details['Valid Through'] = valid_through_elem.get_text().strip() if valid_through_elem else ''
        
    job_details['url'] = url

    return job_details

# Read URLs from CSV file
job_links = []
with open('job_links_ON20400_03_28.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip header
    for row in reader:
        job_links.append(row[0])

# Scrape job details for each URL
job_details_list = []
for url in job_links:
    job_details = scrape_job_details(url)
    job_details_list.append(job_details)

# # Write job details to CSV file
# with open('job_details.csv', 'w', newline='', encoding='utf-8') as csvfile:
#     fieldnames = ['Title', 'Date Posted', 'Employer', 'Location', 'Salary','Work Hours', 'Employment Type', 'Start Date', 'Vacancies',  'Languages', 'Education', 'Experience', 'Responsibilities','Benefits','Valid Through','url']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for job_details in job_details_list:
#         writer.writerow(job_details)

# print("Job details have been saved to job_details.csv")

# Generate CSV file name
num_rows = len(job_details_list)
current_date = datetime.date.today().strftime("%m_%d")
csv_filename = f"job_detail_ON_{num_rows}_{current_date}.csv"

# Write job details to CSV file
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Title', 'Date Posted', 'Employer', 'Location', 'Salary', 'Work Hours', 'Employment Type', 'Start Date', 'Vacancies', 'Languages', 'Education', 'Experience', 'Responsibilities', 'Benefits', 'Valid Through', 'url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for job_details in job_details_list:
        writer.writerow(job_details)



# Read job details from CSV file and print progress
print("Reading job details from CSV file:")
start_time = time.time()
num_urls_read = 0
with open(csv_filename, 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        num_urls_read += 1
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds | URLs read: {num_urls_read}", end='\r')
        # Simulate some processing time for demonstration purposes
        time.sleep(0.1)  # You can remove this line in your actual code

print("\nAll job details have been read from the CSV file.")
print(f"Job details have been saved to {csv_filename}")


Reading job details from CSV file:
Elapsed time: 2223.12 seconds | URLs read: 20400
All job details have been read from the CSV file.
Job details have been saved to job_detail_ON_20400_03_29.csv


In [37]:
import pandas as pd
df = pd.read_csv('job_detail_ON_20400_03_29.csv')

In [38]:
df.columns

Index(['Title', 'Date Posted', 'Employer', 'Location', 'Salary', 'Work Hours',
       'Employment Type', 'Start Date', 'Vacancies', 'Languages', 'Education',
       'Experience', 'Responsibilities', 'Benefits', 'Valid Through', 'url'],
      dtype='object')

In [39]:
df.head()

Unnamed: 0,Title,Date Posted,Employer,Location,Salary,Work Hours,Employment Type,Start Date,Vacancies,Languages,Education,Experience,Responsibilities,Benefits,Valid Through,url
0,prosthetic assistant,"Posted on March 23, 2024",Metro,"Sarnia, ON",,,,Starts as soon as possible,1 vacancy,,,,,,2024-04-21,https://www.jobbank.gc.ca/jobsearch/jobposting...
1,cabinetmaker,"Posted on February 28, 2024",Sympro Mfg Inc.,"Mississauga, ON",27.00 HOUR,30 to 40 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Secondary (high) school graduation certificate,1 to less than 7 months,"Study plans, specifications or drawings of art...",,2024-04-27,https://www.jobbank.gc.ca/jobsearch/jobposting...
2,restoration carpenter,"Posted on March 18, 2024",Supra Restoration Services LTD,"Bradford, ON",30.00 HOUR,40 hours per week,Term or contract (ending: 2027-07-02)Full time,,3 vacancies,English,Secondary (high) school graduation certificate,2 years to less than 3 years,"Read and interpret blueprints, drawings and sk...",,2024-04-17,https://www.jobbank.gc.ca/jobsearch/jobposting...
3,painter,"Posted on March 12, 2024",Majestic Renovations,"Bradford, ON",30.00 HOUR,32 to 44 hours per week,Permanent employmentFull time,Starts as soon as possible,2 vacancies,English,"No degree, certificate or diploma",3 years to less than 5 years,"Prepare, clean and sand surfaces to be painted...",,2024-04-11,https://www.jobbank.gc.ca/jobsearch/jobposting...
4,marketing coordinator,"Posted on March 25, 2024","Vaco, LLC","Mississauga, ON",,,,Starts as soon as possible,1 vacancy,,,,,,2024-04-26,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [40]:
df.shape

(20400, 16)

In [41]:
df['Start Date'].unique()

array(['Starts as soon as possible', nan], dtype=object)

In [42]:
df.tail()

Unnamed: 0,Title,Date Posted,Employer,Location,Salary,Work Hours,Employment Type,Start Date,Vacancies,Languages,Education,Experience,Responsibilities,Benefits,Valid Through,url
20395,furniture mover,"Posted on March 06, 2024",TR Overseas Moving 1927 Inc,"Scarborough, ON",17.00 HOUR,30 to 45 hours per week,Permanent employmentFull time,Starts as soon as possible,15 vacancies,English,Secondary (high) school graduation certificate...,Experience an asset,"Load, unload and move products and materials b...","Dental plan, Health care plan, Vision care ben...",2024-04-05,https://www.jobbank.gc.ca/jobsearch/jobposting...
20396,,,,,,,,,,,,,,,,https://www.jobbank.gc.ca/jobsearch/jobposting...
20397,administrative assistant - office,"Posted on March 12, 2024",Atlantic Immigration Limited,"Mississauga, ON",20.00 HOUR,35 to 44 hours per week,Permanent employmentFull time,Starts as soon as possible,3 vacancies,English,"College, CEGEP or other non-university certifi...",7 months to less than 1 year,Establish and implement policies and procedure...,"Dental plan, Health care plan, Vision care ben...",2024-04-11,https://www.jobbank.gc.ca/jobsearch/jobposting...
20398,,,,,,,,,,,,,,,,https://www.jobbank.gc.ca/jobsearch/jobposting...
20399,human resources director,"Posted on March 26, 2024",Waterloo Region District School Board,"Kitchener, ON",,,Permanent employmentFull time,Starts as soon as possible,1 vacancy,,,,,,2024-05-25,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [43]:
df.isnull().sum()

Title                 399
Date Posted           399
Employer              399
Location              399
Salary               5516
Work Hours           9062
Employment Type      1986
Start Date            858
Vacancies             589
Languages            9062
Education            9091
Experience           9091
Responsibilities     9675
Benefits            16725
Valid Through        4377
url                     0
dtype: int64

In [50]:
df['Languages'].unique()

array([nan, 'English', 'English or French', 'Bilingual', 'French'],
      dtype=object)

In [49]:
df['Salary']

0               NaN
1        27.00 HOUR
2        30.00 HOUR
3        30.00 HOUR
4               NaN
            ...    
20395    17.00 HOUR
20396           NaN
20397    20.00 HOUR
20398           NaN
20399           NaN
Name: Salary, Length: 20400, dtype: object