In [1]:
import csv
import datetime
import time
from bs4 import BeautifulSoup
import requests

def _text_or_blank(elem):
    """Return the whitespace-stripped text of ``elem``, or '' when it is missing."""
    return elem.get_text().strip() if elem else ''


def _sibling_text(soup, tag_name, **criteria):
    """Return the text of the element that follows the first ``tag_name`` match.

    ``criteria`` is forwarded to ``soup.find``. Returns '' when either the
    matching element or its next sibling does not exist.
    """
    label = soup.find(tag_name, **criteria)
    return _text_or_blank(label.find_next_sibling() if label else None)


def _list_items_text(container):
    """Join the text of every <li> under ``container`` with ', ' ('' if missing)."""
    if not container:
        return ''
    return ', '.join(li.get_text().strip() for li in container.find_all('li'))


def scrape_job_details(url, timeout=30):
    """Download one job posting page and extract its fields into a dict.

    Args:
        url: Address of the job posting page.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        dict mapping the column names 'Title', 'Date Posted', 'Employer',
        'Location', 'Salary', 'Work Hours', 'Employment Type', 'Start Date',
        'Vacancies', 'Languages', 'Education', 'Experience',
        'Responsibilities', 'Benefits', 'Valid Through' and 'url' to strings.
        A field whose markup is not found on the page is returned as ''.

    Raises:
        requests.RequestException: if the request fails or times out.

    Note:
        The HTTP status code is not checked, so an error page is parsed like
        any other page and simply yields blank fields.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    def span_text(prop, **extra):
        # Text of the first <span> carrying the given ``property`` attribute.
        return _text_or_blank(soup.find('span', property=prop, **extra))

    job_details = {}

    job_details['Title'] = span_text('title')
    job_details['Date Posted'] = span_text('datePosted')
    job_details['Employer'] = span_text('hiringOrganization')

    # Location is only reported when both the locality and the region exist.
    locality_elem = soup.find('span', property='addressLocality')
    region_elem = soup.find('span', property='addressRegion')
    if locality_elem and region_elem:
        job_details['Location'] = f"{locality_elem.get_text().strip()}, {region_elem.get_text().strip()}"
    else:
        job_details['Location'] = ''

    # Salary needs both the amount and its unit; otherwise it is left blank.
    min_value = span_text('minValue')
    unit_text = span_text('unitText', class_='hidden')
    job_details['Salary'] = f"{min_value} {unit_text}" if min_value and unit_text else ''

    job_details['Work Hours'] = span_text('workHours')
    job_details['Employment Type'] = span_text('employmentType')

    # These values sit in the element right after a label element.
    job_details['Start Date'] = _sibling_text(soup, 'span', class_='wb-inv', string='Start date')
    job_details['Vacancies'] = _sibling_text(soup, 'span', class_='wb-inv', string='vacancies')
    job_details['Languages'] = _sibling_text(soup, 'h4', string='Languages')
    job_details['Education'] = _sibling_text(soup, 'h4', string='Education')
    job_details['Experience'] = _sibling_text(soup, 'h4', string='Experience')

    # List-valued sections are flattened into one comma-separated string.
    job_details['Responsibilities'] = _list_items_text(soup.find('div', property='responsibilities'))
    job_details['Benefits'] = _list_items_text(soup.find('div', property='jobBenefits'))

    job_details['Valid Through'] = _text_or_blank(soup.find('p', property='validThrough'))

    job_details['url'] = url

    return job_details

# Read URLs from CSV file (first column of every row after the header)
job_links = []
with open('job_links_small_provinces_11852_03_31.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
    for row in reader:
        # csv.reader yields [] for blank lines, where row[0] would raise IndexError.
        # Rows whose first cell is blank are skipped too, since they hold no URL.
        if row and row[0].strip():
            job_links.append(row[0].strip())

# Scrape job details for each URL
job_details_list = []
failed_urls = []  # URLs whose request raised, kept so they can be retried later
num_urls_read = 0
for num_urls_read, url in enumerate(job_links, 1):
    try:
        job_details = scrape_job_details(url)
    except requests.RequestException as exc:
        # A single unreachable posting must not discard everything scraped so far.
        failed_urls.append(url)
        print(f"\nFailed to fetch {url}: {exc}")
    else:
        job_details_list.append(job_details)

    # Print the progress message (carriage return keeps it on one line)
    print(f"Number of links read: {num_urls_read}", end='\r')

if failed_urls:
    print(f"\n{len(failed_urls)} links could not be fetched.")
    
# Build the output file name from the number of scraped rows and today's date (MM_DD)
num_rows = len(job_details_list)
current_date = datetime.date.today().strftime("%m_%d")
csv_filename = f"job_detail_small_provinces_{num_rows}_{current_date}.csv"

# Column order of the output file
fieldnames = [
    'Title', 'Date Posted', 'Employer', 'Location',
    'Salary', 'Work Hours', 'Employment Type', 'Start Date',
    'Vacancies', 'Languages', 'Education', 'Experience',
    'Responsibilities', 'Benefits', 'Valid Through', 'url',
]

# Write the header followed by every scraped record
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(job_details_list)

# Report completion (the leading newline moves past the '\r' progress line)
print(
    "\nAll job details have been read.",
    f"Job details have been saved to {csv_filename}",
    sep="\n",
)


Number of links read: 11852
All job details have been read.
Job details have been saved to job_detail_small_provinces_11852_03_31.csv


In [29]:
# Load a job-detail CSV into a DataFrame for inspection.
# NOTE(review): this reads 'job_detail_AB_13993_03_30.csv', not the
# small-provinces file written by the scraping cell — confirm this is intended.
import pandas as pd
df = pd.read_csv('job_detail_AB_13993_03_30.csv')

In [30]:
# Show the column labels of the loaded frame.
df.columns

Index(['Title', 'Date Posted', 'Employer', 'Location', 'Salary', 'Work Hours',
       'Employment Type', 'Start Date', 'Vacancies', 'Languages', 'Education',
       'Experience', 'Responsibilities', 'Benefits', 'Valid Through', 'url'],
      dtype='object')

In [31]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Title,Date Posted,Employer,Location,Salary,Work Hours,Employment Type,Start Date,Vacancies,Languages,Education,Experience,Responsibilities,Benefits,Valid Through,url
0,food service supervisor,"Posted on March 28, 2024",Bar Burritos,"Edson, AB",18.00 HOUR,35 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Secondary (high) school graduation certificate,1 year to less than 2 years,"Establish methods to meet work schedules, Requ...",,2024-04-27,https://www.jobbank.gc.ca/jobsearch/jobposting...
1,construction painter,"Posted on March 25, 2024",Zematar Construction Ltd.,"Calgary, AB",25.00 HOUR,30 to 40 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Secondary (high) school graduation certificate,1 year to less than 2 years,"Prepare, clean and sand surfaces to be painted...",,2024-04-24,https://www.jobbank.gc.ca/jobsearch/jobposting...
2,food counter attendant,"Posted on March 22, 2024",MYLONAS ENTERPRISES LTD. O/A MR. SUB,"Calgary, AB",15.50 HOUR,30 to 40 hours per week,Permanent employmentFull time,Starts as soon as possible,4 vacancies,English,"No degree, certificate or diploma",Will train,"Clear and clean tables, trays and chairs, Load...",Other benefits,2024-04-21,https://www.jobbank.gc.ca/jobsearch/jobposting...
3,"teacher, kindergarten","Posted on March 26, 2024",Foundations for the Future Charter Academy,"Calgary, AB",,,Permanent employmentFull time,Starts as soon as possible,1 vacancy,,,,,,2024-05-25,https://www.jobbank.gc.ca/jobsearch/jobposting...
4,administrative assistant,"Posted on March 19, 2024",TOP CHOICE LOGISTICS INC.,"Calgary, AB",27.00 HOUR,35 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Secondary (high) school graduation certificate,7 months to less than 1 year,"Arrange and co-ordinate seminars, conferences,...",,2024-04-18,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [32]:
# (number of rows, number of columns)
df.shape

(13993, 16)

In [34]:
# Distinct values found in the 'Start Date' column (missing values show as nan).
df['Start Date'].unique()

array(['Starts as soon as possible', nan], dtype=object)

In [35]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,Title,Date Posted,Employer,Location,Salary,Work Hours,Employment Type,Start Date,Vacancies,Languages,Education,Experience,Responsibilities,Benefits,Valid Through,url
13988,cabinetmaker,"Posted on March 01, 2024",1772399 ALBERTA LTD.,"Fort McMurray, AB",30.00 HOUR,35 hours per week,Permanent employmentFull time,Starts as soon as possible,2 vacancies,English,Secondary (high) school graduation certificate,1 year to less than 2 years,"Study plans, specifications or drawings of art...",Group insurance benefits,2024-03-31,https://www.jobbank.gc.ca/jobsearch/jobposting...
13989,food counter attendant,"Posted on March 20, 2024",2352083 Alberta Ltd,"Hinton, AB",15.50 HOUR,35 hours per week,Permanent employmentFull time,Starts as soon as possible,2 vacancies,English,"No degree, certificate or diploma",Will train,"Keep records of the quantities of food used, P...",,2024-04-19,https://www.jobbank.gc.ca/jobsearch/jobposting...
13990,apprentice carpenter,"Posted on March 19, 2024",Triangle Enterprises Ltd.,"Calgary, AB",18.00 HOUR,40 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Registered Apprenticeship certificate\n\nor eq...,2 years to less than 3 years,"Measure, cut, shape, assemble and join materia...","Dental plan, Health care plan, Paramedical ser...",2024-04-03,https://www.jobbank.gc.ca/jobsearch/jobposting...
13991,computer network technician,"Posted on February 26, 2024",Allamani Technical Solutions Ltd,"Grande Prairie, AB",33.65 HOUR,30 to 35 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,College/CEGEP\n\nor equivalent experience,7 months to less than 1 year,"Maintain, troubleshoot and administer the use ...",,2024-04-25,https://www.jobbank.gc.ca/jobsearch/jobposting...
13992,journeyman/woman plumber,"Posted on March 26, 2024",Trust Plumbing & Gasfitting Ltd.,"Calgary, AB",40.00 HOUR,,Full time,Starts as soon as possible,1 vacancy,,,,,,,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [36]:
# Number of missing values in each column.
df.isnull().sum()

Title                 155
Date Posted           155
Employer              155
Location              155
Salary               2262
Work Hours           3358
Employment Type       821
Start Date            386
Vacancies             161
Languages            3358
Education            3364
Experience           3364
Responsibilities     3598
Benefits            11859
Valid Through        1461
url                     0
dtype: int64

In [37]:
# Distinct values found in the 'Languages' column (missing values show as nan).
df['Languages'].unique()

array(['English', nan, 'English or French', 'Bilingual', 'French'],
      dtype=object)

In [38]:
# Display the 'Salary' column; the output shows it is held as dtype object
# (text such as "18.00 HOUR"), not as a numeric column.
df['Salary']

0        18.00 HOUR
1        25.00 HOUR
2        15.50 HOUR
3               NaN
4        27.00 HOUR
            ...    
13988    30.00 HOUR
13989    15.50 HOUR
13990    18.00 HOUR
13991    33.65 HOUR
13992    40.00 HOUR
Name: Salary, Length: 13993, dtype: object

In [39]:
# Summary statistics per column; the output here reports count/unique/top/freq,
# i.e. the non-numeric form of describe().
df.describe()

Unnamed: 0,Title,Date Posted,Employer,Location,Salary,Work Hours,Employment Type,Start Date,Vacancies,Languages,Education,Experience,Responsibilities,Benefits,Valid Through,url
count,13838,13838,13838,13838,11731,10635,13172,13607,13832,10635,10629,10629,10395,2134,12532,13993
unique,2000,38,7646,279,884,203,35,1,24,4,55,9,6602,647,50,13993
top,food service supervisor,"Posted on March 28, 2024",ApplytoEducation,"Edmonton, AB",28.85 HOUR,40 hours per week,Permanent employmentFull time,Starts as soon as possible,1 vacancy,English,Secondary (high) school graduation certificate,1 year to less than 2 years,"Calculate and prepare cheques for payroll, Cal...",Other benefits,2024-04-27,https://www.jobbank.gc.ca/jobsearch/jobposting...
freq,858,974,94,4440,666,2620,11417,13607,8522,10586,5793,3827,178,233,962,1


In [None]:
|