## Data Collection
- Scrape job postings data from jobstreet (PH)
- Include columns such as "Job Name", "Company Name", "Location", "Date Posted", "Job Specialization", "Job Type", and "Job Link"
- Convert the "Date Posted" column from time data to date format using datetime library
- Automate the web scraper to collect data up to the last available job postings and save it in a CSV file

In [10]:
import csv
import requests
from bs4 import BeautifulSoup

import re
import datetime
current_time = datetime.datetime.now()


In [11]:
# datetime conversation to date only
def convert_datetime(dt_posted):
    if dt_posted == 'now':
        return datetime.datetime.now().strftime("%Y-%m-%d")
    
    hours = 0
    minutes = 0
    days = 0
    
    match = re.match(r"(\d+)h ago", dt_posted)
    if match:
        hours = int(match.group(1))
    
    match = re.match(r"(\d+)m ago", dt_posted)
    if match:
        minutes = int(match.group(1))
    
    match = re.match(r"(\d+)d ago", dt_posted)
    if match:
        days = int(match.group(1))
    
    date = datetime.datetime.now() - datetime.timedelta(days=days, hours=hours, minutes=minutes)
    return date.strftime("%Y-%m-%d")

In [12]:
main_url = "https://www.jobstreet.com.ph/en/job-search/job-vacancy.php?pg={page}"
max_page = 2 # maximum pages to scrape

In [71]:
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.find_all('div', class_='z1s6m00'))

[<div class="z1s6m00" style="min-height:100vh"><div class="z1s6m00 _1hbhsw65a _1hbhsw6fq _1hbhsw6fe _1hbhsw6ga"><div class="z1s6m00 _1hbhsw65a _1hbhsw6ga _1hbhsw6fq _1hbhsw6p y759cs0 y44q7i18 y44q7i1b _1hbhsw632 _1hbhsw635"><div class="z1s6m00 _1hbhsw6p _1hbhsw6v cbtxjf0"><div class="z1s6m00 _1hbhsw69q _1hbhsw68m _1hbhsw65a _1hbhsw6fe _1hbhsw6fy _1hbhsw6p _1hbhsw6n _1hbhsw65f y759cs0 y759cs2 y44q7i18 y44q7i1b _1hbhsw632 _1hbhsw635"><div class="z1s6m00 _1hbhsw6fe _1hbhsw65a _1hbhsw6n"><div class="z1s6m00 _1hbhsw696"><a class="z1s6m00 z1s6m0f m3ze5h0" data-automation="Logo" href="/"><div class="z1s6m00 _1hbhsw65i _1hbhsw60 _1szqkis0" data-automation="jobstreet">JobStreet</div><svg height="30" viewbox="0 0 1620.35 250" xmlns="http://www.w3.org/2000/svg"><path d="M641 165.11l25.47-14.89c5.4 14 16 23.09 34.75 23.09 17.92 0 24.18-7.55 24.18-16 0-11.23-10.14-15.54-32.59-22-23.1-6.69-45.55-15.33-45.55-44 0-27.46 26.21-43.73 50.83-43.73 23.79 0 42.74 11.48 52.78 31.65l-25 14.46c-5.39-11.44-13.9

In [70]:
# Create a new CSV file and write the headers as the first row
with open('jobs.csv', mode='w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(['Job Name', 'Company Name', 'Location', 'Date Posted', 'Job Specialization', 'Job Type', 'Job Link'])
    
    # Iterate over each page and extract the job information
    for page in range(1, max_page+1):
        url = main_url.format(page=page)
        response = requests.get(url)
        
        soup = BeautifulSoup(response.content, 'html.parser')
    
        for job in soup.find_all('div', class_='z1s6m00'):
            try:
                job_name = job.find('div', class_='z1s6m00 l3gun70 l3gun74 l3gun72').get_text()
                company_name = job.find('span', class_='z1s6m00 _17dyj7u1 _1hbhsw64y _1hbhsw60 _1hbhsw6r').get_text()
                location = job.find('span', class_='z1s6m00 _1hbhsw64y y44q7i0 y44q7i3 y44q7i21 y44q7ih').get_text()
                date_posted = convert_datetime(job.find('span', class_='z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih').get_text())
                
                job_specialization = soup.find("dt", text="Job Specializations").find_next_sibling("dd").get_text()
                job_type = soup.find("dt", text="Job Type").find_next_sibling("dd").get_text()

                job_link = "https://www.jobstreet.com.ph" + job.find('a', class_ = 'jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h')['href']

                # Write the extracted information to the CSV file
                writer.writerow([job_name, company_name, location, date_posted, job_specialization, job_type, job_link])
            except Exception as e:
                job_name = ''
                company_name = ''
                location = ''
                date_posted = ''
                job_specialization = ''
                job_type = ''
                job_link = ''
                print(f"Failed to extract job information: {e}")

  job_specialization = soup.find("dt", text="Job Specializations").find_next_sibling("dd").get_text()
  job_type = soup.find("dt", text="Job Type").find_next_sibling("dd").get_text()


Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job inform

Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job inform

Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job inform

Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job information: 'NoneType' object has no attribute 'get_text'
Failed to extract job inform

In [9]:
from bs4 import BeautifulSoup

html_doc = '<div class="z1s6m00 _1hbhsw64u _1hbhsw650"><div class="z1s6m00 rqoqz3" aria-hidden="false"><div class="z1s6m00 _1hbhsw66u _1hbhsw67y _1hbhsw69q _1hbhsw68m rqoqz2"><dl class="z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih"><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Specializations</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Accounting/Finance " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Accounting/Finance</a></dd> / <dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/general-cost-accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to General/Cost Accounting in Accounting/Finance" class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">General/Cost Accounting</a></dd><div class="z1s6m00 _1hbhsw66u _1hbhsw67y"><span class="z1s6m00 _1hbhsw64y _1hbhsw65e"><span class="z1s6m00 _1hbhsw65i _1hbhsw6p _14n5q8n0 _14n5q8n3 _14n5q8n5 _14n5q8n8"></span></span></div><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Type</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/temporary-jobs/" data-automation="jobCardJobTypeLink" title="Limit results to Temporary " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Temporary</a></dd></dl></div></div></div><div class="z1s6m00 _1hbhsw64u _1hbhsw650"><div class="z1s6m00 rqoqz3" aria-hidden="false"><div class="z1s6m00 _1hbhsw66u _1hbhsw67y _1hbhsw69q _1hbhsw68m rqoqz2"><dl class="z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih"><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Specializations</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/hotel-restaurant-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Hotel/Restaurant " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Hotel/Restaurant</a></dd> / <dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/hotel-management-tourism-services-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Hotel/Tourism in Hotel/Restaurant" class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Hotel/Tourism</a></dd><div class="z1s6m00 _1hbhsw66u _1hbhsw67y"><span class="z1s6m00 _1hbhsw64y _1hbhsw65e"><span class="z1s6m00 _1hbhsw65i _1hbhsw6p _14n5q8n0 _14n5q8n3 _14n5q8n5 _14n5q8n8"></span></span></div><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Type</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/full-time-jobs/" data-automation="jobCardJobTypeLink" title="Limit results to Full-Time " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Full-Time</a></dd></dl></div></div></div>'

soup = BeautifulSoup(html_doc, 'html.parser')

job_specializations = []
job_types = []

for dt in soup.find_all('dt', {'class': '_1hbhsw6c6'}):
    strong = dt.find('strong', {'class': 'y44q7i3'})
    if strong and strong.text.strip() == 'Job Specializations':
        for a in dt.find_next('dd').find_all('a'):
            job_specializations.append(a.text.strip())
    elif strong and strong.text.strip() == 'Job Type':
        for a in dt.find_next('dd').find_all('a'):
            job_types.append(a.text.strip())

print('Job Specializations:', job_specializations)
print('Job Types:', job_types)

Job Specializations: ['Accounting/Finance', 'Hotel/Restaurant']
Job Types: ['Temporary', 'Full-Time']


In [None]:
from bs4 import BeautifulSoup

# Sample HTML elements
html_element_1 = '<div class="z1s6m00 _1hbhsw64u _1hbhsw650"> ... </div>'
html_element_2 = '<div class="z1s6m00 _1hbhsw64u _1hbhsw650"> ... </div>'

# Create a Beautiful Soup object for each HTML element
soup_1 = BeautifulSoup(html_element_1, 'html.parser')
soup_2 = BeautifulSoup(html_element_2, 'html.parser')

# Extract job specialization and job type from each HTML element
job_spec_1 = soup_1.find('dt', text='Job Specializations').find_next_sibling('dd').text.strip()
job_type_1 = soup_1.find('dt', text='Job Type').find_next_sibling('dd').text.strip()

job_spec_2 = soup_2.find('dt', text='Job Specializations').find_next_sibling('dd').text.strip()
job_type_2 = soup_2.find('dt', text='Job Type').find_next_sibling('dd').text.strip()

# Print the results
print(f"Job specialization 1: {job_spec_1}")
print(f"Job type 1: {job_type_1}\n")
print(f"Job specialization 2: {job_spec_2}")
print(f"Job type 2: {job_type_2}")

In [25]:
# assume that the HTML code is stored in the variable 'html'
html = '<div class="z1s6m00 _1hbhsw64u _1hbhsw650"><div class="z1s6m00 rqoqz3" aria-hidden="false"><div class="z1s6m00 _1hbhsw66u _1hbhsw67y _1hbhsw69q _1hbhsw68m rqoqz2"><dl class="z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih"><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Specializations</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Accounting/Finance " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Accounting/Finance</a></dd> / <dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/general-cost-accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to General/Cost Accounting in Accounting/Finance" class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">General/Cost Accounting</a></dd><div class="z1s6m00 _1hbhsw66u _1hbhsw67y"><span class="z1s6m00 _1hbhsw64y _1hbhsw65e"><span class="z1s6m00 _1hbhsw65i _1hbhsw6p _14n5q8n0 _14n5q8n3 _14n5q8n5 _14n5q8n8"></span></span></div><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Type</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/temporary-jobs/" data-automation="jobCardJobTypeLink" title="Limit results to Temporary " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Temporary</a></dd></dl></div></div></div><div class="z1s6m00 _1hbhsw64u _1hbhsw650"><div class="z1s6m00 rqoqz3" aria-hidden="false"><div class="z1s6m00 _1hbhsw66u _1hbhsw67y _1hbhsw69q _1hbhsw68m rqoqz2"><dl class="z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih"><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Specializations</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/hotel-restaurant-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Hotel/Restaurant " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Hotel/Restaurant</a></dd> / <dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/hotel-management-tourism-services-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Hotel/Tourism in Hotel/Restaurant" class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Hotel/Tourism</a></dd><div class="z1s6m00 _1hbhsw66u _1hbhsw67y"><span class="z1s6m00 _1hbhsw64y _1hbhsw65e"><span class="z1s6m00 _1hbhsw65i _1hbhsw6p _14n5q8n0 _14n5q8n3 _14n5q8n5 _14n5q8n8"></span></span></div><dt class="z1s6m00 _1hbhsw6c6" style="color: rgb(46, 56, 73);"><strong class="y44q7i3">Job Type</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/full-time-jobs/" data-automation="jobCardJobTypeLink" title="Limit results to Full-Time " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Full-Time</a></dd></dl></div></div></div>'
# create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# find all the <div> elements with class "z1s6m00 _1hbhsw64u _1hbhsw650"
job_divs = soup.find_all('div', class_='z1s6m00 _1hbhsw64u _1hbhsw650')

# iterate over the job_divs and extract the job specialization and job type
for div in job_divs:
    dl = div.find('dl', class_='z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih')
    if dl is not None:
        spec_dt = dl.find('dt', text='Job Specializations')
        spec_dd = spec_dt.find_next('dd')
        specialization = spec_dd.text.strip()

        type_dt = dl.find('dt', text='Job Type')
        type_dd = type_dt.find_next('dd')
        job_type = type_dd.text.strip()

        print('Job Specialization:', specialization)
        print('Job Type:', job_type)

Job Specialization: Accounting/Finance
Job Type: Temporary
Job Specialization: Hotel/Restaurant
Job Type: Full-Time


  spec_dt = dl.find('dt', text='Job Specializations')
  type_dt = dl.find('dt', text='Job Type')


In [61]:
html_element = '<div class="z1s6m00 _1hbhsw64u _1hbhsw650"><div class="z1s6m00 rqoqz3" aria-hidden="false"><div class="z1s6m00 _1hbhsw66u _1hbhsw67y _1hbhsw69q _1hbhsw68m rqoqz2"><dl class="z1s6m00 _1hbhsw64y y44q7i0 y44q7i1 y44q7i22 y44q7ih"><dt class="z1s6m00 _1hbhsw6c6" style="color:#2E3849"><strong class="y44q7i3">Job Specializations</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to Accounting/Finance " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Accounting/Finance</a></dd> / <dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/general-cost-accounting-finance-jobs/" data-automation="jobCardCategoryLink" title="Limit results to General/Cost Accounting in Accounting/Finance" class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">General/Cost Accounting</a></dd><div class="z1s6m00 _1hbhsw66u _1hbhsw67y"><span class="z1s6m00 _1hbhsw64y _1hbhsw65e"><span class="z1s6m00 _1hbhsw65i _1hbhsw6p _14n5q8n0 _14n5q8n3 _14n5q8n5 _14n5q8n8"></span></span></div><dt class="z1s6m00 _1hbhsw6c6" style="color:#2E3849"><strong class="y44q7i3">Job Type</strong></dt><dd class="z1s6m00 _1hbhsw652"><a href="/en/job-search/temporary-jobs/" data-automation="jobCardJobTypeLink" title="Limit results to Temporary " class="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h">Temporary</a></dd></dl></div></div></div>'

soup = BeautifulSoup(html_element, 'html.parser')
dd_tags = soup.find_all('dd', class_='_1hbhsw652')
job_info = []
for tag in dd_tags:
  job_info.append(tag.text)
print(job_info)

['Accounting/Finance', 'General/Cost Accounting', 'Temporary']
