# Simple Job scraping file

*Scrape job details from the Indeed website.*

### Github: https://github.com/raydiwill/Job-Scraper


## Import prerequisite packages

In [1]:
# Import prerequisite packages
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd

## Create soup object from url

In [2]:
# Function to create the soup object from the url.
def extract_page(page, timeout=30):
    """Download one Indeed search-results page and parse it.

    Args:
        page: Value substituted into the URL's ``start`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # The search query ("data analyst") and location ("Hà Nội") are fixed in the
    # URL; only the `start` value changes between calls.
    url = f'https://vn.indeed.com/Vi%E1%BB%87c-l%C3%A0m?q=data%20analyst&l=H%C3%A0%20N%E1%BB%99i&start={page}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it contained no jobs
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

## Extract the desired information

In [3]:
# Function to scrape data from html texts.
def extract_content(soup, records_out=None):
    """Build one record per job card found in a parsed results page.

    Every ``<a class="tapItem">`` element in ``soup`` is treated as one job
    card. For each card a 7-tuple is produced:
    ``(title, company, location, post_date, today, salary, description)``.
    A field whose tag is absent from the card is stored as an empty string,
    so a single incomplete card no longer aborts the whole scrape.

    Args:
        soup: Parsed page, as returned by ``extract_page``.
        records_out: Optional list to extend with the new records. When
            omitted, the module-level ``records`` list is extended.

    Returns:
        The list of records extracted from this page (empty if the page
        contains no job cards).
    """
    sink = records if records_out is None else records_out

    def text_of(tag):
        # find() returns None when a card lacks the tag; use '' in that case
        return tag.text if tag is not None else ''

    # The extraction date is the same for every card on the page
    today = datetime.today().strftime('%Y-%m-%d')

    extracted = []
    # Each job is wrapped inside an a tag, within the tag contains job details
    for job_element in soup.find_all("a", class_="tapItem"):
        title_tag = job_element.find("h2", class_="jobTitle")
        title = text_of(title_tag)

        # Some jobs carry an extra label (e.g. marking them as new) in sub-tags
        # ahead of the title. When the heading has more than one descendant,
        # the title is taken from the second <span> following the heading.
        if title_tag is not None and len(title_tag.findChildren()) > 1:
            label_span = title_tag.find_next("span")
            title_span = label_span.find_next("span") if label_span is not None else None
            if title_span is not None:
                title = title_span.text

        # Create a tuple to store the data
        record = (
            title,
            text_of(job_element.find("span", class_="companyName")),
            text_of(job_element.find("div", class_="companyLocation")),
            text_of(job_element.find('span', 'date')),
            today,
            # Some jobs may post the salary detail, others may not
            text_of(job_element.find("div", class_="salary-snippet")).strip(),
            text_of(job_element.find("div", class_="job-snippet")).strip().replace('\n', ' '),
        )
        extracted.append(record)

    sink.extend(extracted)
    return extracted

## Scrape data in each page

In [4]:
# Shared list that extract_content() appends every scraped job record to
records = []

# Fetch a fixed number of result pages: the URL's `start` offset goes 0, 10, ..., 50
for i in range(0, 60, 10):
    print(f'Scraping job details from page {i // 10 + 1}')
    # Pass the current offset; a constant 0 would scrape the first page every time
    page = extract_page(i)
    extract_content(page)

print("Process finished! Create a dataframe to store the results")

Scraping job details from page 1
Scraping job details from page 2
Scraping job details from page 3
Scraping job details from page 4
Scraping job details from page 5
Scraping job details from page 6
Process finished! Create a dataframe to store the results


## Create dataframe to store results and export to csv file

In [5]:
# Convert the list of records into a data frame for easier viewing and CSV export.
# The column names are passed to the constructor so this also works when nothing
# was scraped (assigning df.columns afterwards fails on an empty frame).
df = pd.DataFrame(
    records,
    columns=['JobTitle', 'Company', 'Location', 'DatePosted', 'DateExtracted', 'Salary', 'Description'],
)

# Write to csv file
df.to_csv("results.csv")

# Preview the first rows
df.head()

                            JobTitle  \
0                       Data Analyst   
1  Fresh Business Analyst IT Project   
2                  Data Analyst (HN)   
3                       DATA ANALYST   
4                       Data Analyst   

                                             Company Location  \
0  Công Ty Cổ Phần Truyền Thông Và Giải Trí HG Media   Hà Nội   
1                      Công TY TNHH Tripath Việt Nam   Hà Nội   
2                          Viettel Cyberspace Center   Hà Nội   
3                                          VMG Media   Hà Nội   
4                              Chứng khoán Kỹ Thương   Hà Nội   

             DatePosted DateExtracted                    Salary  \
0    Posted1 ngày trước    2022-01-21  15.000.000 VNĐ một tháng   
1   Posted14 ngày trước    2022-01-21  10.000.000 VNĐ một tháng   
2  Posted30+ ngày trước    2022-01-21                             
3   Posted10 ngày trước    2022-01-21                             
4  Posted30+ ngày trước    2022-0

## The entire code

In [7]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd

def extract_page(page, timeout=30):
    """Download one Indeed search-results page and parse it.

    Args:
        page: Value substituted into the URL's ``start`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # The search query ("data analyst") and location ("Hà Nội") are fixed in the
    # URL; only the `start` value changes between calls.
    url = f'https://vn.indeed.com/Vi%E1%BB%87c-l%C3%A0m?q=data%20analyst&l=H%C3%A0%20N%E1%BB%99i&start={page}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it contained no jobs
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")

def extract_content(soup, records_out=None):
    """Build one record per job card found in a parsed results page.

    Every ``<a class="tapItem">`` element in ``soup`` is treated as one job
    card. For each card a 7-tuple is produced:
    ``(title, company, location, post_date, today, salary, description)``.
    A field whose tag is absent from the card is stored as an empty string,
    so a single incomplete card no longer aborts the whole scrape.

    Args:
        soup: Parsed page, as returned by ``extract_page``.
        records_out: Optional list to extend with the new records. When
            omitted, the module-level ``records`` list is extended.

    Returns:
        The list of records extracted from this page (empty if the page
        contains no job cards).
    """
    sink = records if records_out is None else records_out

    def text_of(tag):
        # find() returns None when a card lacks the tag; use '' in that case
        return tag.text if tag is not None else ''

    # The extraction date is the same for every card on the page
    today = datetime.today().strftime('%Y-%m-%d')

    extracted = []
    for job_element in soup.find_all("a", class_="tapItem"):
        title_tag = job_element.find("h2", class_="jobTitle")
        title = text_of(title_tag)

        # Some jobs carry an extra label (e.g. marking them as new) in sub-tags
        # ahead of the title. When the heading has more than one descendant,
        # the title is taken from the second <span> following the heading.
        if title_tag is not None and len(title_tag.findChildren()) > 1:
            label_span = title_tag.find_next("span")
            title_span = label_span.find_next("span") if label_span is not None else None
            if title_span is not None:
                title = title_span.text

        record = (
            title,
            text_of(job_element.find("span", class_="companyName")),
            text_of(job_element.find("div", class_="companyLocation")),
            text_of(job_element.find('span', 'date')),
            today,
            # Salary is optional on a posting
            text_of(job_element.find("div", class_="salary-snippet")).strip(),
            text_of(job_element.find("div", class_="job-snippet")).strip().replace('\n', ' '),
        )
        extracted.append(record)

    sink.extend(extracted)
    return extracted

# Shared list that extract_content() appends every scraped job record to
records = []

# Fetch a fixed number of result pages: the URL's `start` offset goes 0, 10, ..., 50
for i in range(0, 60, 10):
    print(f'Scraping job details from page {i // 10 + 1}')
    # Pass the current offset; a constant 0 would scrape the first page every time
    page = extract_page(i)
    extract_content(page)

print("Process finished! Create a dataframe to store the results")

# Naming the columns in the constructor also works when no records were scraped
# (assigning df.columns afterwards fails on an empty frame).
df = pd.DataFrame(
    records,
    columns=['JobTitle', 'Company', 'Location', 'DatePosted', 'DateExtracted', 'Salary', 'Description'],
)
print(df.head())
df.to_csv("results.csv")

Scraping job details from page 1
Scraping job details from page 2
Scraping job details from page 3
Scraping job details from page 4
Scraping job details from page 5
Scraping job details from page 6
Process finished! Create a dataframe to store the results
                            JobTitle  \
0                       Data Analyst   
1  Fresh Business Analyst IT Project   
2                  Data Analyst (HN)   
3                       DATA ANALYST   
4                       Data Analyst   

                                             Company Location  \
0  Công Ty Cổ Phần Truyền Thông Và Giải Trí HG Media   Hà Nội   
1                      Công TY TNHH Tripath Việt Nam   Hà Nội   
2                          Viettel Cyberspace Center   Hà Nội   
3                                          VMG Media   Hà Nội   
4                              Chứng khoán Kỹ Thương   Hà Nội   

             DatePosted DateExtracted                    Salary  \
0    Posted1 ngày trước    2022-01-21  15.000