In [1]:
import urllib
import requests
from bs4 import BeautifulSoup 
import pandas as pd



Web scraping can be divided into a few steps:

1. Request the source code/content of a page from a server
2. Download the response (usually HTML)
3. Parse the downloaded information to identify and extract the information we need

Uncomment the following to see what the main element looks like:

In [2]:
# url = 'https://malaysia.indeed.com/jobs?q=data+scientist&l=Kuala+Lumpur' # if I were to search for data scientist roles in Kuala Lumpur
# page = requests.get(url) 
# soup = BeautifulSoup(page.content, 'html.parser')
# results = soup.find(id='resultsCol') 
# print(results.prettify())  # use Prettify so the logged content is easier to read

Define a function so that I can search for any role at any location.

In [3]:
def get_indeed_jobs(job_title, location, max_pages=5, output_path='jobs.csv'):
  """Scrape job listings from malaysia.indeed.com and save them to a CSV file.

  Requests up to ``max_pages`` result pages for the given search, extracts the
  title, company and location of every job card found, removes duplicated
  rows, prints the resulting DataFrame and writes it to ``output_path``.

  Args:
    job_title: Search keywords, sent as the ``q`` query parameter.
    location: Search location, sent as the ``l`` query parameter.
    max_pages: Number of result pages to request (default 5).
    output_path: Destination of the CSV file (default ``'jobs.csv'``).

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeouts.
  """
  query = urllib.parse.urlencode({'q': job_title, 'l': location})

  # Separate names from the `job_title` / `location` parameters so the search
  # terms are never shadowed by the scraped values.
  titles = []
  companies = []
  locations = []

  def tag_text(tag):
    # `find` returns None when a card lacks the element; keep the row and
    # record a missing value instead of crashing on `.text`.
    return tag.text if tag is not None else None

  # The `start` offset advances by 10 per request (presumably 10 results per
  # page — verify against the live site).
  for start in range(0, max_pages * 10, 10):
    url = f'https://malaysia.indeed.com/jobs?{query}&start={start}'
    page = requests.get(url, timeout=30)  # request and download the page
    page.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')  # parse the HTML

    results = soup.find(id='resultsCol')  # main element holding the job cards
    if results is None:
      # No results container on this page (no more results, or the page
      # layout differs): stop paginating and keep what was collected so far.
      break

    for job_card in results.find_all('a', class_='tapItem'):
      titles.append(tag_text(job_card.find('span', class_='')))
      companies.append(tag_text(job_card.find('span', class_='companyName')))
      locations.append(tag_text(job_card.find('div', class_='companyLocation')))

  job_data = pd.DataFrame({'Title': titles, 'Company': companies, 'Location': locations})
  # drop_duplicates returns a new DataFrame; the result must be assigned,
  # otherwise the duplicated rows are still printed and saved.
  job_data = job_data.drop_duplicates().reset_index(drop=True)
  print(job_data)
  job_data.to_csv(output_path, index=False)  # save the dataframe to a csv file

In [4]:
get_indeed_jobs('data science intern', 'Kuala Lumpur') # example: search for data science internship opportunities in Kuala Lumpur

                                                Title  ...                Location
0   Intern, Data Analysis & Digitalization, SC Exc...  ...            Kuala Lumpur
1   University Internships - Finance & Accounting ...  ...            Kuala Lumpur
2      Intern, SQL Data Management (Finance Services)  ...                 Bangsar
3                                  Intern - Marketing  ...            Kuala Lumpur
4                                              Intern  ...            Kuala Lumpur
..                                                ...  ...                     ...
61           Product Management Intern (Kuala Lumpur)  ...  Remote in Kuala Lumpur
62                                Data Science Intern  ...  Remote in Kuala Lumpur
63                          Intern Research Assistant  ...              Mont Kiara
64                                      Physio Intern  ...         Mid Valley City
65        Software Engineer (Fullstack - Python & JS)  ...            Kuala Lumpur

[66