# Scraping data from Indeed.com

In [1]:
# import libraries

import pandas as pd 
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

# setting the webdriver
    to allow us to interact with the web browser page 

In [2]:
# create an intance ChromeOptions to configure various options for the driver
options = webdriver.ChromeOptions()

# add my user-agent to mimic the chrome webriver we created
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
options.add_argument(f'user_agent = {user_agent}')

# setting the webdriver
driver = webdriver.Chrome(options = options)



# url of the website we want to scrape
url = 'https://eg.indeed.com/jobs?q=data%20analyst&l=Cairo'

# navigate the url using the driver to begin scraping
driver.get(url)
sleep(5)

# Using BeautifulSoup to parse the html source script

In [3]:
# extract the html source
src = driver.page_source

# parse it
soup = BeautifulSoup(src, 'html.parser')

# extracting specific info
    like:
    - job title
    - company name
    - company location
    - Job summary
    - Job posted date

##### Finding all job postings

In [4]:
jobs = soup.find_all('div', {'class':'job_seen_beacon'})
len(jobs)

15

##### 1st job post

In [5]:
first = jobs[0]
#first

##### setting an empty list to fill it later

In [6]:
records = []

#### Extract the job title

In [7]:
job_title = first.a.span.get('title')
job_title

'Data Analyst'

#### Extract the company name

In [8]:
company_name = first.find('div', 'css-1qv0295 e37uo190').text.strip()
company_name

'Kotn'

#### Extract the company's location

In [9]:
company_loc = first.find('div', 'css-1p0sjhy eu4oa1w0').text.strip()
company_loc

'Cairo'

#### Extract the job summary

In [10]:
summary = first.find('div', 'underShelfFooter').ul.text.replace('\n',' ').strip()
summary

'Perform data entry tasks to input new data and update existing records. The Data Analyst will work closely with cross-functional teams to gather required data…'

#### Extract the job posted date

In [11]:
Date_JobPosted = first.find('div', 'underShelfFooter').find('span', 'css-qvloho eu4oa1w0').find_all(text=True, recursive=False)[-1]
Date_JobPosted

  Date_JobPosted = first.find('div', 'underShelfFooter').find('span', 'css-qvloho eu4oa1w0').find_all(text=True, recursive=False)[-1]


'Posted 30+ days ago'

#### Extract the job link

In [13]:
job_link = first.a.get('href')
job_link

'/rc/clk?jk=dee90d9d2570a0cd&bb=y2uMcNqrrU1oAXGTd6rLJdGrozpB9LEgvMclSuLTgxnR4wBzAPg9sG3po6i-dQ7syeCLqa0nHpMW_IfHiXrt1d_G38QVIRMu1yxrx1ikxLfGGBjg9ip_4Cr2vVdJoCZ_&xkcb=SoCr67M39gaFxBW2lR0LbzkdCdPP&fccid=20febb5c80e54eb1&vjs=3'

# Generalize all extracted data in a function

In [14]:
def get_records(first):
    # job title
    job_title = first.a.span.get('title')

    # company_name
    try:
        company_name = first.find('div', 'css-1qv0295 e37uo190').text.strip()
    except AttributeError:
        company_name = 'confidential'
        
    # company location
    try:
        company_loc = first.find('div', 'css-1p0sjhy eu4oa1w0').text.strip()
    except AttributeError:
        company_loc = 'confidential'
        
    # job summary
    try:
        summary = first.find('div', 'underShelfFooter').ul.text.replace('\n',' ').strip()
    except AttributeError:
        summary = 'Not Found'
        
    # job postes date  
    try:
        Date_JobPosted = first.find('div', 'underShelfFooter').find('span', 'css-qvloho eu4oa1w0').find_all(text=True, recursive=False)[-1]
    except AttributeError:
        Date_JobPosted = 'Not Found'
    
    # job link
    try:
        job_link = first.a.get('href')
    except AttributeError:
        job_link = 'Not Found'
        
        
    
    return (job_title, company_name, company_loc, summary, Date_JobPosted, job_link)


get_records(first)

  Date_JobPosted = first.find('div', 'underShelfFooter').find('span', 'css-qvloho eu4oa1w0').find_all(text=True, recursive=False)[-1]


('Data Analyst',
 'Kotn',
 'Cairo',
 'Perform data entry tasks to input new data and update existing records. The Data Analyst will work closely with cross-functional teams to gather required data…',
 'Posted 30+ days ago',
 '/rc/clk?jk=dee90d9d2570a0cd&bb=y2uMcNqrrU1oAXGTd6rLJdGrozpB9LEgvMclSuLTgxnR4wBzAPg9sG3po6i-dQ7syeCLqa0nHpMW_IfHiXrt1d_G38QVIRMu1yxrx1ikxLfGGBjg9ip_4Cr2vVdJoCZ_&xkcb=SoCr67M39gaFxBW2lR0LbzkdCdPP&fccid=20febb5c80e54eb1&vjs=3')

# Extracting data from multiple pages with while loop

In [15]:
# setting the link of pagination
while True:
    try:
        url = 'https://eg.indeed.com/' + soup.find('a', {'aria-label':'Next Page'}).get('href')
    except AttributeError:
        break
        
    # to acces the url that we'll use for iteration
    driver.get(url)
    
    # getting the html src again for the next iterable pages
    src = driver.page_source

    # parse the new html src
    soup = BeautifulSoup(src, 'html.parser')
    
    # find all job postings
    jobs = soup.find_all('div', {'class':'job_seen_beacon'})

    
    # begin iteration
    for first in jobs:
        record = get_records(first)
        
        records.append(record)
        

# lastly quit the driver :)
driver.quit()

  Date_JobPosted = first.find('div', 'underShelfFooter').find('span', 'css-qvloho eu4oa1w0').find_all(text=True, recursive=False)[-1]


# Save the scrapted data into a csv  file

In [16]:
# converts list of records into a dataframe
df = pd.DataFrame(records, columns=['job_title', 'company_name', 'company_loc', 'summary', 'Date_JobPosted', 'job_link'])  

# save the dataframe into a csv file
df.to_csv('indeed_ScrapedData.csv', index=False)

print("csv saved!")

csv saved!


In [17]:
df

Unnamed: 0,job_title,company_name,company_loc,summary,Date_JobPosted,job_link
0,Senior Data Analyst,Nawy Real Estate,Maadi,Familiarity with data warehouse development an...,Posted 9 days ago,/rc/clk?jk=04231f8f9ac86d40&bb=MnrglTZx347eUYR...
1,Financial Analyst,HES world,Cairo,Prepares reports and analyzes financial data i...,Posted 30+ days ago,/rc/clk?jk=b8c26c745ddbe471&bb=MnrglTZx347eUYR...
2,Workforce Analyst,Majorel Egypt,Cairo,Responsible for reporting data management acti...,Posted 30+ days ago,/rc/clk?jk=384d20e20a0e5259&bb=MnrglTZx347eUYR...
3,Senior Analyst - Data & Analytics - Cairo Airp...,Chalhoub Group,Nasr City,Support analysts and business users across the...,Posted 23 days ago,/rc/clk?jk=e7ed07a200269420&bb=MnrglTZx347eUYR...
4,Senior Data Analyst,Procore Technologies,Cairo,We'd love to hear from you if you are a season...,Active 3 days ago,/rc/clk?jk=19c32bbae44cf06e&bb=MnrglTZx347eUYR...
...,...,...,...,...,...,...
220,SAP FICO Consultant,IBM,Cairo,Experienced in trouble shooting system & data ...,Posted 30+ days ago,/rc/clk?jk=746489eeec598b38&bb=_R6ha8Zlz9WNb7t...
221,Datacenter Transformation Architect,Capgemini,New Cairo,Collaborate with financial analysts to ensure ...,Posted 30+ days ago,/rc/clk?jk=4bfeac5f10df87b2&bb=_R6ha8Zlz9WNb7t...
222,Innovation Hub I Cloud Engineering Senior Tech...,Deloitte,Cairo,Experience with data streaming technologies. W...,Posted 30+ days ago,/rc/clk?jk=e70718784d310adb&bb=_R6ha8Zlz9WNb7t...
223,"Innovation Hub I Senior Cloud Engineer, Site R...",Deloitte,Cairo,Experience with data streaming technologies. W...,Posted 30+ days ago,/rc/clk?jk=7a4322674aad267b&bb=_R6ha8Zlz9WNb7t...
