In [1]:
import requests
import re
import pandas as pd
from time import perf_counter
from functools import wraps
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

In [2]:
headers = {'Accept': 'text/html',
           'Accept-Language': 'en-US',
           'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15 Version/13.0.4',
           'Referer': 'http://www.google.com/'}

def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        total_time = end_time - start_time
        print(f'Function {func.__name__} Took {total_time:.4f} seconds - {args} {kwargs}')
        return result
    return timeit_wrapper


### Every linkedin job search page carries 25 jobs

In [24]:
@timeit
def get_job_links(keyword, start_page, pages):    
    def custom_selector(tag):
        return tag.name == "a" and tag.has_attr("href") and keyword in tag.get('href')
    title = re.sub(' ', '%20', keyword)
    keyword = re.sub(' ', '-', keyword) # This is used inside custom_selector's scope   
    print(f'Searching for {keyword}')
    job_links = []
    position = start_page
    try:
        for page in tqdm(range(pages)):
            # url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&distance=25&geoId=102454443&keywords={title}&origin=JOB_SEARCH_PAGE_KEYWORD_HISTORY&refresh=true&start={position}"
            # url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&distance=25&geoId=102454443&keywords={title}&currentJobId=3415227738&position=1&pageNum=0&start={position}" 
            url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&geoId=102454443&keywords={title}&location=Singapore&origin=JOB_SEARCH_PAGE_SEARCH_BUTTON&refresh=true&start={position}"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text,'html.parser')
            tags = soup.find_all(custom_selector)
            for tag in tags:
                job_links.append(tag.get('href'))
            position += 25
    except Exception as e:
        print(f'Error at page {page}')
        print(e)
    finally:
        return job_links

In [10]:
# @timeit
def get_job_info(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text,'html.parser')
    info = {}

    # Page title
    title = soup.find('title')
    if title:
        info['company'] = title.text.split(' hiring')[0]

    # Job title
    job_title = soup.find('h1')
    if job_title:
        info['job_title'] = job_title.text

    # Job level, type (full time etc), sector
    criteria = soup.find_all('span', class_="description__job-criteria-text description__job-criteria-text--criteria")
    if criteria:
        criteria = [x.text.strip(' \n') for x in criteria]
        try:
            info['level'] = criteria[0]
            info['job_type'] = criteria[1]
            info['industry1'] = criteria[2]
            info['industry2'] = criteria[3]
        except Exception as e:
            print(e, criteria)

    # Job scope and requirements
    descriptions = soup.find(class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden")
    if descriptions:
        descriptions = [li.text for li in descriptions.find_all('li')]
        info['descriptions'] = descriptions
        for desc in descriptions:
            if 'experience' in desc:
                info['experience'] = desc   
            if ('PySpark' in desc) or ('Spark' in desc):
                info['spark'] = desc 

    info['link'] = url            
    
    return info

In [25]:
links = get_job_links('data analyst', start_page=0, pages=20)
print(len(links))

Searching for data-analyst


100%|██████████| 20/20 [00:23<00:00,  1.18s/it]

Function get_job_links Took 23.5748 seconds - ('data analyst',) {'start_page': 0, 'pages': 20}
342





In [26]:
main = {}
for index, link in tqdm(enumerate(links), total = len(links), dynamic_ncols =True):
    #print(f'Link {index}', end=' - ')
    main[index] = get_job_info(link)

  6%|▋           | 21/342 [00:24<07:19,  1.37s/it]

list index out of range


 13%|█▌          | 43/342 [00:50<05:40,  1.14s/it]

list index out of range


 93%|██████████▎| 319/342 [06:06<00:27,  1.21s/it]

list index out of range


100%|██████████▉| 341/342 [06:33<00:01,  1.03s/it]

list index out of range


100%|███████████| 342/342 [06:34<00:00,  1.15s/it]


In [27]:
df = pd.DataFrame.from_dict(main, orient='index')
df = df[~(df['company'].isnull() | df['job_title'].isnull() | df['level'].isnull())]
df = df.drop_duplicates(subset=['link'])
df = df[['company', 'job_title', 'level', 'job_type', 'experience', 'spark', 'descriptions', 'industry1', 'industry2', 'link']]
df = df.sort_values(by=['level', 'spark', 'company', 'job_type'],
                    ascending= [True, False, True, True])
df


Unnamed: 0,company,job_title,level,job_type,experience,spark,descriptions,industry1,industry2,link
72,Shopee,"Data Analyst - Marketing Analytics, Regional B...",Associate,Full-time,,(Strong plus but not required) Proficiency in ...,[Provide data and insight support for specific...,"Analyst, Marketing, and Information Technology","Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
120,Shopee,"Data Analyst - Marketing Analytics, Regional B...",Associate,Full-time,,(Strong plus but not required) Proficiency in ...,[Provide data and insight support for specific...,"Analyst, Marketing, and Information Technology","Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
210,Shopee,"Data Analyst - Marketing Analytics, Regional B...",Associate,Full-time,,(Strong plus but not required) Proficiency in ...,[Provide data and insight support for specific...,"Analyst, Marketing, and Information Technology","Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
259,Shopee,"Data Analyst - Marketing Analytics, Regional B...",Associate,Full-time,,(Strong plus but not required) Proficiency in ...,[Provide data and insight support for specific...,"Analyst, Marketing, and Information Technology","Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
81,HCLTech,Data Analyst (SQL/Snowflake),Associate,Full-time,At least 3 years of data-related working exper...,,"[Working closely with business end-users, mark...",Information Technology,IT Services and IT Consulting,https://sg.linkedin.com/jobs/view/data-analyst...
...,...,...,...,...,...,...,...,...,...,...
144,Glints,Business Analyst/ Data Analyst Intern,Not Applicable,Internship,,,[The mission of this role is to solve strategi...,"Research, Analyst, and Information Technology","Transportation, Logistics, Supply Chain and St...",https://sg.linkedin.com/jobs/view/business-ana...
188,Glints,Business Analyst/ Data Analyst Intern,Not Applicable,Internship,,,[The mission of this role is to solve strategi...,"Research, Analyst, and Information Technology","Transportation, Logistics, Supply Chain and St...",https://sg.linkedin.com/jobs/view/business-ana...
282,Glints,Business Analyst/ Data Analyst Intern,Not Applicable,Internship,,,[The mission of this role is to solve strategi...,"Research, Analyst, and Information Technology","Transportation, Logistics, Supply Chain and St...",https://sg.linkedin.com/jobs/view/business-ana...
305,Glints,Business Analyst/ Data Analyst Intern,Not Applicable,Internship,,,[The mission of this role is to solve strategi...,"Research, Analyst, and Information Technology","Transportation, Logistics, Supply Chain and St...",https://sg.linkedin.com/jobs/view/business-ana...


In [28]:
df.to_excel(f"data-analyst_{datetime.now().strftime('%Y-%m-%d-%M%S')}.xlsx", engine='xlsxwriter')