In [2]:
import requests
import re
import pandas as pd
import os
from time import perf_counter
from functools import wraps
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

### Wrappers and request headers

In [3]:
headers = {'Accept': 'text/html',
           'Accept-Language': 'en-US',
           'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15 Version/13.0.4',
           'Referer': 'http://www.google.com/'}

def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        total_time = end_time - start_time
        print(f'Function {func.__name__} Took {total_time:.4f} seconds - {args} {kwargs}')
        return result
    return timeit_wrapper


### Functions 

In [9]:
@timeit
def get_job_links(keyword, start_page, pages):    
    '''Every linkedin job search page carries 25 jobs'''
    def custom_selector(tag):
        return tag.name == "a" and tag.has_attr("href") and keyword in tag.get('href')
    title = re.sub(' ', '%20', keyword)
    keyword = re.sub(' ', '-', keyword) # This is used inside custom_selector's scope   
    print(f'Searching for {keyword}')
    job_links = []
    position = start_page
    currentJobId = None
    try:
        for page in tqdm(range(pages)):
            # url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&distance=25&geoId=102454443&keywords={title}&origin=JOB_SEARCH_PAGE_KEYWORD_HISTORY&refresh=true&start={position}"
            # url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&distance=25&geoId=102454443&keywords={title}&currentJobId=3415227738&position=1&pageNum=0&start={position}" 
            if not currentJobId:
                url = f"https://www.linkedin.com/jobs/search/?geoId=102454443&keywords={title}&location=Singapore&start={position}"
            else:
                url = f"https://www.linkedin.com/jobs/search/?currentJobId={currentJobId}&geoId=102454443&keywords={title}&location=Singapore&start={position}"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text,'html.parser')
            tags = soup.find_all(custom_selector)
            for tag in tags:
                link = tag.get('href')
                job_links.append(link)
            # Get the last jobid to start on the next page
            currentJobId = re.findall('-([0-9]{6,})\?', link)[0]
            position += 25
    except Exception as e:
        print(f'Error at page {page}')
        print(e)
    finally:
        return job_links

In [5]:
def get_job_info(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text,'html.parser')
    info = {}

    # Page title
    title = soup.find('title')
    if title:
        info['company'] = title.text.split(' hiring')[0]

    # Job title
    job_title = soup.find('h1')
    if job_title:
        info['job_title'] = job_title.text

    # Job level, type (full time etc), sector
    criteria = soup.find_all('span', class_="description__job-criteria-text description__job-criteria-text--criteria")
    if criteria:
        criteria = [x.text.strip(' \n') for x in criteria]
        try:
            info['level'] = criteria[0]
            info['job_type'] = criteria[1]
            info['industry1'] = criteria[2]
            info['industry2'] = criteria[3]
        except Exception as e:
            print(e, criteria)

    # Job scope and requirements
    descriptions = soup.find(class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden")
    if descriptions:
        descriptions = [li.text for li in descriptions.find_all('li')]
        line_by_line_desc = ''
        experience = ''
        spark = ''
        degree = ''
        for desc in descriptions:
            if 'experience' in desc:
                experience += desc + '\n'
            if ('PySpark' in desc) or ('Spark' in desc):
                spark += desc + '\n'
            if 'degree' in desc:
                degree += desc + '\n'
            line_by_line_desc += desc + '\n'
        info['descriptions'] = line_by_line_desc
        info['degree'] = degree
        info['experience'] = experience   
        info['spark'] = spark 

    info['link'] = url            
    
    return info

## Input job search keyword

In [23]:
keyword = 'data analyst'
pages = 5

In [24]:
links = get_job_links(keyword, start_page=0, pages=pages)
print('Retrieved links: ', len(links), '\n')
main = {}
print('Getting job info')
for index, link in tqdm(enumerate(links), total = len(links), dynamic_ncols =True):
    #print(f'Link {index}', end=' - ')
    main[index] = get_job_info(link)

Searching for data-analyst


100%|██████████| 5/5 [00:05<00:00,  1.13s/it]


Function get_job_links Took 5.6528 seconds - ('data analyst',) {'start_page': 0, 'pages': 5}
Retrieved links:  94 

Getting job info


  4%|▍         | 4/94 [00:05<02:08,  1.43s/it]

list index out of range ['Mid-Senior level', 'Full-time', 'Information Technology']


 24%|██▍       | 23/94 [00:27<01:27,  1.24s/it]

list index out of range ['Entry level', 'Full-time', 'Information Technology']


 54%|█████▍    | 51/94 [00:49<00:42,  1.01it/s]

list index out of range ['Mid-Senior level', 'Full-time', 'Information Technology']


100%|██████████| 94/94 [01:28<00:00,  1.07it/s]


In [25]:
df = pd.DataFrame.from_dict(main, orient='index')
df = df[~(df['company'].isnull() | df['job_title'].isnull() | df['level'].isnull())]
subset_duplicates = ['company', 'job_title', 'level', 'job_type', 'degree', 'experience', 'spark', 'descriptions', 'industry1']
df = df.drop_duplicates(subset=subset_duplicates)
df = df[['company', 'job_title', 'level', 'job_type', 'experience', 'spark', 'degree', 'descriptions', 'industry1', 'industry2', 'link']]
df = df.sort_values(by=['level', 'spark', 'company', 'job_type'],
                    ascending= [True, False, True, True])
print(len(df))
df.head()


42


Unnamed: 0,company,job_title,level,job_type,experience,spark,degree,descriptions,industry1,industry2,link
30,Shopee,"Data Analyst - Marketing Analytics, Regional B...",Associate,Full-time,,Develop smart and efficient solutions to repor...,,Provide data and insight support for specific ...,"Analyst, Marketing, and Information Technology","Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
85,HCLTech,Data Analyst (SQL/Snowflake),Associate,Full-time,At least 3 years of data-related working exper...,,Bachelor degree from a recognized tertiary ins...,"Working closely with business end-users, marke...",Information Technology,IT Services and IT Consulting,https://sg.linkedin.com/jobs/view/data-analyst...
61,SHIELD,Data Analyst (Risk),Associate,Full-time,3-5 years of experience as a hands-on analyst...,,,Analysis of rich user and transaction data to ...,Analyst,"Technology, Information and Internet",https://sg.linkedin.com/jobs/view/data-analyst...
43,SISTIC Singapore,Data Analyst,Associate,Full-time,At least 1-3 years of experience working in a ...,,,Delve into our diverse event portfolio and dev...,Information Technology,Entertainment Providers,https://sg.linkedin.com/jobs/view/data-analyst...
25,Starbucks Coffee Singapore,Data Analyst,Associate,Full-time,,,"At least a Bachelor degree in Statistics, Math...",Manage end-to-end data projects; identify issu...,Analyst and Information Technology,Food and Beverage Services,https://sg.linkedin.com/jobs/view/data-analyst...


In [26]:
df.to_excel(f"{keyword}_{datetime.now().strftime('%Y-%m-%d-%M%S')}.xlsx", engine='xlsxwriter')

In [27]:
excel_files = [file for file in os.listdir() if file.endswith('xlsx')]
for item in enumerate(excel_files):
    print(item)

(0, 'data analyst_2023-12-20-0245.xlsx')
(1, 'data engineer_2023-12-20-0033.xlsx')
(2, 'data scientist_2023-12-20-5814.xlsx')
(3, 'data-analyst_2023-12-20-0737.xlsx')
(4, 'data-analyst_2023-12-20-3727.xlsx')
(5, 'MAIN_2023-12-20.xlsx')


In [28]:
main_df = pd.DataFrame(columns = ['company', 'job_title', 'level', 'job_type', 'experience', 'spark', 'degree', 'descriptions', 'industry1', 'industry2', 'link'])

file_indices= [0,1,2,3,4]
for idx in file_indices:
    sub_df = pd.read_excel(excel_files[idx], index_col=0)
    main_df = pd.concat([main_df, sub_df])
    main_df = main_df.drop_duplicates(subset=subset_duplicates)

len(main_df)

111

In [29]:
main_df.to_excel(f"MAIN_{datetime.now().strftime('%Y-%m-%d')}.xlsx", engine='xlsxwriter')