In [54]:
import requests
import re
import pandas as pd
from time import perf_counter
from functools import wraps
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

In [8]:
# Request headers sent with every HTTP call in this notebook.
headers = {
    'Accept': 'text/html',
    'Accept-Language': 'en-US',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15 Version/13.0.4',
    'Referer': 'http://www.google.com/',
}

def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    After the wrapped function returns, one line is printed containing the
    function's name, the elapsed time in seconds (four decimal places) and the
    positional and keyword arguments of the call. The wrapped function's
    return value is passed through unchanged.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        started = perf_counter()
        outcome = func(*args, **kwargs)
        total_time = perf_counter() - started
        print(f'Function {func.__name__} Took {total_time:.4f} seconds - {args} {kwargs}')
        return outcome

    return timed_call


### Every LinkedIn job search page carries 25 jobs

In [10]:
@timeit
def get_job_links(keyword, start_page, pages):
    """Collect job-posting links from consecutive LinkedIn search result pages.

    Args:
        keyword: Search phrase, e.g. ``'data analyst'``. Spaces are encoded as
            ``%20`` for the query string and replaced with ``-`` to build the
            slug an anchor's ``href`` must contain to be kept.
        start_page: Zero-based index of the first result page to request.
        pages: Number of consecutive result pages to request.

    Returns:
        list: ``href`` values of every matching ``<a>`` tag, in request order.
        If a request or parsing step raises an ``Exception``, the error is
        printed and the links gathered so far are returned.
    """
    results_per_page = 25  # each search result page carries 25 jobs
    title = re.sub(' ', '%20', keyword)
    slug = re.sub(' ', '-', keyword)

    def custom_selector(tag):
        # Keep only anchors whose target contains the hyphenated keyword.
        return tag.name == "a" and tag.has_attr("href") and slug in tag.get('href')

    print(f'Searching for {slug}')
    job_links = []
    page = None  # keeps the error message valid if the failure precedes the first iteration
    try:
        for page in tqdm(range(pages)):
            # Advance the offset linearly, one page (25 results) per request.
            # The previous `position = position*25 + 1` update grew
            # geometrically (1, 26, 651, ...), so later requests pointed far
            # beyond the available results.
            # NOTE(review): the `+ 1` is kept from the original first request;
            # confirm whether the `start` parameter is zero-based.
            position = (start_page + page) * results_per_page + 1
            url = f"https://www.linkedin.com/jobs/search/?currentJobId=3693533935&distance=25&geoId=102454443&keywords={title}&origin=JOB_SEARCH_PAGE_KEYWORD_HISTORY&refresh=true&start={position}"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            job_links.extend(tag.get('href') for tag in soup.find_all(custom_selector))
    except Exception as e:
        print(f'Error at page {page}')
        print(e)
    # Return after the try block rather than from `finally`: a `return` inside
    # `finally` would also discard KeyboardInterrupt, making the cell
    # impossible to interrupt cleanly.
    return job_links

In [40]:
# @timeit
def get_job_info(url):
    """Download one job posting page and extract its key fields.

    Args:
        url: Address of the job posting page.

    Returns:
        dict: Always contains ``'link'`` (the given ``url``). The keys
        ``'title'``, ``'job_title'``, ``'level'``, ``'job_type'``,
        ``'industry1'``, ``'industry2'``, ``'descriptions'`` and
        ``'experience'`` are present only when the corresponding element was
        found on the page.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    info = {}

    # Page title: keep only the part before " hiring".
    title = soup.find('title')
    if title:
        info['title'] = title.text.split(' hiring')[0]

    # Job title
    job_title = soup.find('h1')
    if job_title:
        info['job_title'] = job_title.text

    # Job level, type (full time etc), sector.
    # zip() stops at the shorter sequence, so a posting that lists fewer than
    # four criteria fills only the fields it has instead of raising IndexError.
    criteria = soup.find_all('span', class_="description__job-criteria-text description__job-criteria-text--criteria")
    criteria_keys = ('level', 'job_type', 'industry1', 'industry2')
    for key, span in zip(criteria_keys, criteria):
        info[key] = span.text.strip(' \n')

    # Job scope and requirements: the text of every list item in the description.
    descriptions = soup.find(class_="show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden")
    if descriptions:
        descriptions = [li.text for li in descriptions.find_all('li')]
        info['descriptions'] = descriptions
        # When several items mention "experience", the last one is kept.
        for desc in descriptions:
            if 'experience' in desc:
                info['experience'] = desc

    info['link'] = url

    return info

In [51]:
# Request 10 search result pages for "data analyst", starting from the first
# page, and show how many links were collected.
links = get_job_links('data analyst', start_page=0, pages=10)
print(len(links))

Searching for data-analyst


100%|██████████| 10/10 [00:11<00:00,  1.11s/it]

Function get_job_links Took 11.1480 seconds - ('data analyst',) {'start_page': 0, 'pages': 10}
143





In [52]:
# Fetch the details of every collected link, one request per link.
# `main` maps each link's position in `links` to the dict returned by
# get_job_info for that link.
main = {}
for index, link in tqdm(enumerate(links), total = len(links)):
    # Optional per-link debug output (disabled):
    #print(f'Link {index}', end=' - ')
    main[index] = get_job_info(link)

100%|██████████| 143/143 [02:50<00:00,  1.19s/it]


In [56]:
# One row per scraped posting (row label = the key in `main`); rows missing
# any of the title, job title or level fields are dropped.
df = (
    pd.DataFrame.from_dict(main, orient='index')
    .dropna(subset=['title', 'job_title', 'level'])
)
df


Unnamed: 0,title,job_title,level,job_type,industry1,industry2,descriptions,experience
1,YipitData,Data Analyst - Singapore,Entry level,Full-time,Information Technology,Market Research,[Data: Work with massive amounts of data on th...,
2,WhiteCoat Global,Data Analyst,Mid-Senior level,Full-time,Health Care Provider,Hospitals and Health Care,[1-3 years of relevant work experience in Anal...,1-3 years of relevant work experience in Analy...
3,Arta Finance,Data Analyst,Entry level,Full-time,Information Technology,Financial Services,"[Data Analysis:, Perform exploratory data anal...",Minimum 3 years of relevant work experience.
4,Allianz Global Investors,Data Analyst,Entry level,Full-time,Information Technology,Financial Services and Insurance,[Develop a in depth understanding and knowledg...,An Agile mind-set and experience in investment...
6,Glints,Business Analyst/ Data Analyst Intern,Not Applicable,Internship,"Research, Analyst, and Information Technology","Transportation, Logistics, Supply Chain and St...",[The mission of this role is to solve strategi...,
...,...,...,...,...,...,...,...,...
138,YipitData,Data Analyst - Singapore,Entry level,Full-time,Information Technology,Market Research,[Data: Work with massive amounts of data on th...,
139,SISTIC Singapore,Data Analyst,Associate,Full-time,Information Technology,Entertainment Providers,[Delve into our diverse event portfolio and de...,At least 1-3 years of experience working in a ...
140,TikTok,E-commerce data analyst,Associate,Full-time,Analyst and Strategy/Planning,"IT Services and IT Consulting, Entertainment P...",[],
141,AbbVie,Business Data Analyst,Mid-Senior level,Full-time,"Project Management, Other, and Information Tec...",Pharmaceutical Manufacturing and Biotechnology...,[Use appropriate analytics platforms and analy...,


In [66]:
# Export the filtered postings to a timestamped workbook. The stamp includes
# the hour (%H); the earlier format jumped from the day straight to minutes
# and seconds, so names were ambiguous and could collide across hours.
df.to_excel(f"data-analyst_{datetime.now().strftime('%Y-%m-%d-%H%M%S')}.xlsx", engine='xlsxwriter')