In [1]:
import requests
import re
import pandas as pd
import os
from time import perf_counter
from functools import wraps
from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime
import logging

In [2]:
# Module-level logger; everything at INFO and above is appended to logs/app.log
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.FileHandler does not create missing parent directories, so make sure it exists
os.makedirs('logs', exist_ok=True)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# getLogger returns the same object every time, so re-running this cell would
# otherwise stack another handler and each message would be written repeatedly
if not any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
    file_handler = logging.FileHandler('logs/app.log', mode='a')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

### Functions, wrappers and request headers

In [42]:
# Request headers sent with every page fetch so the request looks like a browser visit
headers = dict([
    ('Accept', 'text/html'),
    ('Accept-Language', 'en-US'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15 Version/13.0.4'),
    ('Referer', 'http://www.google.com/'),
])

# Override the placeholder user agent above with my actual user agent
headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'})


def timeit(func):
    '''
    Decorator which logs how long each call to the wrapped function took
    Inputs:
        func - function to wrap
    Returns
        wrapper which calls func, logs its name, duration and arguments at INFO level,
        and passes the return value of func through unchanged
    '''
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        began = perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = perf_counter() - began
        # Nothing is logged when func raises because the exception propagates before this line
        logger.info(f'Function {func.__name__} Took {elapsed:.4f} seconds - {args} {kwargs}')
        return outcome
    return timeit_wrapper

@timeit
def get_job_links(keyword: str, start_page: int, pages: int)-> tuple:
    '''Function to retrieve all job links over specified number of pages and search
    Inputs:
        keyword: str - job title and other keywords
        start_page: int - value used for the `start` query parameter of the first request
        pages: int - number of pages to retrieve
    Returns
        list of job links (query string removed)
        list of actual urls used (after any redirect)
    Any error raised while paging is logged and whatever was collected so far is returned.
    Paging also stops early when a page yields no job links or no job id to continue from.
    '''
    title = keyword.lower().replace(' ', '%20') # Form used in the search url
    slug = keyword.lower().replace(' ', '-') # Form in which the keyword appears inside job hrefs

    def custom_selector(tag):
        '''
        Helper function used to identify an a tag whose href contains the keyword slug
        Inputs:
            soup tag
        Returns bool
        '''
        return tag.name == "a" and tag.has_attr("href") and slug in tag.get('href')

    logger.info(f'Searching for {slug}')
    job_links = []
    redirected_urls = []
    position = start_page
    currentJobId = None
    page = None # Defined up front so the error handler can always report it
    try:
        for page in tqdm(range(pages)):
            if not currentJobId:
                url = f"https://www.linkedin.com/jobs/search/?geoId=102454443&keywords={title}&location=Singapore&start={position}"
            else:
                # NOTE(review): once a job id is known every request is sent with start=0,
                # so the increment at the end of the loop only affects pages fetched
                # without a job id - kept as in the original, confirm this is intended
                position = 0
                url = f"https://www.linkedin.com/jobs/search/?currentJobId={currentJobId}&geoId=102454443&keywords={title}&location=Singapore&start={position}"
            # Without a timeout requests waits indefinitely on a stalled connection
            response = requests.get(url, headers=headers, timeout=30)
            redirected_urls.append(response.url) # Get the actual url in case of checking
            soup = BeautifulSoup(response.text, 'html.parser')

            # Tidy up each link to remove the trackingid
            page_links = [tag.get('href').split('?')[0] for tag in soup.find_all(custom_selector)]
            if not page_links:
                # There is no job to take the next id from, and reusing the previous
                # one would simply repeat the same request
                logger.warning(f'No job links found at page {page}, stopping')
                break
            job_links.extend(page_links)

            # Get the last jobid to start on the next page
            found_ids = re.findall('-([0-9]{6,})', page_links[-1])
            if not found_ids:
                logger.warning(f'No job id found in {page_links[-1]}, stopping')
                break
            currentJobId = found_ids[0]
            logger.info(f'LastJobId found: {currentJobId}')
            position += 25
        logger.info(f'Retrieved links: {len(job_links)}')
    except Exception as e:
        # Best effort: keep the links gathered before the failure
        logger.error(f'Error at page {page}, {e}')
    # Returning here instead of inside `finally` lets KeyboardInterrupt / SystemExit propagate
    return job_links, redirected_urls
    
def get_job_info(url: str, index: int, return_soup: bool=False)-> tuple:
    '''
    Function to retrieve information for an individual job page
    Inputs:
        url: str - url for job page
        index: int - index of file for tracking, only used in log messages
        return_soup: bool - whether to return the soup object for debugging
    Returns
        tuple of (dict, soup) - dict of the fields found on the page, always containing 'link';
        soup is the parsed page when return_soup is True, otherwise None
    Fields which cannot be found are logged as errors and left out of the dict.
    Errors raised by requests (including timeouts) are not caught here.
    '''
    # Without a timeout requests waits indefinitely on a stalled connection
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    info = {}

    # JobID
    # Raw string so \d is a regex escape and not an invalid string escape
    jobid = re.findall(r'-(\d{8,})', url) # removed [\?/] from pattern because of url cleaning ? away in new version
    if len(jobid) == 1:
        info['job_id'] = jobid[0]
    else:
        logger.error(f'Index {index}: Found no Job ID or multiple Job IDs for {url}')

    # Page title, the company name is the part before ' hiring'
    title = soup.find('title')
    if title:
        info['company'] = title.text.split(' hiring')[0]
    else:
        logger.error(f'Index {index}: Found no company name for {url}')

    # Job title
    job_title = soup.find('h1')
    if job_title:
        info['job_title'] = job_title.text
    else:
        logger.error(f'Index {index}: Found no job title for {url}')

    # Job level, type (full time etc), sector
    criteria = soup.find_all('span', class_="description__job-criteria-text description__job-criteria-text--criteria")
    if criteria:
        criteria = [x.text.strip(' \n') for x in criteria]
        try:
            info['level'] = criteria[0]
            info['job_type'] = criteria[1]
            info['industry1'] = criteria[2]
            info['industry2'] = criteria[3]
        except IndexError as e:
            # Fewer than four criteria on the page, keep the ones already assigned
            logger.error(f'Index {index}: {e, criteria, url}')

    # Job scope and requirements
    descriptions = soup.find(class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden")
    if descriptions:
        # Collect the pieces in lists and join once at the end
        desc_lines = []
        experience = []
        spark = []
        degree = []

        for tag in descriptions.children: # children is a generator of all the tags contained, the alternative to the full tags is .content
            if tag.name == 'br':
                continue
            description = tag.text.strip('\n')
            if tag.name in ('ul', 'ol'):
                desc_lines.append('\u2022' + description + '\n')
            else:
                desc_lines.append(description + '\n')

            # Search for special info interested in
            bulleted = '\u2022' + description + '\n'
            if 'experience' in description:
                experience.append(bulleted)
            if 'Spark' in description: # Also matches 'PySpark'
                spark.append(bulleted)
            if 'degree' in description:
                degree.append(bulleted)

        # Assigned once after the loop, so the fields exist even when the container has no children
        info['descriptions'] = ''.join(desc_lines)
        info['degree'] = ''.join(degree)
        info['experience'] = ''.join(experience)
        info['spark'] = ''.join(spark)
    else: # Log notification if nothing found
        logger.error(f'Index {index}: Found no description for {url}')

    info['link'] = url

    return info, (soup if return_soup else None)
    
def process_df(data: dict, remove_nulls: bool=True, remove_duplicates: bool=True)-> pd.DataFrame:
    '''
    Function to process the dictionaries into a Pandas DataFrame
    Inputs:
        data: dict - dictionary of {row key: dictionary with information fields}
        remove_nulls: bool - drop rows where company, job_title, level and descriptions are all missing
        remove_duplicates: bool - keep only the first row for each job_id
    Returns:
        pd.DataFrame - dataframe indexed by job_id with a fixed column order,
        sorted by company, job_title and level
    '''
    columns = ['company', 'job_title', 'level', 'job_type', 'experience', 'spark', 'degree', 'descriptions', 'industry1', 'industry2', 'link']
    null_check = ['company', 'job_title', 'level', 'descriptions']

    df = pd.DataFrame.from_dict(data, orient='index')
    # reindex creates any field that was never scraped (filled with NaN) and fixes the
    # column order, so empty input or a missing field no longer raises a KeyError
    df = df.reindex(columns=['job_id'] + columns)
    df = df.set_index('job_id')

    if remove_nulls:
        wNulls = len(df)
        has_content = df[null_check].notna().any(axis=1).to_numpy(dtype=bool)
        df = df[has_content]
        logger.info(f'Removed {wNulls - len(df)} empty rows')

    # Deduplication, the same job can be collected more than once
    if remove_duplicates:
        wDups = len(df)
        # job_id is the index at this point, so it cannot be used as a drop_duplicates subset
        df = df[~df.index.duplicated(keep='first')]
        logger.info(f'Removed {wDups - len(df)} duplicates')

    # Sorting values
    df = df.sort_values(by=['company', 'job_title', 'level'],
                        ascending=[True, True, True])
    logger.info(f'Extracted {len(df)} number of jobs')

    return df

def append_to_main(main_df_filepath: str, most_recent_filepaths: list)-> pd.DataFrame:
    '''
    Function to append a list of excels to a main excel file
    Inputs:
        main_df_filepath: str - filename of the main file inside the 'ignore' folder
        most_recent_filepaths: list - list of filenames inside the 'ignore' folder to iterate through,
            names containing 'MAIN' are skipped; when empty the user is asked to pick one file
    Returns:
        pd.DataFrame: a compiled dataframe, keeping the first row for every index value
    Raises:
        ValueError / IndexError when the interactively entered file index is not a valid position
    '''
    subfolder = 'ignore'

    # The first column is used as the index for the main file exactly as for the
    # appended files, otherwise dropping duplicates by index could never match rows
    main_df = pd.read_excel(os.path.join(subfolder, main_df_filepath), index_col=0)
    original_rows = len(main_df)
    logger.info(f'Original rows: {original_rows}')

    if most_recent_filepaths:
        to_append = [filepath for filepath in most_recent_filepaths if 'MAIN' not in filepath]
    else:
        excel_filepaths = [file for file in os.listdir(subfolder) if file.endswith('xlsx')]
        for item in enumerate(excel_filepaths):
            print(item)
        # input returns a string, it has to be converted before it can index the list
        idx = int(input("Select file index: "))
        to_append = [excel_filepaths[idx]]

    for filepath in to_append:
        sub_df = pd.read_excel(os.path.join(subfolder, filepath), index_col=0)
        logger.info(f'Read rows: {len(sub_df)}')
        main_df = pd.concat([main_df, sub_df])
        main_df = main_df[~main_df.index.duplicated(keep='first')] # This drops by index instead

    logger.info(f'Added rows: {len(main_df)-original_rows}')

    return main_df

## Input job search keyword

In [43]:
# Search term and number of result pages to request
keyword, pages = 'data analyst', 5

In [32]:
# Test and modifications:
# link = 'https://www.linkedin.com/jobs/view/data-analyst-digital-banking-at-seamoney-3779064039/?trackingId=aCJ2KUqm4mf0TIndAcY4zQ%3D%3D&refId=OHUthHp%2FYKwJ3V%2FzgKIwvQ%3D%3D&pageNum=0&position=16&trk=public_jobs_jserp-result_search-card&originalSubdomain=sg'
# info, soup = get_job_info(link, return_soup=True)
# descriptions = soup.find(class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden")
# for tag in descriptions.contents:
#     print(tag)
# print(info['descriptions'])
# info


In [44]:
# Collect the job links first, then fetch every job page in turn
links, urls = get_job_links(keyword, start_page=0, pages=pages)
main, soups = {}, {} # Scraped fields / parsed pages, both keyed by position in links
logger.info('Getting job info')
progress = tqdm(enumerate(links), total=len(links), dynamic_ncols=True)
for index, link in progress:
    job_info, job_soup = get_job_info(link, index, return_soup=True)
    main[index] = job_info
    soups[index] = job_soup # Kept for debugging individual pages
# for index, link in enumerate(links):
#     main[index] = get_job_info(link, index)

100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
100%|██████████| 102/102 [01:59<00:00,  1.17s/it]


In [49]:
# soups[88]

In [52]:
# Build the cleaned table from the scraped records and preview the first rows
df = process_df(data=main, remove_nulls=True, remove_duplicates=True)
df.head()

Unnamed: 0_level_0,company,job_title,level,job_type,experience,spark,degree,descriptions,industry1,industry2,link
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3716933612,4e Exchange,Senior Data Analyst,Mid-Senior level,Full-time,"• 4e Exchange, a leading financial deri...",,•Bachelor's or Master's degree in Data Science...,"4e Exchange, a leading financial deriv...",Analyst,"Technology, Information and Internet",https://sg.linkedin.com/jobs/view/senior-data-...
3746296451,AbbVie,Business Data Analyst,Mid-Senior level,Full-time,,,•Use appropriate analytics platforms and analy...,\nJob Overview:\nThis position is responsible ...,"Project Management, Other, and Information Tec...",Pharmaceutical Manufacturing and Biotechnology...,https://sg.linkedin.com/jobs/view/business-dat...
3757235292,Allianz Global Investors,Data Analyst,Entry level,Full-time,•We encourage candidates with at least 3 years...,,"•University degree in computer science, IT, fi...",\nJoin us. Let’s care for tomorrow.\nAt Allian...,Information Technology,Financial Services and Insurance,https://sg.linkedin.com/jobs/view/data-analyst...
3775716634,Arta Finance,Data Analyst,Entry level,Full-time,•Bachelor's degree. MBA is a plus.Minimum 3 ye...,,•Bachelor's degree. MBA is a plus.Minimum 3 ye...,Arta Finance is a dynamic fintech star...,Information Technology,Financial Services,https://sg.linkedin.com/jobs/view/data-analyst...
3780122428,Big Cloud,Senior Data Analyst,Mid-Senior level,Full-time,•Are you experienced within data analytics?\n,,•Bachelor's degree in statistics/ business ana...,\nAre you experienced within data analytics?\n...,Engineering and Information Technology,"Transportation, Logistics, Supply Chain and St...",https://sg.linkedin.com/jobs/view/senior-data-...


In [53]:
# Save this run to its own workbook, named after the keyword and the time of export
export_stamp = datetime.now().strftime('%Y-%m-%d-%H%M')
most_recent_file = f"{keyword}_{export_stamp}.xlsx"
df.to_excel("ignore/" + most_recent_file, engine='xlsxwriter')

In [54]:
# Files in the 'ignore' folder whose name contains today's date.
# The date is evaluated once so every file is compared against the same value.
today = datetime.now().strftime('%Y-%m-%d')
most_recent_filepaths = [file for file in os.listdir('ignore') if today in file]
most_recent_filepaths

['data analyst_2023-12-23-1602.xlsx']

In [55]:
# Merge today's exports into the previous main workbook
main_df = append_to_main(main_df_filepath='MAIN_2023-12-22.xlsx', most_recent_filepaths=most_recent_filepaths)

In [56]:
# Write the compiled table to a new main workbook stamped with today's date
main_export_name = f"MAIN_{datetime.now().strftime('%Y-%m-%d')}.xlsx"
main_df.to_excel("ignore/" + main_export_name, engine='xlsxwriter')