In [1]:
import os
import yaml
from tqdm import tqdm
from datetime import datetime
from modules.utils import *

### Functions, wrappers and request headers

In [13]:
from selenium import webdriver
import time

def get_job_links_selenium(keyword: str, pages: int) -> dict:
    '''Collect job links from a LinkedIn job search by scrolling the results page.

    The search URL is hard-coded to location=Singapore (geoId=102454443, distance=25).

    Inputs:
        keyword: str - job title and other keywords
        pages: int - number of times to scroll to the bottom of the results page
            (a 4 second pause follows every scroll)
    Returns
        dict mapping each job link (query string removed) to the number of
        times that link appeared in the page source. Links containing
        'login' are skipped. If an error occurs it is logged and whatever
        was collected up to that point (possibly an empty dict) is returned.
    '''
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import quote

    def custom_selector(tag):
        '''
        Helper function used to identify an <a href> tag that belongs to a job link
        Inputs:
            soup tag
        Returns a truthy value when the tag is an anchor whose href contains
        the hyphenated keyword
        '''
        return tag.name == "a" and tag.has_attr("href") and keyword in tag.get('href')

    # Percent-encode the whole keyword (not only spaces) so characters such as
    # '+' or '&' cannot corrupt the query string. Spaces still become %20.
    title = quote(keyword.lower(), safe='')
    # Hyphenated form; custom_selector reads this rebound value through its closure.
    keyword = re.sub(' ', '-', keyword.lower())

    logger.info(f'Searching for {keyword}')
    data = {}
    driver = None   # stays None if the browser could not be started
    scroll = 0      # defined up front so the error log below can always report it
    try:
        driver = webdriver.Chrome()
        driver.implicitly_wait(30)
        driver.get(f"https://www.linkedin.com/jobs/search/?distance=25&geoId=102454443&keywords={title}&location=Singapore&start=0")
        for scroll in range(pages):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)

        # Get the full html after scrolling is complete, then parse via bs4
        html_source = driver.page_source
        soup = BeautifulSoup(html_source, 'html.parser')
        tags = soup.find_all(custom_selector)

        # Count how many times each (tidied) job link appears
        for tag in tags:
            link = tag.get('href')
            link = link.split('?')[0]  # Tidy up the link to remove the trackingid
            if 'login' not in link:
                data[link] = data.get(link, 0) + 1
        logger.info(f'Unique links: {len(data)}')
    except Exception as e:
        logger.error(f'Error at scroll {scroll}, {e}')
    finally:
        # Always shut the browser down, otherwise every call leaks a Chrome process.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                logger.error(f'Error closing browser, {e}')
    # Returned outside `finally` so KeyboardInterrupt/SystemExit are not swallowed.
    return data

## Input job search keyword

In [2]:
# Search parameters for this run.
pages = 20                 # number of result pages requested from the search
keyword = 'data analyst'   # job title / search phrase

# Alternative: read the parameters from config.yaml instead of hard-coding them.
# with open('config.yaml', 'r') as f:
#     config = yaml.safe_load(f)
#     keyword_list = config['job_titles']
#     pages = config['pages']

In [3]:
# Fetch the job links for the search, then pull the details of every posting.
# get_job_info is called with return_soup=True and its two results are stored
# in `main` and `soups`, both keyed by the link's position in `links`.
links = get_job_links(keyword, start_page=0, pages=pages)
main, soups = {}, {}
for index, link in enumerate(tqdm(links, total=len(links), dynamic_ncols=True)):
    main[index], soups[index] = get_job_info(link, index, return_soup=True)

100%|██████████| 52/52 [01:11<00:00,  1.37s/it]


In [74]:
# soups[88]
# links

In [4]:
# Turn the scraped records into a table, asking process_df to drop
# null rows and duplicates, then preview the first rows.
df = process_df(
    main,
    remove_nulls=True,
    remove_duplicates=True,
)
df.head()

Unnamed: 0_level_0,date_logged,company,job_title,level,job_type,experience,spark,degree,descriptions,industry1,industry2,link
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3790092207,2023-12-25,Adecco,Data Analyst (Tech MNC/ SQL / UP7K+),Entry level,Full-time,•2+ years of experience working in data-relate...,,,Data Analyst\nA famous Tech MNC\nThe O...,Information Technology,Staffing and Recruiting,https://sg.linkedin.com/jobs/view/data-analyst...
3757235292,2023-12-25,Allianz Global Investors,Data Analyst,Entry level,Full-time,•We encourage candidates with at least 3 years...,,"•University degree in computer science, IT, fi...",\nJoin us. Let’s care for tomorrow.\nAt Allian...,Information Technology,Financial Services and Insurance,https://sg.linkedin.com/jobs/view/data-analyst...
3729120053,2023-12-25,Amaris Consulting,Data Analyst,Entry level,Full-time,"•At Amaris, we strive to provide our candidate...",,•Education: A bachelor's degree in a relevant ...,Who are we?\nAmaris Consulting\n is an...,Information Technology,IT Services and IT Consulting,https://sg.linkedin.com/jobs/view/data-analyst...
3791812116,2023-12-25,Applicantz,Data Analyst,Mid-Senior level,Contract,"•Our large, Fortune Technology client is ranke...",,,\nTHIS IS A 4+ months CONTRACT POSITION WITH O...,Information Technology,"Technology, Information and Internet and Compu...",https://sg.linkedin.com/jobs/view/data-analyst...
3775716634,2023-12-25,Arta Finance,Data Analyst,Entry level,Full-time,•Bachelor's degree. MBA is a plus.Minimum 3 ye...,,•Bachelor's degree. MBA is a plus.Minimum 3 ye...,Arta Finance is a dynamic fintech star...,Information Technology,Financial Services,https://sg.linkedin.com/jobs/view/data-analyst...


### Save this run's results and compile all the intermediate files

In [5]:
# Save this run's table as an intermediate Excel file, then list every file in
# ignore/ whose name contains today's date.
run_time = datetime.now()   # captured once so the filename and the date filter agree
run_date = run_time.strftime('%Y-%m-%d')

# '-' rather than ':' between date and time: ':' is not a legal filename
# character on Windows.
most_recent_file = f"{keyword}_{run_time.strftime('%Y-%m-%d-%H%M')}.xlsx"

os.makedirs('ignore', exist_ok=True)   # the output folder may not exist on a fresh checkout
df.to_excel(f"ignore/{most_recent_file}", engine='xlsxwriter')

# NOTE(review): these are bare file names (os.listdir), not full paths, and any
# other file in ignore/ carrying today's date matches too - confirm this is
# what update_main expects before passing the list to it.
most_recent_filepaths = [file for file in os.listdir('ignore') if run_date in file]
most_recent_filepaths

['data analyst_2023-12-25-2047.xlsx']

In [7]:
# Merge this run's table into the existing MAIN workbook.
# Alternative that passes the saved intermediate files instead of the in-memory table:
# main_df = update_main('files/MAIN_2023-12-25.xlsx', most_recent_filepaths)
main_df = update_main(
    'files/MAIN_2023-12-25.xlsx',
    [df],
)

In [56]:
# Write the compiled table to ignore/ under a name stamped with today's date.
main_df.to_excel(
    f"ignore/MAIN_{datetime.now().strftime('%Y-%m-%d')}.xlsx",
    engine='xlsxwriter',
)