In [2]:
import bs4
import pandas as pd
import re

In [3]:
def extract_info_from_soup(html_filename: str) -> pd.DataFrame:
    """Parse a saved job-board HTML page into one row per job entry.

    Every element carrying the entry CSS class becomes one row. Within an
    entry, each direct child tag is inspected:

    * a child with more than five child nodes is treated as the detail
      block that holds the job link and the date text;
    * any other child supplies the company name or the job title, chosen
      by its CSS class.

    Fields that cannot be found in an entry are left as ``None`` instead of
    silently reusing the value from the previous entry.

    Parameters
    ----------
    html_filename : str
        Path of the saved HTML file to read.

    Returns
    -------
    pd.DataFrame
        Columns ``company``, ``job_title``, ``date_applied`` and ``link``
        (in that order), one row per entry, with a fresh 0..n-1 index.
        When the page has no entries the frame is empty but still has
        these four columns.

    Raises
    ------
    OSError
        If ``html_filename`` cannot be opened.
    """
    # CSS class names used to locate each piece of an entry in the saved page.
    entry_class = 'sc-fHaIPq'
    company_class = 'sc-lbAedR'
    job_title_class = 'sc-jagAyb'
    job_link_class = 'sc-hBtbzO'
    # The date lookup uses the full two-class string, unlike the single-class
    # lookups above; kept exactly as-is so the same elements are matched.
    date_added_classes = 'sc-dwcupg htOOi'

    # A child with more than this many child nodes is taken to be the detail
    # block (link + date). The threshold reflects the saved page's structure
    # -- TODO confirm it still holds if the page layout changes.
    detail_child_threshold = 5
    # Number of leading characters dropped from the date text once newlines
    # are removed -- presumably a fixed label/padding prefix; verify against
    # the saved HTML.
    date_prefix_length = 21

    columns = ['company', 'job_title', 'date_applied', 'link']

    with open(html_filename, 'r') as f:
        soup = bs4.BeautifulSoup(f.read(), 'html.parser')

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for entry in soup.find_all(attrs={'class': entry_class}):
        # Start every entry from a clean slate so a missing field cannot
        # inherit the previous entry's value.
        record = dict.fromkeys(columns)
        for child in entry:
            if not isinstance(child, bs4.element.Tag):
                continue  # skip text nodes / comments between tags

            if len(child) > detail_child_threshold:
                link_tag = child.find('a', attrs={'class': job_link_class})
                record['link'] = (
                    link_tag.get('href') if link_tag is not None else None
                )

                date_tag = child.find(attrs={'class': date_added_classes})
                if date_tag is not None:
                    date_text = date_tag.text.replace('\n', '')
                    record['date_applied'] = date_text[date_prefix_length:]
                else:
                    record['date_applied'] = None
            else:
                # Tags without a class attribute simply match nothing.
                child_classes = child.get('class') or []
                if company_class in child_classes:
                    record['company'] = child.text
                elif job_title_class in child_classes:
                    record['job_title'] = child.text
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [4]:
# Parse each saved board column (one HTML export per application status).
wishlist = extract_info_from_soup('wishlist.html')
applied = extract_info_from_soup('applied.html')
interview = extract_info_from_soup('interview.html')
rejected = extract_info_from_soup('rejected.html')

# Tag every row with the column it came from so the status survives the merge.
wishlist['status'] = 'wishlist'
applied['status'] = 'applied'
interview['status'] = 'interview'
rejected['status'] = 'rejected'

# Stack the four frames in one call; DataFrame.append was removed in
# pandas 2.0 and repeated appends copy the growing frame every time.
board = pd.concat([wishlist, applied, interview, rejected])

# ' 2021' is appended before parsing -- presumably the scraped date text
# carries no year; confirm against the saved pages.
board['date_applied'] = pd.to_datetime(board['date_applied'] + ' 2021')

# Newest first, with a clean 0..n-1 index (the per-file indexes overlap).
board = board.sort_values('date_applied', ascending=False).reset_index(drop=True)

In [5]:
# Preview the first two rows of the combined board.
board.head(2)

Unnamed: 0,company,job_title,date_applied,link,status
0,Borrego,Data Analyst,2021-12-14 14:00:00,https://www.linkedin.com/jobs/view/2830632945,interview
1,Embold Health,Software Engineer,2021-12-14 13:36:00,https://www.linkedin.com/jobs/view/2817136029,applied


In [36]:
# Write the combined board to 'job_search.csv'; index=False keeps the
# positional row index out of the file.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above -- re-run the notebook top to bottom to confirm the CSV matches `board`.
board.to_csv('job_search.csv', index=False)