# Resume Job Postings

1. Extracting raw text from job postings

In [4]:
# Setup
import glob
import pandas as pd
from bs4 import BeautifulSoup as bs


In [12]:
# List of files in the html directory
files = glob.glob('./data/html_job_postings//*.html')
print('Number of HTML files: {}'.format(len(files)))

Number of HTML files: 1337


In [15]:
# Load all HTML pages as text into a list -- one entry per HTML page
html_content = []
for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        html_content.append(f.read())

In [16]:
# Inspect the first entry of the list of resumes
html_content[0]

'<html><head><title>Data Engineer - Columbus, GA 31909</title></head>\n<body><h2>Data Engineer - Columbus, GA 31909</h2>\n<p>Celebrating its twenty-fifth anniversary in 2016, Cyient is an acknowledged leader in engineering design services, design-led manufacturing, networks and operations, data transformation, and analytics. We collaborate with our clients to help them achieve more and together shape a better future. We call it Designing Tomorrow Together.\n</p><p></p><p>Our industry focus includes aerospace, defense, rail transportation, off-highway &amp; industrial, power generation, mining, oil &amp; gas, communications, utilities, geospatial, semiconductor and medical technology. We align closely with the business needs, goals, culture, and core values of our clients. This reflects in the deep, long-standing relationships we have developed and sustained with some of the leading names in these industries.\n</p><p></p><p><b>LOCATION\n</b></p>Columbus,Georgia\n<p></p><p><b>JOB DESCRIP

In [20]:
# Store webpage section with skills
html_sections = []
html_dict = {}

for key in ['title', 'body', 'bullets']:
    html_dict[key] = []

# Prototype with first page of the HTML list
first_page = html_content[0]
soup = bs(first_page, "html.parser")
# Get key parts
title = soup.find('title').text
body = soup.find('body').text
bullets = soup.find_all('li')
html_dict['title'].append(title)
html_dict['body'].append(body)
# Cleaning bullets content
html_dict['bullets'].append([b.text.strip() for b in bullets])

df = pd.DataFrame(data=html_dict)
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."


In [25]:
# Apply prototype in a general function
def get_html_content(html_content):
    """Extracts title, and list items from HTML job postings.
    Args:
        html_content (list): List of html resumes.
    Return:
        df: Returns a pandas dataframe with separate columns for title, body, and bullet items."""
    
    html_sections = []
    html_dict = {}
    for key in ['title', 'body', 'bullets']:
        html_dict[key] = []
    for html in html_content:
        soup = bs(html, "html.parser")
        title = soup.find('title').text
        body = soup.find('body').text
        bullets = soup.find_all('li')
        html_dict['title'].append(title)
        html_dict['body'].append(body)
        # remove extra leading and trailing whitespace with strip()
        html_dict['bullets'].append([b.text.strip() for b in bullets])
    
    df = pd.DataFrame(html_dict)
    return df

In [26]:
df = get_html_content(html_content)
df.head()

Unnamed: 0,title,body,bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...","[Bachelor’s or Master’s degree in statistics, ..."
1,"Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",[Job family (Series)\n1501 General Mathematics...
2,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[Design, develop, document and maintain machin..."
3,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,[Provides all personal care services in accord...
4,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","[Demonstrated proficiency with Python, JavaScr..."
