### Step 1: Load Raw HTML Pages
*Load all webpages from your file system into Python as text in a list format*

In [3]:
# Group library imports in the beginning

import glob
import os

import pandas as pd
from bs4 import BeautifulSoup as bs

In [5]:
# Find current working directory and its sub-directories

cwd = os.getcwd()
dirs = os.listdir()

print(cwd)
print(dirs)

/Users/samanthaberk/Documents/resume-job-posting-nlp-project
['.DS_Store', 'LICENSE', 'environment.yml', 'Extracting Text from Online Job Postings.ipynb', 'resume_project.pickle', 'README.md', '.gitignore', '.ipynb_checkpoints', '.git', 'data', '1. Extracting Raw Text from Job Posting HTML Web pages.ipynb']


In [6]:
# Build a path to the HTML job postings directory

path = os.path.join(cwd, dirs[dirs.index('data')])
print(path)

files = os.listdir(path)
path = os.path.join(path, files[files.index('html_job_postings')])
print(path)

/Users/samanthaberk/Documents/resume-job-posting-nlp-project/data
/Users/samanthaberk/Documents/resume-job-posting-nlp-project/data/html_job_postings


In [9]:
# get a list of the files in the HTML directoy

files = glob.glob(path + '//*.html')
len(files)

1337

In [22]:
# Load the HTML pages into a list

html_content = []
for file in files:
    with open (file, 'r') as f:
        html_content.append(f.read())

### Step 2: Extract and store data
*Parse the HTML into sections and store them a dictionary with lists as values, then convert this to a DataFrame*

In [23]:
# Create a dictionary to store the parsed data

sections = []
html_dict = {}
for key in ['title', 'body', 'bullets']:
    html_dict[key] = []
    
print(html_dict)


{'title': [], 'body': [], 'bullets': []}


In [20]:
# prototype with the first page
first_page = html_content[0]

soup = bs(first_page, 'lxml')

title = soup.find('title').text
body = soup.find('body').text
bullets = soup.find_all('li')

html_dict['title'].append(title)
html_dict['body'].append(body)
html_dict['bullets'].append([b.text.strip() for b in bullets])

df = pd.DataFrame(data=html_dict)
df.head()

Unnamed: 0,title,body,bullets
0,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]


In [25]:
def get_html_contents(html_pages):
    """
    Extracts title, body, and bullets from HTML job postings.
    Returns a dataframe with separate columns for title, body and bullets.
    """
    sections = []
    html_dict = {}
    for key in ['title', 'body', 'bullets']:
        html_dict[key] = []
    
    for html in html_content:
        soup = bs(html, 'lxml')
        title = soup.find('title').text
        body = soup.find('body').text
        bullets = soup.find_all('li') 
        html_dict['title'].append(title)
        html_dict['body'].append(body)
        html_dict['bullets'].append([b.text.strip() for b in bullets]) 
        
    df = pd.DataFrame(data=html_dict)
    
    return df

In [27]:
df = get_html_contents(html_content)

In [28]:
df.head()

Unnamed: 0,title,body,bullets
0,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","[Help senior members of the team to explore, d..."
2,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",[A Bachelor or Masters Degree in a highly quan...
3,Senior Natural Language Processing (NLP) Engin...,Senior Natural Language Processing (NLP) Engin...,[Join a small team creating a proprietary NLU ...
4,"FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...","FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...",[]


In [30]:
df.shape

(1337, 3)

### Step 3: Dedupe rows and save the dataframe for later

In [32]:
# Convert lists in the bullets column to tuples
df['bullets'] = df['bullets'].apply(tuple, 1)

In [35]:
df.drop_duplicates(inplace=True)
df.shape

(1328, 3)

In [36]:
# Save DataFrame to disk
df.to_pickle('step1_df.pk')