# Task 1–Load Job Posting HTML documents and extract content into Pandas DataFrame
From LiveProject "Decoding Data Science Postings to Improve Your Resume", Section 1.2.

## Imports
Import libraries and apply notebook settings.

In [4]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Performance metrics
import time
from tqdm import tqdm

# Soup prep
from bs4 import BeautifulSoup

# File serialization
import pickle

In [5]:
# Project file imports, exports
import os
# Create an output folder.
# [REF:How can I safely create a nested directory](https://stackoverflow.com/a/14364249)
#   > `pathlib.Path.mkdir` as used above recursively creates the directory 
#      and does not raise an exception if the directory already exists. 
#      If you don't need or want the parents to be created, skip the parents argument.
import pathlib
# Baseline
prj_home = pathlib.Path('..')
prj_home_data_dir = os.path.join(prj_home, 'data/html_job_postings')
prj_home_output_dir = os.path.join(prj_home, 'data/extract')

## Open and Read for Subset of HTML Sources

In [6]:
# Taking a nod from RealPython and using `os.scandir` which returns a generator object
html_post = []
with os.scandir(prj_home_data_dir) as i:
    for entry in i:
        if entry.is_file():
            if entry.name.endswith('da_fccid.html'):
                html_post.append(entry.name)

In [7]:
# The above used an artificial and bs limiter. How many file-entries of the 1337 did we capture?
len(html_post)

9

In [8]:
counter = 0
for i in html_post:
    counter += 1
    print(f"Now reading item #{counter}, posting file:{i}\n")
    # Load/Soup
    soup = BeautifulSoup(open(os.path.join(prj_home_data_dir, i)), "lxml")
    print(f"*Title: " + soup.title.text + "*") # can has element 'title' 
    headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5']) # can has elements 'header\x\'
    for header_item in headers:
        print(header_item.get_text())
    # uls = header_item.find_next('ul') # can has none to every elements 'ul'
    # if header_item.find_next('ul') != None:
        # for list_item in uls.find_all('li'):
            # print(list_item.text)
    paras = soup.find_all('p') # can has none to every elements 'p'
    for p_item in paras:
        print(p_item.text)
        uls = p_item.find_next('ul')
        if p_item.find_next('ul') != None:
            for li_item in uls.find_all('li'):
                print(li_item.text)    
    print(f"\n---\n")

, CISA, CISSP, CIA, CISM or related license/certification preferred

Valuation credentials such as CVA, ABV, AVA, or ASA preferred.

Accounting, finance, and economics knowledge; proficiency in US GAAP and GAAS preferred.

Highly analytical with expertise in financial analysis and modeling, econometrics, statistics, and regression analysis.

Ability to manage multiple priorities and simultaneous project in a rapidly growing practice.

Strong leadership, business development, recruitment, training, and mentoring skills

Excellent written, presentation, leadership, and interpersonal communications skills.

Ability to work additional hours and/or travel as needed.

BENEFITS:

Founded in Chicago in 1924, Grant Thornton LLP (Grant Thornton) is the U.S. member firm of Grant Thornton International Ltd, one of the world’s leading organizations of independent audit, tax and advisory firms. Grant Thornton has revenue in excess of $1.6 billion and operates 59 offices across the United States with

### Keep going–can get metrics from the selected page?

In [9]:
counter = 0
for i in html_post:
    counter += 1
    print(f'\"\"\" Now reading item #{counter}, posting file:{i} \"\"\"')
    # Load/Soup
    soup = BeautifulSoup(open(os.path.join(prj_home_data_dir, i)), "lxml")
    print(f'Page Title: ' + soup.title.get_text()) # can has el 'title'
    header_rows = []
    headers = soup.find_all(['h{}'.format(i) for i in range(1,7)]) # skip doing a regex, return el 'header' 1-6
    print(f'Found {len(headers)} header elements')
    for el_h in headers:
        header_rows.append(el_h.text.strip())
    
    print(f"  First: " + header_rows[0])
    
    if len(headers) > 1:
        print(f"  Last: " + header_rows[-1])
    
    print(f"\n---\n")    

""" Now reading item #1, posting file:3157fcef3ee474da_fccid.html """
Page Title: Data Scientist - Mountain View, CA
Found 1 header elements
  First: Data Scientist - Mountain View, CA

---

""" Now reading item #2, posting file:51aab4724bb38ada_fccid.html """
Page Title: Data Analyst I - San Diego, CA 92121
Found 11 header elements
  First: Data Analyst I - San Diego, CA 92121
  Last: Job Family:

---

""" Now reading item #3, posting file:f41a6612c58a80da_fccid.html """
Page Title: Financial Advisory Services Valuation Senior Associate - New York, NY 10017
Found 1 header elements
  First: Financial Advisory Services Valuation Senior Associate - New York, NY 10017

---

""" Now reading item #4, posting file:14fc0fb5060f54da_fccid.html """
Page Title: Data Analyst, Analytics and Research - Washington, DC 20036
Found 1 header elements
  First: Data Analyst, Analytics and Research - Washington, DC 20036

---

""" Now reading item #5, posting file:d7895f9a3165e1da_fccid.html """
Page Titl

## Soup to Pandas


In [10]:
def convert_html_pages_to_dataframe(html_pages_dir):
    """Extract title, body, and bullet points (neé needs/wants) from HTML-formatted job postings.
    Return a Pandas dataframe with a column for those items.
    """
    html_posts = []
    with os.scandir(html_pages_dir) as i:
        for entry in i:
            if entry.is_file() & entry.name.endswith('.html'):
                    html_posts.append(entry.name)    

    # Python dictionary to Pandas DataFrame is preferred
    html_dict = {}
    for key in ['title', 'body', 'bullets']:
        html_dict[key] = []

    for html in html_posts:
        soup = BeautifulSoup(open(os.path.join(html_pages_dir, html)), "lxml")
        title = soup.find('title').getText()
        body = soup.find('body').getText()
        bullets = soup.find_all('li')
        
        html_dict['title'].append(title)
        html_dict['body'].append(body)
        # bullets will be stripped of leading and trailing whitespace
        html_dict['bullets'].append([b.text.strip() for b in bullets])

    df = pd.DataFrame(html_dict)

    return df

In [11]:
df = convert_html_pages_to_dataframe(prj_home_data_dir)

In [12]:
df.head(10)
df.shape

Unnamed: 0,title,body,bullets
0,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","[Help senior members of the team to explore, d..."
2,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",[A Bachelor or Masters Degree in a highly quan...
3,Senior Natural Language Processing (NLP) Engin...,Senior Natural Language Processing (NLP) Engin...,[Join a small team creating a proprietary NLU ...
4,"FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...","FLEXO FOLDER GLUER OPER - McClellan, CA - McCl...",[]
5,"Junior Data Scientist - College Park, MD 20740","Junior Data Scientist - College Park, MD 20740...",[Degree: Bachelor’s degree in business analyti...
6,"Data Scientist - New York, NY","Data Scientist - New York, NY\nDescription\nDS...","[Languages: Python, PySpark, SQL, Data Tools: ..."
7,Business Analyst - Medical Claims Data Project...,Business Analyst - Medical Claims Data Project...,"[Highly developed analytical skills, Mastery o..."
8,"(Entry-Level) Data Scientist - Chicago, IL","(Entry-Level) Data Scientist - Chicago, IL\nDa...",[Be the go-to person for Data ingest and stora...
9,"Data Scientist, Analytics - Seattle, WA 98101","Data Scientist, Analytics - Seattle, WA 98101\...",[Apply your expertise in quantitative analysis...


(1337, 3)


## Drop Dupes
Our HTML pages dump may contain duplicate postings collected from different sites (or perhaps a job re-posting). The dataframe has the full post-body, so use that for an initial de-duplicate operation.

Per [DataFrame.duplicated()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.duplicated.html), we can reason about the duplicates where:

**subset**: the column label or a sequence of labels, default is "all"

**keep**: how to mark duplicates {'first', 'last', False}

* _first_: Mark duplicates as True except for the first occurence

* _last_: Mark duplicates as True except for the last occurence
    
* _False_: Mark all duplicates as True

In [24]:
df.duplicated(subset=['title', 'body'], keep='first').sum()

9

In [25]:
df_simple_dedupe = df.drop_duplicates(subset=['title', 'body'])

In [26]:
df_simple_dedupe.shape

(1328, 3)

## Filter for Data Science Postings

In [27]:
# Create a useful match-group for jobs we are looking to find
job_regex = r'(?:data.*(?:scien|engineer|analy))|(?:natural language|statistic|machine learn)'
df_final = df_simple_dedupe[df_simple_dedupe['body'].str.contains(job_regex, na = False, case = False)]

In [28]:
df_final

Unnamed: 0,title,body,bullets
0,"Quantitative Analyst - Boston, MA 02116","Quantitative Analyst - Boston, MA 02116\nQuant...",[]
1,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","[Help senior members of the team to explore, d..."
2,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",[A Bachelor or Masters Degree in a highly quan...
3,Senior Natural Language Processing (NLP) Engin...,Senior Natural Language Processing (NLP) Engin...,[Join a small team creating a proprietary NLU ...
5,"Junior Data Scientist - College Park, MD 20740","Junior Data Scientist - College Park, MD 20740...",[Degree: Bachelor’s degree in business analyti...
...,...,...,...
1332,"Data Scientist - Birmingham, AL 35233","Data Scientist - Birmingham, AL 35233\nSUMMARY...",[]
1333,"Senior Data Scientist - Tempe, AZ","Senior Data Scientist - Tempe, AZ\nCircle K is...",[Independently develop advanced analytics and ...
1334,New College Grad - Cybersecurity (Masters Degr...,New College Grad - Cybersecurity (Masters Degr...,"[Advanced Degree (e.g. Masters, MBA, JD, MD, o..."
1335,"Decision Science Manager, Media Mix Modeling -...","Decision Science Manager, Media Mix Modeling -...",[As a Decision Science Manager for media mix m...


In [29]:
df_final.shape

(1193, 3)

## Pickle Results

In [None]:
my_pickle = 'jobs_pkl'
# pickle.dump(df_final, open(os.path.join(prj_home_output_dir, my_pickle), "wb" ))
df_final.to_pickle(os.path.join(prj_home_output_dir, my_pickle))