In [1]:
from bs4 import BeautifulSoup as bs
import os
import pathlib
import pandas as pd

## Define Helper Functions for Extracting Title and Skills ##

In [2]:
def html_files_to_list(path):
    """
        returns a list of tuples containing the file_name and the contents of the file as a string
    """
    with os.scandir(path) as my_dir:
        return [(entry.name, bs(pathlib.Path(entry.path).read_text())) for entry in my_dir if entry.is_file and entry.name.endswith('html')]


In [3]:
def get_title(bs_object):
    """
        returns the text of the title for the passed beautiful soup object
    """
    my_title = bs_object.find('title')
    if my_title:
        return my_title.text.lower()
    return None

In [4]:
def get_skills(bs_object):
    """
        returns comma separated list of skills
    """
    my_list_items = [li.text.strip() for li in bs_object.find_all('li')]
#     return ','.join(my_list_items)
    return my_list_items

## Process all of the files in the data directory and return a tuple for each containing the original file path and the Beautiful Soup Object created from the contents ##

This will simply creating a DataFrame later.

In [5]:
html_data = html_files_to_list(os.path.join('data', 'html_job_postings'))

## Create a Dataframe from the Html files - initially with just 2 Columns - FilePath and Contents ##

In [6]:
my_df = pd.DataFrame(html_data, columns=['FilePath', 'Contents'])

## Remove Duplicates from the DataFrame. ##

In [7]:
# Remove duplicate entries
print(f'Before drop duplicates, Shape: {my_df.shape}')
print('*' * 80)
my_df.drop_duplicates(subset='Contents', inplace=True)
print(f'After drop duplicates, Shape: {my_df.shape}')

Before drop duplicates, Shape: (1337, 2)
********************************************************************************
After drop duplicates, Shape: (1329, 2)


## Now use our helpfer functions to create columns Title and Skills ##

In [8]:
my_df['Title'] = my_df['Contents'].apply(lambda x: get_title(x))
my_df['Skills'] = my_df['Contents'].apply(lambda x: get_skills(x))

## Optional Step - Filter out Resumes that don't appear to be for Data Scientist ##

In [9]:
# Filter out things that are probably NOT what we're interest in
my_df = my_df[(my_df['Title'].str.contains('data')) & (my_df['Title'].str.contains('scien'))]

In [10]:
my_df['Title_1'] = my_df['Title'].apply(lambda x: (x.split('-'))[0] )
my_df['Title_2'] = my_df['Title'].apply(lambda x: (x.split('-'))[1] if len(x.split('-')) > 1 else 'N/A') 
my_df['Title_3'] = my_df['Title'].apply(lambda x: (x.split('-'))[2] if len(x.split('-')) > 2 else 'N/A') 


In [11]:
my_df[['Title_1', 'Title_2', 'Title_3', 'Skills']].head()

Unnamed: 0,Title_1,Title_2,Title_3,Skills
1,data scientist,"mountain view, ca",,"[Help senior members of the team to explore, d..."
2,data scientist,"seattle, wa",,[A Bachelor or Masters Degree in a highly quan...
5,junior data scientist,"college park, md 20740",,[Degree: Bachelor’s degree in business analyti...
6,data scientist,"new york, ny",,"[Languages: Python, PySpark, SQL, Data Tools: ..."
8,(entry,level) data scientist,"chicago, il",[Be the go-to person for Data ingest and stora...


## Now save the Dataframe for later ##

In [12]:
SAVE_FILE_NAME = "resume_project_after_step_1.pickle"
my_df.to_pickle(SAVE_FILE_NAME)

## Verify that we will be able to read back our saved Data Frame later . . . ##

In [13]:
my_read_df = pd.read_pickle(SAVE_FILE_NAME)
my_read_df[['Title', 'Skills']].head()

Unnamed: 0,Title,Skills
1,"data scientist - mountain view, ca","[Help senior members of the team to explore, d..."
2,"data scientist - seattle, wa",[A Bachelor or Masters Degree in a highly quan...
5,"junior data scientist - college park, md 20740",[Degree: Bachelor’s degree in business analyti...
6,"data scientist - new york, ny","[Languages: Python, PySpark, SQL, Data Tools: ..."
8,"(entry-level) data scientist - chicago, il",[Be the go-to person for Data ingest and stora...
