##  Decoding Data Science Job Postings Notebook 1

This is a really tiny dataset so we can just add items to the bottom of the dataset

Import libraries

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
import random
from pathlib import Path
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup as bs

### Helper Functions

In [2]:
def render(html_string):
    display(HTML(html_string))
    
cur_path = Path()

random.seed(42)

### Find the working files and stash them in a list

In [15]:
inputs = os.listdir('data/html_job_postings')
inputs

['001b92395ed0fb62_fccid.html',
 '00321a48d04fe754_fccid.html',
 '0079c11b2611349f_fccid.html',
 '007d9d7b5c09d820_fccid.html',
 '0125eabc844281c9_fccid.html',
 '014ae4dbded805d2_fccid.html',
 '0179ea131f141400_fccid.html',
 '018866568cd5a0b0_fccid.html',
 '01aa3fd02a66fc1d_fccid.html',
 '01bd31cf3814dee3_fccid.html',
 '0203a50423c1dff3_fccid.html',
 '025f464c00c43f58_fccid.html',
 '032a60df9899fe1c_fccid.html',
 '03b7844b77344288_fccid.html',
 '03db5efbdebb59a8_fccid.html',
 '0410b4cd5db34fec_fccid.html',
 '04186a90ce9af7e3_fccid.html',
 '04817d18faa6531c_fccid.html',
 '0488045ed4473017_fccid.html',
 '0537cc057dc3fbaf_fccid.html',
 '054d04e709095c2b_fccid.html',
 '05622250ca2f97ca_fccid.html',
 '0598b7efa8b53aef_fccid.html',
 '05bbd20f59b2b295_fccid.html',
 '05e113c6cfd125f5_fccid.html',
 '064f745bb5631f9b_fccid.html',
 '06a385f35dbe05a8_fccid.html',
 '06d49f8db79069ea_fccid.html',
 '06f4f89b2121d668_fccid.html',
 '06fb015837858542_fccid.html',
 '073776d12c6a7644_fccid.html',
 '079c06

### Construct the Data Frame

In [37]:
cols = ['Title','Body','Bullets']
jobs = pd.DataFrame(columns=cols)
jobs.head()

Unnamed: 0,Title,Body,Bullets


### Procesing loop

- Find the file
- Discard if not HTML
- Read the file, coercing all characters to latin-1
- Add the Title, Body and Bullet Points to the Data Frame


In [38]:
printcounter = 0
for fn in range(len (inputs)): 
    printcounter = printcounter + 1
    work = cur_path.joinpath('data','html_job_postings',inputs[fn])
    if printcounter == 21:
        print(fn,work)
        printcounter = 0
        
    #Only try to process html files
    if not (inputs[fn].endswith("html")) : continue

    with open(work, encoding='latin-1')as f:   #there are some unreadable char so coerce them
        wfile = f.read()
        soup = bs(wfile,'html.parser')
        title = soup.title.text
        #print(title)
        body = soup.find('body')
       
        blt = [bullet.text for bullet in soup.find_all('li')]
        jobs.loc[fn] = [title, soup.body.text,blt]
        f.close()
        
jobs.head()    

20 data\html_job_postings\054d04e709095c2b_fccid.html
41 data\html_job_postings\09b405a07dc2ebcf_fccid.html
62 data\html_job_postings\0d20d59bb0b52bc8_fccid.html
83 data\html_job_postings\117fdac717d768c9_fccid.html
104 data\html_job_postings\15ea1bf724360627_fccid.html
125 data\html_job_postings\18ceaad4b9aa1a8d_fccid.html
146 data\html_job_postings\1c8576530dcf26f0_fccid.html
167 data\html_job_postings\1fcf4a65bd9da593_fccid.html
188 data\html_job_postings\22ec0d056a58a782_fccid.html
209 data\html_job_postings\2513f7ce8174946b_fccid.html
230 data\html_job_postings\28987b664990906f_fccid.html
251 data\html_job_postings\2b937acb6b86aa7f_fccid.html
272 data\html_job_postings\30af8c722a93ceaf_fccid.html
293 data\html_job_postings\34cb6265a33472c7_fccid.html
314 data\html_job_postings\3820cb3051c0ef58_fccid.html
335 data\html_job_postings\3b2bc1ad8d0a664e_fccid.html
356 data\html_job_postings\4068926239bcd32d_fccid.html
377 data\html_job_postings\45345be501062dd9_fccid.html
398 data\html_

Unnamed: 0,Title,Body,Bullets
0,"Data Engineer - Columbus, GA 31909","Data Engineer - Columbus, GA 31909\nCelebratin...",[Bachelorâs or Masterâs degree in statisti...
1,"Data Analyst - St. Louis, MO","Data Analyst - St. Louis, MO\nDuties\nSummary\...",[Job family (Series)\n1501 General Mathematics...
2,"Data Scientist - Newark, CA","Data Scientist - Newark, CA\nData Scientist\n\...","[ Design, develop, document and maintain machi..."
3,Patient Care Assistant / PCA - Med/Surg (Fayet...,Patient Care Assistant / PCA - Med/Surg (Fayet...,[Provides all personal care services in accord...
4,"Scientific Programmer - Berkeley, CA","Scientific Programmer - Berkeley, CA\nCaribou ...","[Demonstrated proficiency with Python, JavaScr..."


### How many got into the Data Frame

In [39]:
len(jobs)

1336

### How many are duplicates 

Define duplicate as duplicate Body

In [41]:
len(jobs['Body']) - len (jobs['Body'].drop_duplicates())

9

### Discard the duplicates

In [42]:
jobs = jobs['Body'].drop_duplicates()
print(len(jobs))

1327
