### Job Postings Retrieval
This script contains the following classes and functions:
- class: New_Postings
- class: Posting
- function: print_time

In [2]:
%%file job_postings.py

#needed modules
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup as bs
from collections import defaultdict
from datetime import datetime, timedelta
from nltk.corpus import stopwords
from time import time
import job_description_features as jdf

#class for each job posting
class Posting:
    """One job posting, populated by scraping the page behind its link.

    Attributes set by the constructor: job_link, job_title, company_name,
    job_location and description. When the page cannot be fetched or any of the
    expected fields is missing, the four scraped attributes are None and the
    instance evaluates as False, so callers can write `if post:`.
    """
    #seconds to wait for the job page before giving up on it
    request_timeout = 30

    def __init__(self, link):
        """Store the link and scrape the posting details from it."""
        self.job_link = link
        #get details and assign to attributes
        details = self.get_job_posting_info()
        #rebinding `self` inside __init__ cannot make the new object None, so a
        #failed scrape is recorded on the instance instead (see __bool__)
        self._has_details = bool(details)
        details = details or {}
        self.job_title = details.get('title')
        self.company_name = details.get('company')
        self.job_location = details.get('location')
        self.description = details.get('desc')

    def __bool__(self):
        """Return True only when the posting details were scraped successfully."""
        return getattr(self, '_has_details', False)

    #extract desired skills from description
    def tech_skills(self, desc = None):
        """Store `desc` as the posting's skills (placeholder) and return the stored value.

        Returns None when nothing has been stored yet.
        """
        #model, work in progress. Will move here once finalized for phase 1
        if desc: #do stuff
            self._tech_skills = desc
        return getattr(self, '_tech_skills', None)

    #extract citizenship stance of organization or for position from description
    def allows_immigrants(self, desc = None):
        """Return the immigration stance of the posting; None until the model exists."""
        #model, work in progress. Will move here once finalized for phase 2
        #no model yet, so report "unknown" (None) instead of reading an undefined name
        flag = None
        self._immigration_assistance = flag
        return self._immigration_assistance

    #extract relocation assistance stance of organization or for position from description
    def has_relocation_help(self, desc = None):
        """Return the relocation-assistance stance of the posting; None until the model exists."""
        #model, work in progress. Will move here once finalized for phase 2
        #no model yet, so report "unknown" (None) instead of reading an undefined name
        flag = None
        self._relocation_assistance = flag
        return self._relocation_assistance

    #method to get the info in the links in order
    def get_job_posting_info(self):
        """Fetch self.job_link and return its title, location, company and description.

        Returns a dict with the keys 'title', 'location', 'company' and 'desc',
        or None when the request fails or any of the four page elements is absent.
        """
        #get page data; one unreachable posting should not abort the whole run
        try:
            page = requests.get(self.job_link, timeout=self.request_timeout)
        except requests.RequestException:
            return None
        soup = bs(page.content, 'lxml')
        #get value fields
        fields = {
            'title': soup.find('h1', 'topcard__title'),
            'location': soup.find('span', 'topcard__flavor topcard__flavor--bullet'),
            'company': soup.find('a', 'topcard__org-name-link topcard__flavor--black-link'),
            'desc': soup.find('div', 'description__text description__text--rich'),
        }
        #every field is required
        if any(tag is None for tag in fields.values()):
            return None
        #add job attributes if verified
        details = {key: fields[key].get_text().strip() for key in ('title', 'location', 'company')}
        #the description keeps its surrounding whitespace
        details['desc'] = fields['desc'].get_text()
        return details
            
#class to find, filter, and process job postings
class New_Postings():
    """Find, filter and tabulate job postings for the search terms in titles.txt.

    Constructing an instance runs the searches, scrapes every result link into a
    Posting and keeps the postings that pass filter_title_and_location.
    """
    #jobs to search
    job_titles = pd.read_csv('titles.txt', header=None)[0].values.tolist()
    #title fragments that mark a posting as too senior (case-insensitive substring match)
    senior_title_tags = ['VP', 'manager', 'senior', 'sr', 'president', 'vice president', 'director']
    #seconds to wait for a search page before giving up on it
    request_timeout = 30

    def __init__(self):
        self.today = datetime.now().strftime('%B %d, %Y')
        self.links = self.get_all_location_results()
        self.postings = []
        for link in self.links:
            post = Posting(link=link)
            #a Posting object is created even when scraping failed, so check its fields
            if self._is_complete(post): self.postings.append(post)
        self.postings = list(filter(self.filter_title_and_location, self.postings))

    @staticmethod
    def _is_complete(post):
        """Return True when the posting carries all four scraped fields."""
        needed = ('job_title', 'company_name', 'job_location', 'description')
        return all(getattr(post, field, None) is not None for field in needed)

    #method to get search results for dictionary positions and all locations before filtering
    #Posted in the last 24 hours and <= 10 miles from job location
    def get_all_location_results(self, location = 'United States'):
        """Search every title in job_titles and return the unique result links.

        The links are also stored on self.links, in the order first seen.
        """
        end = len(self.job_titles)
        self.links = []
        seen = set() #links already collected, including ones from the current page
        print(end, 'search terms: \n--------------------------------\n')
        for title in self.job_titles:
            #let requests encode the query so spaces, '&' or '#' in a title stay inside its value
            query = {'keywords': title, 'location': location, 'f_TP': 1}
            try:
                page = requests.get('https://www.linkedin.com/jobs/search', params=query,
                                    timeout=self.request_timeout)
            except requests.RequestException as err:
                #skip this search term and carry on with the remaining ones
                print('search failed for', title, '-', err)
                continue
            soup = bs(page.content, 'lxml')
            refs = soup.find_all('a', class_='result-card__full-card-link')
            for ref in refs:
                href = ref.get('href')
                if href is None or href in seen: continue
                seen.add(href)
                self.links.append(href)
        print(location + ':', len(self.links), 'result links for', self.today)
        return self.links

    #method to check whether title is valid for entry, associate, internship level, non-government job
    def filter_title_and_location(self, job): #true is the thing we want to keep
        """Return True to keep `job`, False to drop it.

        A job is dropped when its title contains one of senior_title_tags, when its
        location contains ', VA', or when the title or location is missing.
        """
        #check if title has senior tags/location is Virginia = government jobs/need clearance+residence
        try:
            title = job.job_title.upper()
            place = job.job_location.upper()
        except AttributeError:
            #posting without a usable title or location
            return False
        if any(tag.upper() in title for tag in self.senior_title_tags): return False
        return ', VA' not in place

    #method to return the processed job postings as a dataframe
    def get_job_postings(self):
        """Build, store on self.job_data and return a DataFrame with one row per kept posting.

        Columns: title, company, location, desc_raw, desc_visa, sponsor.
        """
        #filter out non-qualified jobs
        print('Total remaining job postings:', len(self.postings))
        companies = jdf.get_H1B_approvers()
        jobs = []
        for p in self.postings:
            val = jdf.Description_Features(p.description)
            #clean first: the stance lookup below is called on the same object afterwards,
            #matching the order in which the two values were originally computed
            desc_visa = val.clean_description_text()
            sponsor = val.get_immigration_stance(p.company_name, companies)
            jobs.append({'title': p.job_title, 'company': p.company_name,
                         'location': p.job_location, 'desc_raw': p.description,
                         'desc_visa': desc_visa,
                         'sponsor': sponsor})
        self.job_data = pd.DataFrame(jobs)
        return self.job_data
    
#time printer
def print_time(note, t):
    """Print `note` followed by the time elapsed since `t` as 'M:SS mins'.

    `t` is a start timestamp as returned by time.time().
    """
    #round once to whole seconds, then split: rounding minutes and seconds separately
    #turned 90 seconds into '2:30' and could print a seconds field of 60
    elapsed = max(0, round(time() - t))
    mins, secs = divmod(elapsed, 60)
    print(note, '{m}:{s:02d} mins'.format(m = mins, s = secs))

Overwriting job_postings.py


### Posting Description Processing
This script contains the following class for processing the description text of each job posting, plus a helper function that retrieves the names of H1B sponsor companies:
- class: Description_Features
- function: get_H1B_approvers

In [1]:
%%file job_description_features.py

#modules needed
import nltk
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup as bs
import string
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
import wordninja

#class to preprocess the description and get target description for feature extraction
class Description_Features():
    """Preprocess a job description and extract the immigration-related text from it.

    Typical use: construct with the raw description, call clean_description_text(),
    then get_immigration_stance(company, companies_list).
    """
    #tags to filter description with for immigration
    immigration_tags = ['security clearance', 'citizens', 'citizen', 'green card',
                             'authorized', 'authorization', 'sponsorship', 'visa', 'US citizen',
                             'eligible',  'TS/SCI',  'DoD', 'secret clearance', 'resident', 'W2',
                             'US persons', 'equal employment', 'EEO', 'citizenship', 'immigration',
                             'citizenship status', 'No C2C', 'W2 only', 'visas', 'clearance'
                            ]
    #text stored as the filtered description when no sentence mentions any tag
    no_tags_text = 'check company stance'
    wn = WordNetLemmatizer()

    def __init__(self, text = None):
        """Keep `text` as the description (empty string when none is given)."""
        #always define the attribute so the cleaning methods work on an empty posting too
        self.description = text if text else ''
        #descriptions are matched in lower case
        self.immigration_tags = [tag.lower() for tag in self.immigration_tags]

    #allows you to parse through text to see if some words are attached by mistake and add a space
    def parsing_description(self):
        """Split words longer than 8 characters with wordninja; store and return the result."""
        desc = []
        for word in self.description.split():
            if len(word) <= 8: desc.append(word)
            else: desc.append(' '.join(wordninja.split(word)))
        self.description = ' '.join(desc)
        return self.description

    def filter_tags(self, desc):
        """Return True when `desc` contains at least one immigration tag (case-insensitive)."""
        lowered = desc.lower()
        return any(tag in lowered for tag in self.immigration_tags)

    def target_description(self):
        """Keep only the sentences that mention an immigration tag; store and return them joined.

        When no sentence qualifies, the result is no_tags_text.
        """
        #get sentence tokens from semi-cleaned description and filter out ones with no target tags
        sents = sent_tokenize(self.description)
        sents = list(filter(self.filter_tags, sents))
        if len(sents) == 0: sents.append(self.no_tags_text)
        self.filtered_description = ' '.join(sents)
        return self.filtered_description

    #ML pipeline to parse description, tokenize, lower case, get target lemmatized tokens (joined)
    def clean_description_text(self):
        """Clean self.description and return the lemmatized immigration-related sentences.

        Overwrites self.description with the cleaned text and stores the return value
        in self.filtered_description (no_tags_text when nothing relevant was found).
        """
        #cleaning a description
        #remove/ignore non-ASCII characters and years
        self.description = re.sub(r'[^\x00-\x7f]', '', self.description)
        self.description = re.sub(r'[0-9]{3,}', '', self.description)
        #make sure all words are displayed correctly/no words attached by mistake, remove stopwords
        desc = self.parsing_description()
        #'no', 'not' and 'only' stay: they carry the meaning of tags such as 'No C2C' or 'W2 only'
        stops = set(stopwords.words('english')) - {'no', 'not', 'only'}
        self.description = ' '.join([w.lower() for w in desc.split() if w.lower() not in stops])

        #only get sentences with immigration indicator words/phrases (requires sent_tokenization)
        filter_desc = self.target_description()
        #leave the marker untouched so later equality checks against it cannot be broken
        if filter_desc == self.no_tags_text: return self.filtered_description
        #remove unwanted/non-context punctuation marks after un-tokenizing sentences
        filters = ''.join([x for x in string.punctuation if x != '#' and x != '+'])
        desc = ''.join([char for char in filter_desc if char not in filters])
        self.filtered_description = ' '.join([self.wn.lemmatize(word) for word in word_tokenize(desc)])
        return self.filtered_description

    #lemmatize the descriptions and create tokens
    def tokenize(self):
        """Store and return the lemmatized tokens of the filtered description.

        Returns an empty string when the filtered description is no_tags_text,
        otherwise a list of tokens. Requires self.filtered_description to be set.
        """
        if self.filtered_description == self.no_tags_text: self.tokens = ''
        else: self.tokens = [self.wn.lemmatize(word) for word in word_tokenize(self.filtered_description)]
        return self.tokens

    #method to check immigration stance based on description or company (from H1B list)
    def get_immigration_stance(self, comp, companies_list):
        """Return 'Yes' when the company appears among `companies_list`, else 'Unknown'.

        The company list is consulted only when the description itself gave no
        immigration hints (filtered description equals no_tags_text). The result is
        also stored in self.immigration. Requires self.filtered_description to be set.
        """
        self.immigration = 'Unknown'
        if self.filtered_description != self.no_tags_text: return self.immigration
        name = comp.strip().lower() if comp else ''
        #an empty name is contained in every string and would match every company
        if not name: return self.immigration
        for org in companies_list:
            listed = org.strip().lower()
            #names rarely match exactly ('Acme' vs 'Acme Inc'), so accept containment either way
            if listed and (name in listed or listed in name):
                self.immigration = 'Yes'
                break
        return self.immigration

    def collocation_finder(self, n_gram_total, n_gram_filter_word):
        """Return the `n_gram_total` best trigrams that contain `n_gram_filter_word`.

        Trigrams are ranked by likelihood ratio; the result is also stored in
        self.collocation_scores.
        """
        #NOTE(review): self.sentence_tokens is not assigned anywhere in this class;
        #presumably the caller sets it to a list of strings before calling - confirm
        cf = TrigramCollocationFinder.from_words(self.sentence_tokens[0].split())
        #drop every trigram that does not contain the requested word
        n_filter = lambda *words: n_gram_filter_word not in words
        cf.apply_ngram_filter(n_filter)
        self.collocation_scores = cf.nbest(TrigramAssocMeasures.likelihood_ratio, n_gram_total)
        return self.collocation_scores
    
#method to get the company names for top H1B approved companies for the current year
def get_H1B_approvers(pages = range(1,5), year = 2020):
    """Return the company names listed on the myvisajobs H1B sponsor report.

    pages: page numbers of the report to read.
    year: report year used in the URL.
    A page that cannot be fetched or that has no results table is skipped.
    """
    h1b_companies = []
    for page_number in pages:
        URL = 'http://www.myvisajobs.com/Reports/' + str(year) + '-H1B-Visa-Sponsor.aspx?P=' + str(page_number)
        try:
            response = requests.get(URL, timeout=30)
        except requests.RequestException as err:
            print('H1B report page', page_number, 'could not be read -', err)
            continue
        soup = bs(response.content, 'lxml')
        table = soup.find('table', class_='tbl')
        #no results table on this page (layout change or error page)
        if table is None: continue
        for tr in table.find_all('tr')[1:]: #minus header row
            cells = tr.find_all('td')
            #the company name is read from the second cell; skip rows that lack it
            if len(cells) > 1: h1b_companies.append(cells[1].text)
    return h1b_companies

Overwriting job_description_features.py
