In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.collocations import *
nltk.download('genesis')
# nltk.download('stopwords')
# nltk.download('punkt')

In [58]:
# Parameters for jobs
job_url = 'https://jobs.lever.co/pachama/34b58e75-af8f-4217-a5c0-21df58bed6cc'
platform_stop_words = ['apply', 'job', 'home', 'pagejobs', 'powered', 'andor', 'along']
job_stop_words = ['pachama', 'san', 'francisco', 'youll', 'pachamas']

In [112]:
# Functions
def lever_co_parser(url):
    """parser for jobs.lever.co"""
    extracted_text = ''
    r = requests.get(url)
    text = r.text
    soup = BeautifulSoup(text, 'html')
    # TODO lever.co delivers multiple copies of same section, need better dedupe
    for s in soup.select('div'):
        extracted_text += s.get_text(separator=' ')
    return extracted_text


def clean_text(text, filter_stops=True):
    """removes stop words, punctuation, and duplicate sections"""
    # tokenize by sentence to dedupe sections
    sents = list(set(sent_tokenize(text)))
    sents_text = ' '.join(sents) # concatenate back to a string
    
    # remove punctuation
    punctuation= '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    sents_text_no_punc = " "
    for s in sents_text:
        if s not in punctuation:
            sents_text_no_punc = sents_text_no_punc + s
            
    # remove stop words if desired, not useful for ngrams
    if filter_stops:
        word_tokens = word_tokenize(sents_text_no_punc)
        stop_words = list(set(stopwords.words('english')))
        stop_words = stop_words + platform_stop_words + job_stop_words
        filtered_tokens = [w.lower() for w in word_tokens if not w.lower() in stop_words]
    else:
        filtered_tokens = word_tokenize(sents_text_no_punc)
        filtered_tokens = [w.lower() for w in filtered_tokens]
    return filtered_tokens


def analyze_tokens(word_token_list, n):
    """takes a bag of tokens and returns a data frame of the 20 most common """
    fdist = FreqDist(word_token_list)
    print('Most common terms:')
    print(fdist.most_common(n))

    
def analyze_ngrams(word_token_list, bi_tri='bi', top=10, n=5):
    """takes tokenized list without stopwords removed, outputs top x n-grams that appear
    at least n times"""
    if bi_tri == 'tri':
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(word_token_list)
        finder.apply_freq_filter(n)
        print('\nMost common trigrams:\n',finder.nbest(trigram_measures.pmi, top))
    else:
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(word_token_list)
        finder.apply_freq_filter(n)
        print('\nMost common bigrams:\n',finder.nbest(bigram_measures.pmi, top))

        
def main(job_url):
    job_text = lever_co_parser(job_url)
    clean_job_word_tokens = clean_text(job_text)
    cleanish_job_word_tokens = clean_text(job_text, filter_stops=False)
    analyze_tokens(clean_job_word_tokens, 20)
    analyze_ngrams(cleanish_job_word_tokens)
    analyze_ngrams(cleanish_job_word_tokens, bi_tri='tri', top=10, n=5)

In [113]:
main(job_url)

Most common terms:
[('product', 30), ('team', 23), ('roadmap', 19), ('data', 17), ('verify', 14), ('strong', 10), ('5+', 10), ('engineers', 10), ('complex', 10), ('products', 10), ('mission', 9), ('remote', 9), ('new', 9), ('track', 8), ('focus', 8), ('communication', 8), ('use', 8), ('across', 7), ('climate', 7), ('manager', 7)]

Most common bigrams:
 [('ambiguous', 'environment'), ('analytical', 'thought'), ('andor', 'researchfocused'), ('application', 'towards'), ('deeply', 'analytical'), ('ego', 'crisp'), ('entrepreneurial', 'spirit'), ('first', 'principles'), ('help', 'address'), ('image', 'processing')]

Most common trigrams:
 [('analytical', 'thought', 'process'), ('andor', 'researchfocused', 'tools'), ('application', 'towards', 'image'), ('deeply', 'analytical', 'thought'), ('first', 'principles', 'nearmastery'), ('low', 'ego', 'crisp'), ('notion', 'or', 'similar'), ('thought', 'process', 'driven'), ('towards', 'image', 'processing'), ('translates', 'into', 'being')]
