### Set up packages and data

In [3]:
import os
import pandas as pd
import spacy
import numpy as np
import statistics as stat

from tqdm import tqdm
from scipy.spatial.distance import cosine

# setup
os.chdir('C:\\Users\\User\\OneDrive\\School\\2022\\compSoc\\Assignments\\data\\ind\\j_info')
cls = lambda: os.system('cls')
nlp = spacy.load('en_core_web_md')

# load data and take out corpus of text we are examining
df = pd.read_csv('indeed_20220522.csv', low_memory=False)
job_titles = df.job_title.str.lower().tolist()
job_des = df.job_description.str.lower().tolist()

### Q13.1

In [4]:
# function returns two outputs
# output 0 is a vocab containing a list of unique words (of specific pos_)
# output 1 is a list of sub-lists, each sublist contains all the word of specific pos_ designated
# output 2 is a list of phrases/doc that has empty sub-lists from output 1.
# output 2 is important because:
# (a) we want to examine what slips through the cracks when no context is presented (e.g. 'cook' as both verb and noun);
# (b) because this distorts our cosine calculation later since cosine of any array with a zero array will be 0

def get_dict(phrase_list, pos):
    ll_pos = []
    no_pos = []
    for phrase in tqdm(phrase_list, ncols=90):
        l_pos = []
        for t in nlp(phrase):
            if t.pos_ in pos:
                l_pos.append(t.lemma_.lower())
        ll_pos.append(l_pos)
        if len(l_pos) == 0:
            no_pos.append(phrase)
    vocab_list = list(set().union(*[set(x) for x in ll_pos]))
    print(f"List of phrases contains {len(vocab_list)} unique {pos}")
    return vocab_list, ll_pos, no_pos

nv = get_dict(job_titles, ['NOUN', 'PROPN'])[0]
av = get_dict(job_titles, ['ADJ'])[0]

100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 175.49it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 175.84it/s]

List of phrases contains 295 unique ['ADJ']





In [7]:
print(nv[:20])
print(av[:20])

['assistance', 'ward', 'sanitation', 'patient', 'exec', 'analytic', 'conservatory', 'plan', 'institute', '3.5k', 'environmental', 'ilst', 'handler', 'act', 'giver', 'fitter', 'e&s', 'peo', 'ehs', 'faculty']
['industrial', 'employment', 'front', 'physical', 'hmda', 'flexible', 'sterile', 'patient', 'private', 'powered', 'aseptic', '*', 'environmental', 'non', 'interventional', 'vernal', 'standby', 'inbound', 'outside', 'microbial']


There are 1759 nouns in the vocabulary and 295 for adjectives. Already we can see potential imprecision in this sample lists, where "environmental" can be considered a noun or proper noun and "employment" as an adjective. This speaks to the lack of information / context surrounding these words, making it harder for exact identification. In terms of helpfulness, the noun list is somewhwat useful in signifying the types of work, location, or customers; while the adj list is more on the characteristics of the job or required for it, if only crusory. This is not enough for a meaningful analysis yet.

### Q13.2

In [8]:
# function returns the one hot encoding matrix for all the entries in a list of phrases/docs
# here I use the list of sub-lists instead of the actual job title so I don't have to check token.pos_ again

def pos_ohencoding(phrase_list, pos):
    pos_ohencoding_list = []
    pos_vocab, pos_entries = get_dict(phrase_list, pos)[0], get_dict(phrase_list, pos)[1]
    for entry in tqdm(pos_entries, ncols=90):
        ohencoding = np.zeros(len(pos_vocab))
        bin_words = []
        for pos_word in entry:
            index = pos_vocab.index(pos_word)
            bin_words.append(index)
        for i in bin_words:
            ohencoding[i] = 1
        pos_ohencoding_list.append(ohencoding)
    print(f"Matrix of encoded {pos} by one-hot encoding created\n"
        f"Matrix shape: {np.array(pos_ohencoding_list).shape}")
    return pos_ohencoding_list


# function calculates the cosine distance between two phrases/docs, returns a dictionary
# the key is the index in the original list and the value is the distance to the chosen phrase
# this function makes use of the empty lists for job title (no word of type pos_ identified in the title)
# for these titles, the distance is set to 1.0 (max), since if we calculate the cosine normally, they will actually be 0.0, leading to the wrong interpretation

def get_oh_similarity(phrase_list, primary_index, pos):
    ohsimilarity = {}
    pos_encoding = pos_ohencoding(phrase_list, pos)
    no_pos = get_dict(phrase_list, pos)[2]
    for k in tqdm(range(len(phrase_list)), ncols=90):
        if phrase_list[k] in no_pos:
            ohsimilarity[k] = 0.0
        else:
            p2k_distance = cosine(pos_encoding[primary_index], pos_encoding[k])
            ohsimilarity[k] = 1 - p2k_distance
    return ohsimilarity


title_ohnoun0 = get_oh_similarity(job_titles, 0, ['NOUN', 'PROPN'])

100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 175.20it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


100%|████████████████████████████████████████████████| 2541/2541 [00:13<00:00, 182.93it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


100%|██████████████████████████████████████████████| 2541/2541 [00:00<00:00, 19267.72it/s]


Matrix of encoded ['NOUN', 'PROPN'] by one-hot encoding created
Matrix shape: (2541, 1759)


100%|████████████████████████████████████████████████| 2541/2541 [00:13<00:00, 182.67it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


100%|██████████████████████████████████████████████| 2541/2541 [00:00<00:00, 23929.94it/s]


In [9]:
# this function sorts the dictionationary so the most similar job based on our encoding.
# the key for this function can be chosen to make it more meaningful than just an index number

def sort_and_show(d, key_list, number):
    sorted_dict = {key_list[i] : d[i] for i in sorted(d, key=d.get, reverse=True)}
    show = list(sorted_dict.items())[:number]
    print(f"Showing first {number} items:\n{show}")
    return show

In [10]:
title_ohnoun0_s = sort_and_show(title_ohnoun0, job_titles, 30)

Showing first 30 items:
[('certified pharmacy technician ii -retail pharmacy', 1), ('pharmacy technician', 0.7071067811865475), ('nationally certified pharmacy technician', 0.7071067811865475), ('safety technician ii', 0.5773502691896257), ('lab technician ii-iii', 0.5773502691896257), ('pharmacy pos billing technician', 0.5), ('administrative technician', 0.5), ('regulatory technician', 0.5), ('environmental technician', 0.5), ('sharps technician', 0.5), ('quality control technician ii', 0.5), ('pharmacy technician (part-time) *south philadelphia*', 0.408248290463863), ('accountant ii', 0.35355339059327373), ('i&e technician', 0.35355339059327373), ('irb analyst ii', 0.35355339059327373), ('refrigeration technician', 0.35355339059327373), ('quality technician', 0.35355339059327373), ('engineer ii, molding', 0.35355339059327373), ('survey technician', 0.35355339059327373), ('commercial maintenance technician', 0.35355339059327373), ('service technician', 0.35355339059327373), ('registe

As we can see, one_hot encoding leads to a list of jobs with similar names

### Q13.3

In [19]:
# function to get similarity table for a specific phrase based on a specific type of pos_
# function makes use of the native similarity attribute that spacy has at both the token and doc level
# the doc.similary calculation is using cosine distance with doc.vector calculated by default as the average of token.vector
# what this code is doing is converting sub-lists (of word of specific pos_) for each job title into a string, and use said string as an nlp doc
# it then calculates the similarity with other titles
# outputs a sorted dictionary, key: job title entry, value: similarity with selected title, shows the first 30 items
# the empty list from the first function is not needed here because similarity() already detectes empty vectors and spits out a warning

def get_vec_similarity(phrase_list, primary_index, pos):
    ll_pos = get_dict(phrase_list, pos)[1]
    doc_list = []
    similarity = {}
    for l in ll_pos:
        doc = ''
        for pos in l:
            doc += f'{pos} '
        doc_list.append(doc)
    for j in tqdm(range(len(doc_list)), ncols=90):
        prime = nlp(doc_list[primary_index])
        target = nlp(doc_list[j])
        similarity[j] = prime.similarity(target)
    return similarity


title_vnoun0 = get_vec_similarity(job_titles, 0, ['NOUN', 'PROPN'])
title_vnoun0_s = sort_and_show(title_vnoun0, job_titles, 30)

100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 180.20it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


  similarity[j] = prime.similarity(target)
100%|█████████████████████████████████████████████████| 2541/2541 [00:27<00:00, 92.26it/s]

Showing first 30 items:
[('certified pharmacy technician ii -retail pharmacy', 1.0), ('compliance specialist ii', 0.8689367102068929), ('coding specialist ii', 0.8675952858827413), ('safety technician ii', 0.8638825839247628), ('lab technician ii-iii', 0.8582232140953252), ('charge nurse ii', 0.8516040532068991), ('administrative services coordinator ii', 0.8449968135823515), ('clin rsch coord ii - emergency medicine', 0.8449081062392644), ('registered nurse ii', 0.8439622428051579), ('contract specialist ii', 0.8433489072122476), ('administrative assistant ii', 0.8428500989066926), ('qc a/ii chemist i', 0.8407057899440931), ('accountant ii', 0.8382220416723224), ('counsel ii, product', 0.8381752160777542), ('environmental inspector ii', 0.8362711982588467), ('financial services officer ii', 0.8350973817670132), ('microbiologist ii', 0.8335090092794141), ('clinical research associate ii (remote)', 0.8309790318622134), ('regulatory affairs specialist ii', 0.8307204794395902), ('deposit 




The results from using token.vector with the 'en_core_web_md' package here returned rather different results from the one-hot encoding, where words are more matched. Here the nature of the job - specialized seems to bre more of an emphasis.

### Bonus 1

In [23]:
# get vector similarity for the third job title (entries 1 and 2 did not have an adjective in their name) based of noun and adj
# take the average of each entry to construct final ranking
# put everything into a table to compare

title_vnoun2 = get_vec_similarity(job_titles, 2, ['NOUN', 'PROPN'])
title_vadj2 = get_vec_similarity(job_titles, 2, ['ADJ'])
title_vnounadj2 = {i : stat.mean([title_vnoun2[i], title_vadj2[i]]) for i in range(len(job_titles))}

col_1 = sort_and_show(title_vnoun2, job_titles, 50)
col_2 = sort_and_show(title_vadj2, job_titles, 50)
col_3 = sort_and_show(title_vnounadj2, job_titles, 50)


compare_table = pd.DataFrame({'by_noun': col_1,
                         'by_adj' : col_2,
                         'by_noun_adj' : col_3})

print(f"30 similar titles to '{job_titles[2]}' based on noun, adj, and noun + adj:\n{compare_table}")

100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 175.93it/s]


List of phrases contains 1759 unique ['NOUN', 'PROPN']


  similarity[j] = prime.similarity(target)
100%|█████████████████████████████████████████████████| 2541/2541 [00:27<00:00, 92.83it/s]
100%|████████████████████████████████████████████████| 2541/2541 [00:14<00:00, 175.47it/s]


List of phrases contains 295 unique ['ADJ']


100%|████████████████████████████████████████████████| 2541/2541 [00:18<00:00, 137.82it/s]

Showing first 50 items:
[('associate director learning management systems', 1.0), ('director of facilities management', 0.9225263524381729), ('director, quality systems', 0.9094947487418392), ('program manager system standard maintenance', 0.902586885936833), ('information systems compliance manager', 0.8938763662111235), ('system analyst principal', 0.8807686788776851), ('financial management internal controls senior consultant', 0.8786112026274142), ('global regulatory operations manager - submission management', 0.8785708727143294), ('compounding system manager', 0.8765307799170696), ('flight operations system operations control (soc) system support program manager', 0.8760064564852429), ('utilization management specialist', 0.8726809515570867), ('senior director quality management oversight', 0.8710902406177955), ('director, quality management', 0.8696058586730893), ('quality management medical director', 0.8696058545103785), ('manager facilities management', 0.8694130835724746), (




There are much fewer adj than nouns, as such, where the titles have commonality, most of this is an exact proximity (similarity = 1) - in this case the key word is associate. Since it is so heavily 'weighted' averaging across the similarity score does not do much.

### Bonus 2

In [24]:
# same thing for job description
job_des = df.job_description.str.lower().tolist()

des_vnoun0 = get_vec_similarity(job_des, 0, ['NOUN', 'PROPN'])
des_vadj0 = get_vec_similarity(job_des, 0, ['ADJ'])

col_1_2 = sort_and_show(des_vnoun0, job_titles, 30)
col_2_2 = sort_and_show(des_vadj0, job_titles, 30)

compare_table_2 = pd.DataFrame({'by_noun': col_1_2,
                         'by_adj' : col_2_2})


print(f"30 similar titles to '{job_titles[0]}' based on noun and adj:\n{compare_table_2}")

100%|█████████████████████████████████████████████████| 2541/2541 [06:08<00:00,  6.90it/s]


List of phrases contains 16834 unique ['NOUN', 'PROPN']


100%|█████████████████████████████████████████████████| 2541/2541 [04:35<00:00,  9.22it/s]
100%|█████████████████████████████████████████████████| 2541/2541 [05:54<00:00,  7.18it/s]


List of phrases contains 4520 unique ['ADJ']


100%|█████████████████████████████████████████████████| 2541/2541 [01:29<00:00, 28.40it/s]


Showing first 30 items:
[('certified pharmacy technician ii -retail pharmacy', 1.0), ('admission & marketing liaison', 0.9922335098920366), ('nuclear pharmacist', 0.9881222174402345), ('patient access specialist, full time, days', 0.9876801704042537), ('patient access specialist-wr emergency registration, casual, days', 0.98763474912723), ('pharmacy pos billing technician', 0.9871258607656888), ('pharmacy technician (inpatient) part-time (20 hr/wk) - rotating d/e shifts', 0.9865629514230565), ('clinical associate - ct', 0.9856420398007035), ('ct technologist second shift', 0.9856346339120847), ('manufacturing environmental health & safety coordinator (ehs) virtual hiring event', 0.984840893011909), ('warehouse supervisor- gww -us', 0.9848385910711788), ('transportation director - usbl - us', 0.9847402397540733), ("patient access concierge ii– st. peter's hospital – ft days", 0.9844006067019003), ('metrology technician', 0.984280806446807), ('registered nurse- prn', 0.9842648594227765),

THe results from noun and adj here seems more comparable to each other after inspecting some job description as there are more sample of token to examine, as well as more context provided, leading to better token type identification. Returned jobs from the noun column seem to lean more towards the same industry, while adj seems to be more concentrated on how skills need to be specialized

### Bonus 3
The code below (similarity_noun_chunks) calcualtes noun-verb pair-specific vector, then use their average to construct the vector for each entries. This gives more weight to pairing, but calculation takes longer. The other functions simply aggregate all the pairs into a string and calculate the entry vectors from there. The results they return differs but the few top results quite similarm, and seems to cover a more diverse set of industry and skills - perhaps this has to do with similar respoinsibilities reflected through noun verb pair (denoting activities)

In [25]:
def similarity_noun_chunks(phrase_list, primary_index, pos):
    ll = []
    similarity = {}
    for i, phrase in enumerate(tqdm(phrase_list, ncols=90)):
        vectors = []
        doc = nlp(phrase)
        for chunk in doc.noun_chunks:
            if chunk.root.head.pos_ == pos:
                pair_span = nlp(f'{chunk.text} {chunk.root.head.lemma_}')
                vectors.append(pair_span.vector)
        if len(vectors) != 0:
            phrase_vector = sum(vectors)
            phrase_vector /= len(vectors)
            ll.append(phrase_vector)
            similarity[i] = 1 - cosine(ll[primary_index], ll[i])
        else:
            ll.append([])
            similarity[i] = 0.0
    return similarity

des_nvchunk0 = similarity_noun_chunks(job_des, 0, 'VERB')
col_3_2 = sort_and_show(des_nvchunk0, job_titles, 30)

compare_table_2['by_nounverb_chunks'] = col_3_2

print(compare_table_2)

100%|█████████████████████████████████████████████████| 2541/2541 [24:50<00:00,  1.70it/s]

Showing first 30 items:
[('certified pharmacy technician ii -retail pharmacy', 1), ('admission & marketing liaison', 0.9954707026481628), ('rn - cvra (nights)', 0.9911576509475708), ('rn - recovery room', 0.9911041855812073), ('pharmacy operations manager', 0.988545298576355), ('accounting representative', 0.9875401258468628), ('finance assistant i', 0.9869964122772217), ('quality assurance supervisor', 0.9869123101234436), ('lab clinical manager', 0.9862802028656006), ('quality supervisor', 0.9860941767692566), ('maintenance manager', 0.9857950806617737), ('regulatory & quality director', 0.9856309294700623), ('olympus api development lead', 0.9855820536613464), ('enterprise hr coordinator', 0.9853156208992004), ('tdec-environmental manager 2 - 05192022-31775', 0.985310435295105), ('referral rep', 0.9853071570396423), ('validation senior manager - (bio/pharma)', 0.98516845703125), ('paint foreman (nights)', 0.9849919676780701), ('facilities and wastewater manager', 0.9849837422370911)




In [26]:
def similarity_noun_chunks_2(phrase_list, primary_index, pos):
    ll = []
    similarity = {}
    for phrase in tqdm(phrase_list, ncols=90):
        doc = nlp(phrase)
        shorten = ''
        for chunk in doc.noun_chunks:
            if chunk.root.head.pos_ == pos:
                shorten += f'{chunk.text} {chunk.root.head.lemma_} '
        ll.append(shorten)
    for i in tqdm(range(len(ll)), ncols=90):
        prime = nlp(ll[primary_index])
        target = nlp(ll[i])
        similarity[i] = prime.similarity(target)
    return similarity


des_nvchunk0 = similarity_noun_chunks_2(job_des, 0, 'VERB')
col_3_2 = sort_and_show(des_nvchunk0, job_titles, 30)

compare_table_2['by_nounverb_chunks'] = col_3_2

print(compare_table_2)

100%|█████████████████████████████████████████████████| 2541/2541 [05:52<00:00,  7.22it/s]
100%|█████████████████████████████████████████████████| 2541/2541 [04:34<00:00,  9.26it/s]


Showing first 30 items:
[('certified pharmacy technician ii -retail pharmacy', 1.0), ('admission & marketing liaison', 0.9956666373068972), ('rn - cvra (nights)', 0.9914325748681508), ('rn - recovery room', 0.9913728205067593), ('pharmacy operations manager', 0.987061459161649), ('quality assurance supervisor', 0.9859982990991617), ('accounting representative', 0.9856452290018924), ('finance assistant i', 0.9854234084566372), ('olympus api development lead', 0.9851782987845652), ('referral rep', 0.9850481134924166), ('lab clinical manager', 0.9848109830881654), ('clinical manager', 0.9847405491449185), ('clinical manager (32)', 0.9847405491449185), ('clinic manager', 0.9847405491449185), ('regulatory & quality director', 0.9846359742328581), ('quality supervisor', 0.984625045696747), ('access team lead, full time, days', 0.984528473950712), ('access team lead', 0.9844486513243581), ('maintenance manager', 0.9837589805104042), ('quality systems, specialist ii', 0.9836854931086186), ('pe