## Task 1. Created dataframe of 05-02-2022 data, cleaned for missing title/description entries. Changed the spacy tokenizer to explicitly split on hyphens, slashes, parantheses, and certain forms of punctuation. Collected all the vocabulary identified in the job titles (based off lemma) and then created two dataframes: one to contain all nouns (proper or not), one to contain all adjectives. 

In [1]:
import os
import pandas as pd
import spacy
import numpy as np
import re
from tqdm import tqdm
from spacy.tokens import Doc
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex


## Loading May-22-2022 job info data into a DataFrame

df = pd.DataFrame()
path_info = './indeed_scraped_data/job_info_data/'

for i in os.listdir(path=path_info):
    if '5222022' in i:
        df_temp = pd.read_csv(os.path.join(path_info, i))
        df_temp = df_temp.rename(columns={'lnks_link':'link',
                               'lnks_job_title':'job_title',
                                'lnks_company':'company',
                                'lnks_company_url':'company_url',
                                'lnks_company_location':'company_location',
                                'lnks_job_description':'job_description'
                               })
        df = pd.concat([df, df_temp])

df.dropna(subset=['job_title','job_description'], inplace=True)
df = df.reset_index(drop=True)

## Setting up the the Tokenizer -- keeps hyphenated as one word

nlp = spacy.load("en_core_web_sm")

def custom_tokenizer(nlp):
    infixes = (
        LIST_ELLIPSES
        + LIST_ICONS
        + [
            r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
            r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
            r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
            r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
            r'[-]'.format(a=ALPHA),
            r'[\\]'.format(a=ALPHA),
            r'[/]'.format(a=ALPHA),
            r'[/]'.format(a=ALPHA),
            r'[-]'.format(a=ALPHA),
            r'[(]'.format(a=ALPHA),
            r'[)]'.format(a=ALPHA)
        ]
    )

    infix_re = compile_infix_regex(infixes)
    prefix_re = re.compile(r'''[.\,\?\:\;\...\‘\’\`\-\“\”\"\'~]''')

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                rules=nlp.Defaults.tokenizer_exceptions)

nlp.tokenizer = custom_tokenizer(nlp)

## Indexing unique nouns and adjectives in the job title

term = {}
lemma_codex = {}
for count, i in enumerate(tqdm(df['job_title'])):
    for token in nlp(i):
        if token.lemma not in lemma_codex.keys():
            term[token.lemma_.lower()] = token.pos_
            lemma_codex[token.lemma] = token.lemma_.lower()

lemma_df = pd.DataFrame({'lemma': lemma_codex.keys()})
lemma_df['term'] = lemma_df['lemma'].map(lemma_codex)

term_df = pd.DataFrame(columns=['term', 'position'])
term_df['term'] = term.keys()
term_df['position'] = term.values()


term_df = lemma_df.merge(term_df, how="right", on='term')
noun_df = term_df[(term_df['position'] == "NOUN") | (term_df['position'] == 'PROPN')].drop_duplicates(subset=['term']).reset_index(drop=True)
adj_df = term_df[term_df['position'] == 'ADJ'].drop_duplicates(subset=['term']).reset_index(drop=True)

print('number of unique nouns in the job title:', len(noun_df))
print('number of unique adjectives in the job title:', len(adj_df))

100%|█████████████████████████████████████████████████████████████████████████████| 4803/4803 [00:13<00:00, 349.90it/s]

number of unique nouns in the job title: 2106
number of unique adjectives in the job title: 62





In [2]:
noun_df

Unnamed: 0,lemma,term,position
0,12766903047227818414,certified,PROPN
1,3992478799797938649,pharmacy,NOUN
2,12443269718131323089,technician,PROPN
3,11179050979078045360,retail,PROPN
4,7262725514246706477,information,NOUN
...,...,...,...
2101,11850817706580600213,opelika,PROPN
2102,12809804041656136606,l2,NOUN
2103,18361816971547367771,aintenance,NOUN
2104,4621813996319058237,collections,PROPN


In [3]:
adj_df

Unnamed: 0,lemma,term,position
0,10117188822904858183,medical,ADJ
1,1062676570870034137,principal,ADJ
2,16629723108568963832,first,ADJ
3,9245954753952281271,senior,ADJ
4,6717339789729913285,personal,ADJ
...,...,...,...
57,14777363887688624875,5th,ADJ
58,898134140951421129,(part,ADJ
59,1991385364740084992,ccountant,ADJ
60,13921000724956410130,iosha,ADJ


### No neat separation, but nouns typically give a sense of occupation or task (e.g. 'technician'), while adjectives reveal field or details concerning position (e.g. 'senior'). It should be noted that spacy's in-built identifier is not always so ideal for properly distinguishing grammatical position and some might prefer to train their own model.

## Task 2. Using the first entry's job title, produced a dataframe of other titles, ranked on their similarity through one-hot encoding.

In [4]:
from scipy.spatial.distance import cosine

vocabulary_list = list(term.keys())
one_hot_encodings = []
index_encodings = []
job_titles = []

for i in tqdm(range(len(df))):
    job_title = df.loc[i, 'job_title']
    job_titles.append(job_title)
    token_indices = []

    for token in nlp(job_title):
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            token_index_in_vocab = vocabulary_list.index(token.lemma_.lower())
            token_indices.append(token_index_in_vocab)

        ## exclude cases where no nouns or proper nouns registered
    #print(token_indices)
    if token_indices:
        one_hot_encoding = np.zeros(len(vocabulary_list))
        for token_index in token_indices:
            one_hot_encoding[token_index] = 1

        one_hot_encodings.append(one_hot_encoding)
        index_encodings.append(i)

similarity_values = []
for i in range(1, len(df)):
    if i in index_encodings:
        similarity_value = 1 - cosine(one_hot_encodings[0], one_hot_encodings[index_encodings.index(i)])
        similarity_values.append(similarity_value)
    else:
        similarity_values.append(np.nan)
#print(similarity_values)

similar_df = pd.DataFrame(columns=['job_title', 'similarity_value_with_one_hot'])
similar_df['job_title'] = df.loc[1:, 'job_title']
similar_df['similarity_value_with_one_hot'] = similarity_values

similar_df
print('the primary job title:', df.loc[0, 'job_title'])
similar_df.nlargest(40, 'similarity_value_with_one_hot')

100%|█████████████████████████████████████████████████████████████████████████████| 4803/4803 [00:14<00:00, 340.80it/s]


the primary job title: Certified Pharmacy Technician II -Retail Pharmacy


Unnamed: 0,job_title,similarity_value_with_one_hot
437,Pharmacy Technician,0.632456
471,Pharmacy Technician,0.632456
774,Pharmacy Technician,0.632456
800,Pharmacy Technician,0.632456
824,Pharmacy Technician,0.632456
1006,Nationally Certified Pharmacy Technician,0.632456
1858,Pharmacy Technician,0.632456
1908,Pharmacy Technician,0.632456
2189,Nationally Certified Pharmacy Technician,0.632456
2196,Pharmacy Technician,0.632456


## Task 3. Using the first entry's job title, produced a dataframe of other titles, ranked on their similarity through spacy's own word-vector encoding. Combined this dataframe with the prior to compare the metrics.
Use spacy's word vector to do Task 2. Compare the results. 

In [5]:
vector_indices = []
eligible_titles = []


for i in tqdm(range(len(df))):
    spacy_encoding = np.array([float(0) for i in range(96)])
    token_count = 0
    for token in nlp(df.loc[i, 'job_title']):
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            token_count += 1
            spacy_encoding += np.array(token.vector)
    if token_count != 0:
        spacy_encoding /= token_count
        vector_indices.append(spacy_encoding)
        eligible_titles.append(i)

simvec = []

for i in tqdm(range(1, len(df))): # token-level vector
    if i in eligible_titles:
        simvec.append(1 - cosine(vector_indices[0], vector_indices[eligible_titles.index(i)]))
    else:
        simvec.append(np.nan)

simvec_df = pd.DataFrame(columns=['job_title', 'similarity_value_with_word_vectors'])
simvec_df['job_title'] = df.loc[1:, 'job_title']
simvec_df['similarity_value_with_word_vectors'] = simvec


similarity_df = similar_df.merge(simvec_df,left_index=True, right_index=True)
similarity_df = similarity_df.drop(['job_title_y'], axis=1).rename(columns={"job_title_x": "job_title"})

print(f"the primary job title: {df.loc[0, 'job_title']}")
similarity_df.nlargest(15, 'similarity_value_with_one_hot')

100%|█████████████████████████████████████████████████████████████████████████████| 4803/4803 [00:13<00:00, 345.21it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4802/4802 [00:00<00:00, 14265.87it/s]

the primary job title: Certified Pharmacy Technician II -Retail Pharmacy





Unnamed: 0,job_title,similarity_value_with_one_hot,similarity_value_with_word_vectors
437,Pharmacy Technician,0.632456,0.642856
471,Pharmacy Technician,0.632456,0.642856
774,Pharmacy Technician,0.632456,0.642856
800,Pharmacy Technician,0.632456,0.642856
824,Pharmacy Technician,0.632456,0.642856
1006,Nationally Certified Pharmacy Technician,0.632456,0.716895
1858,Pharmacy Technician,0.632456,0.642856
1908,Pharmacy Technician,0.632456,0.642856
2189,Nationally Certified Pharmacy Technician,0.632456,0.716895
2196,Pharmacy Technician,0.632456,0.642856


In [6]:
similarity_df.nlargest(15, 'similarity_value_with_word_vectors')

Unnamed: 0,job_title,similarity_value_with_one_hot,similarity_value_with_word_vectors
1234,Compliance Senior Manager (Auditor) -- Global ...,0.0,0.901607
3395,Compliance Senior Manager (Auditor) -- Global ...,0.0,0.901607
636,Senior Legal Counsel - Asset Management,0.0,0.899431
368,Internal Audit - Senior Audit Manager - Produc...,0.0,0.886468
3276,Internal Audit - Senior Audit Manager - Produc...,0.0,0.886468
2538,Clinical Research Associate II - CRO - Remote US,0.169031,0.886442
396,Centralized Lending Specialist (Lending Office...,0.0,0.885131
2429,Centralized Lending Specialist (Lending Office...,0.0,0.885131
1835,Elementary Special Education Coordinator - Ant...,0.0,0.880555
2784,Elementary Special Education Coordinator - Ant...,0.0,0.880555


### We find some notable difference between the two metrics. While one-hot encoding is looking for the presence of the same set of reference nouns in each title (e.g. "technician"), the word-vector approach attempts to find synonymous or similar words through determining the "approximate meaning" of individual words and the words they're found in conjunction with. Almost all of the top matches through the embedding approach returned nothing through one-hot encoding. On the other hand, word2vec runs the risk of misdetermining meaning or similarity (e.g. "Pharmacy technician" scores much lower on word2vec than job titles that are considerably distinct from the reference title). As such, the top rankings for the one-hot encoding are more convincing.

## Bonus task 1. Repeated task 3 for nouns and adjectives in a combined setting. While certain similarity values differed, the results are largely the same. This is due in good part to the much higher density of identified nouns than adjectives in the dataset, meaning the adjectives contributed relatively little to the similarity scores.

In [7]:
one_hot_nv = []
vec_word_nv = []
index_encodings_nv = []
job_titles = []

for i in tqdm(range(len(df))):
    job_title = df.loc[i, 'job_title']
    job_titles.append(job_title)
    token_indices = []
    
    spacy_encoding = np.array([float(0) for i in range(96)])
    token_count = 0

    for token in nlp(job_title):
        if token.pos_ == "NOUN" or token.pos_ == "PROPN" or token.pos_ == "ADJ":
            token_index_in_vocab = vocabulary_list.index(token.lemma_.lower())
            token_indices.append(token_index_in_vocab)
            
            token_count += 1
            spacy_encoding += np.array(token.vector)
            
    if token_indices:
        spacy_encoding /= token_count
        vec_word_nv.append(spacy_encoding)
        
        one_hot_encoding = np.zeros(len(vocabulary_list))
        for token_index in token_indices:
            one_hot_encoding[token_index] = 1

        one_hot_nv.append(one_hot_encoding)
        index_encodings_nv.append(i)
        
similarity_nv_values = []
simvec_nv = []

for i in range(1, len(df)):
    if i in index_encodings:
        similarity_nv_values.append(1 - cosine(one_hot_nv[0], one_hot_nv[index_encodings_nv.index(i)]))
        simvec_nv.append(1 - cosine(vec_word_nv[0], vec_word_nv[index_encodings_nv.index(i)]))
    else:
        similarity_nv_values.append(np.nan)
        simvec_nv.append(np.nan)

similarity_nv_df = pd.DataFrame(columns=['job_title', 'similarity_value_with_one_hot', 'similarity_value_with_word_vectors'])
similarity_nv_df['job_title'] = df.loc[1:, 'job_title']
similarity_nv_df['similarity_value_with_one_hot'] = similarity_nv_values
similarity_nv_df['similarity_value_with_word_vectors'] = simvec_nv

similarity_nv_df
print(f"the primary job title: {df.loc[0, 'job_title']}")

100%|█████████████████████████████████████████████████████████████████████████████| 4803/4803 [00:14<00:00, 336.12it/s]


the primary job title: Certified Pharmacy Technician II -Retail Pharmacy


In [8]:
similarity_nv_df.nlargest(15, 'similarity_value_with_one_hot')

Unnamed: 0,job_title,similarity_value_with_one_hot,similarity_value_with_word_vectors
437,Pharmacy Technician,0.632456,0.642856
471,Pharmacy Technician,0.632456,0.642856
774,Pharmacy Technician,0.632456,0.642856
800,Pharmacy Technician,0.632456,0.642856
824,Pharmacy Technician,0.632456,0.642856
1006,Nationally Certified Pharmacy Technician,0.632456,0.716895
1858,Pharmacy Technician,0.632456,0.642856
1908,Pharmacy Technician,0.632456,0.642856
2189,Nationally Certified Pharmacy Technician,0.632456,0.716895
2196,Pharmacy Technician,0.632456,0.642856


In [9]:
similarity_nv_df.nlargest(15, 'similarity_value_with_word_vectors')

Unnamed: 0,job_title,similarity_value_with_one_hot,similarity_value_with_word_vectors
1234,Compliance Senior Manager (Auditor) -- Global ...,0.0,0.901607
3395,Compliance Senior Manager (Auditor) -- Global ...,0.0,0.901607
636,Senior Legal Counsel - Asset Management,0.0,0.901328
368,Internal Audit - Senior Audit Manager - Produc...,0.0,0.897343
3276,Internal Audit - Senior Audit Manager - Produc...,0.0,0.897343
4660,Senior Audit Strategist - Consumer Compliance,0.0,0.889765
4621,Advanced Manufacturing Engineer/Clinical Biome...,0.0,0.886816
4691,Advanced Manufacturing Engineer/Clinical Biome...,0.0,0.886816
2538,Clinical Research Associate II - CRO - Remote US,0.169031,0.886442
396,Centralized Lending Specialist (Lending Office...,0.0,0.885131


## Bonus task 2. Repeated task 3 for job descriptions. We find that both approaches tend to provide more similar rankings at the top level (although not scores), perhaps on account of the much larger amount of text we can work with. Similarly, the embedding approach here is more convincing than in task 3, on account of the much larger sample.

In [10]:
term_desc = {}
lemma_codex_desc = {}
for count, i in enumerate(tqdm(df['job_description'])):
    for token in nlp(i):
        if token.lemma not in lemma_codex_desc.keys():
            term_desc[token.lemma_.lower()] = token.pos_
            lemma_codex_desc[token.lemma] = token.lemma_.lower()

vocabulary_list_desc = list(term_desc.keys())
one_hot_desc = []
vec_word_desc = []
index_encodings_desc = []
job_descs = []

for i in tqdm(range(len(df))):
    job_desc = df.loc[i, 'job_description']
    job_descs.append(job_desc)
    token_indices = []
    
    spacy_encoding = np.array([float(0) for i in range(96)])
    token_count = 0

    for token in nlp(job_desc):
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            token_index_in_vocab = vocabulary_list_desc.index(token.lemma_.lower())
            token_indices.append(token_index_in_vocab)
            
            token_count += 1
            spacy_encoding += np.array(token.vector)
            
    if token_indices:
        spacy_encoding /= token_count
        vec_word_desc.append(spacy_encoding)
        
        one_hot_encoding = np.zeros(len(vocabulary_list_desc))
        for token_index in token_indices:
            one_hot_encoding[token_index] = 1

        one_hot_desc.append(one_hot_encoding)
        index_encodings_desc.append(i)
        
similarity_desc_values = []
simvec_desc = []

for i in range(1, len(df)):
    if i in index_encodings:
        similarity_desc_values.append(1 - cosine(one_hot_desc[0], one_hot_desc[index_encodings_desc.index(i)]))
        simvec_desc.append(1 - cosine(vec_word_desc[0], vec_word_desc[index_encodings_desc.index(i)]))
    else:
        similarity_desc_values.append(np.nan)
        simvec_desc.append(np.nan)
        #print(similarity_values)

similarity_desc_df = pd.DataFrame(columns=['job_description', 'similarity_value_with_one_hot', 'similarity_value_with_word_vectors'])
similarity_desc_df['job_description'] = df.loc[1:, 'job_description']
similarity_desc_df['similarity_value_with_one_hot'] = similarity_desc_values
similarity_desc_df['similarity_value_with_word_vectors'] = simvec_desc

similarity_desc_df

100%|██████████████████████████████████████████████████████████████████████████████| 4803/4803 [08:05<00:00,  9.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 4803/4803 [09:06<00:00,  8.79it/s]


Unnamed: 0,job_description,similarity_value_with_one_hot,similarity_value_with_word_vectors
1,Description\nSHIFT: Day Job\nSCHEDULE: Full-ti...,0.228127,0.964839
2,"Who we are\nWe’re a global, midsize CRO that p...",0.221013,0.980324
3,Plant Engineer with a growing company in South...,0.193316,0.925720
4,Company Overview:\nShriners Children’s is a fa...,0.250909,0.974772
5,Overview:\nDon’t just work. Work Happy.\nA car...,0.222629,0.970756
...,...,...,...
4798,"J\nAssociate, Anti Money Laundering Compliance...",0.211892,0.958206
4799,DCP Midstream is a Fortune 500 natural gas com...,0.237571,0.985536
4800,Royal Building Products enables you to broaden...,0.239955,0.976252
4801,Financial Reporting Analyst - Bank Regulatory ...,0.232583,0.969294


In [11]:
print(f"the primary job description: {df.loc[0, 'job_description']}")

the primary job description: JOB PURPOSE OR MISSION: Assists pharmacists in preparing and distributing drugs, maintaining the drug inventory and maintenance of records under the supervision of a registered pharmacist for the age population served, as defined in the department’s scope of service.
PERFORMANCE CRITERIA
CRITERIA A: Everyday Excellence Values - Employee demonstrates Everyday Excellence values in the day-to-day performance of their job.
PERFORMANCE STANDARDS:
Demonstrates courtesy and caring to each other, patients and their families, physicians, and the community.
Takes initiative in living our Everyday Excellence values and vital signs.
Takes initiative in identifying customer needs before the customer asks.
Participates in teamwork willingly and with enthusiasm.
Demonstrates respect for the dignity and privacy needs of customers through personal action and attention to the environment of care.
Keeps customers informed, answers customer questions and anticipates informatio

In [12]:
similarity_desc_df.nlargest(15, 'similarity_value_with_word_vectors')

Unnamed: 0,job_description,similarity_value_with_one_hot,similarity_value_with_word_vectors
2181,JOB PURPOSE OR MISSION: Educates Community/Ref...,0.69192,0.995327
929,JOB PURPOSE OR MISSION: To improve the quality...,0.643888,0.994607
994,JOB PURPOSE OR MISSION: To improve the quality...,0.652848,0.993811
3741,JOB PURPOSE OR MISSION: To improve the quality...,0.652848,0.993811
563,Overview:\nUnder the direction of the Staffing...,0.268447,0.990077
4153,Welcome To\nWelcome to Lahey Hospital & Medica...,0.279707,0.989122
1534,Welcome To\nWelcome to Lahey Hospital & Medica...,0.284132,0.988607
4164,Welcome To\nWelcome to Lahey Hospital & Medica...,0.284132,0.988607
3745,Overview:\nThe Dining Room Server provides an ...,0.221565,0.988231
1791,Position: Full time CPSP (Comprehensive Perina...,0.243187,0.98787


In [13]:
similarity_desc_df.nlargest(15, 'similarity_value_with_one_hot')

Unnamed: 0,job_description,similarity_value_with_one_hot,similarity_value_with_word_vectors
2181,JOB PURPOSE OR MISSION: Educates Community/Ref...,0.69192,0.995327
994,JOB PURPOSE OR MISSION: To improve the quality...,0.652848,0.993811
3741,JOB PURPOSE OR MISSION: To improve the quality...,0.652848,0.993811
929,JOB PURPOSE OR MISSION: To improve the quality...,0.643888,0.994607
1974,Make a Difference. Join Our Great Team!\nCompe...,0.348875,0.970698
2909,Make a Difference. Join Our Great Team!\nCompe...,0.348875,0.970698
492,Nemours Children's Health is seeking a Pediatr...,0.333121,0.983348
3921,Nemours Children's Health is seeking a Pediatr...,0.333121,0.983348
1655,FUNCTION\nThe Hemodialysis Registered Professi...,0.32976,0.977881
4244,FUNCTION\nThe Hemodialysis Registered Professi...,0.32976,0.977881
