### Importing data and clean

In [13]:
import os
import folder
from tqdm import tqdm

import pandas as pd
import spacy

# setup
os.chdir(folder.j_info)
# print(os.listdir())

nlp = spacy.load("en_core_web_sm")
pd.set_option("display.width", 320); pd.set_option('display.max_columns',12)


# create list of csv file from May 22, concat
dfs = []
for file in os.listdir():
    if file.endswith("csv") and "5222022" in file:
        ind_df = pd.read_csv(file, low_memory=False)
        dfs.append(ind_df)

combined = pd.concat(dfs, ignore_index=True)
combined.columns = combined.columns.str.removeprefix("lnks_")

# clean data of duplicates and those without titles or description
data = combined.drop_duplicates(subset=["job_title"]).\
           dropna(subset=["job_description", "job_title"],
                  how="any").reset_index(drop=True)
print(data.columns, data.head(), f"total jobs: {len(data)}", sep="\n")

Index(['link', 'job_title', 'company', 'company_url', 'company_location', 'job_description'], dtype='object')
                                                link                                          job_title                company                                        company_url       company_location                                    job_description
0  https://www.indeed.com/rc/clk?jk=64568c71be4aa...  Certified Pharmacy Technician II -Retail Pharmacy    Baton Rouge General  https://www.indeed.com/cmp/Baton-Rouge-General...  Baton Rouge, LA 70809  JOB PURPOSE OR MISSION: Assists pharmacists in...
1  https://www.indeed.com/rc/clk?jk=7b12bce39025f...                       Information Security Advisor                 Anthem  https://www.indeed.com/cmp/Anthem,-Inc.?campai...     Richmond, VA 23218  Description\nSHIFT: Day Job\nSCHEDULE: Full-ti...
2  https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...     Associate Director Learning Management Systems                    NaN        

### (for bonus) divide data into two sets ("selected" and "vanilla")

In [14]:
# check keywords using tokens
# this gives room to search token.lemma_ in list of key words as well, so we can have a shorter list of keyword
# question: is there a way to group words with different pos_ but with similar roots, such as comply and compliance?
key_words = ["compliance", "regulatory", "regulation"]

data_range = range(len(data))
for i in tqdm(data_range):
    job_title = data.loc[i, "job_title"].lower()
    for token in nlp(job_title):
        if token.lemma_ in key_words:
            data.loc[i, "job_type"] = "selected"

data.job_type = data.job_type.fillna(value="not_selected")
print("\n", data.head(), "\n")

selected_set, vanilla_set = data[data.job_type == "selected"], \
                              data[data.job_type == "not_selected"]

print(f"searched by key word: {key_words},\n"
      f"number of entries in data found: {len(selected_set)},\n"
      f"data:\n {selected_set}")

100%|██████████| 2541/2541 [00:15<00:00, 169.27it/s]


                                                 link                                          job_title                company                                        company_url       company_location                                    job_description      job_type
0  https://www.indeed.com/rc/clk?jk=64568c71be4aa...  Certified Pharmacy Technician II -Retail Pharmacy    Baton Rouge General  https://www.indeed.com/cmp/Baton-Rouge-General...  Baton Rouge, LA 70809  JOB PURPOSE OR MISSION: Assists pharmacists in...  not_selected
1  https://www.indeed.com/rc/clk?jk=7b12bce39025f...                       Information Security Advisor                 Anthem  https://www.indeed.com/cmp/Anthem,-Inc.?campai...     Richmond, VA 23218  Description\nSHIFT: Day Job\nSCHEDULE: Full-ti...  not_selected
2  https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...     Associate Director Learning Management Systems                    NaN                                                NaN          United States




### function to return (1) number of unique tokens and (2) top 30 most frequent words, with corresponding frequency

In [15]:
# I eliminate some of the type of token/words we might not be interested in here
excluded_classes = ["PUNCT", "SYM", "X", "SPACE", "CCONJ", "AUX", "ADP", "PART", "PRON", "DET"]


# three-part function to return dictionary of unique token frequency
# the first part outputs a list of list of token for each job, then combines into a unique set. I use .lemma_ to make sure the unique set is meaningful
# the second part outputs a list of frequency table of unique token for each job (using corresponding list of token)
# the third part combines all the dictionaries in step 2 into a dataframe and tally total freq by unique terms

# the simpler way is to join all job description into a very big body of text and create the list of token based on that
# when I run that there seems to be a cap on memory for tokens, so this code process the data by batch
# for batch unit, job (every line in the df) is used for convenience, but I think you can just divide the df into multiple dfs, it works the same

def get_token_freq(df, col):
    jobs = df[col].tolist()
    tokens_lists = []
    for job in tqdm(jobs):
        tokens = []
        for t in nlp(job):
            if t.pos_ not in excluded_classes:
                tokens.append(str.lower(t.lemma_))
        tokens_lists.append(tokens)
    unique_set = set().union(*[set(k) for k in tokens_lists])
    freq_list = []
    for alist in tqdm(tokens_lists):
        ind_freq = {}
        for t in unique_set:
            ind_freq[t] = alist.count(t)
        freq_list.append(ind_freq)
    freq_json = {}
    for key in tqdm(freq_list[0].keys()):
        freq_json[key] = [d[key] for d in freq_list]
    freq_table = pd.DataFrame.from_dict(freq_json, orient="index")
    freq_table["freq"] = freq_table.sum(axis=1)
    freq_table = freq_table.sort_values("freq", ascending=False)
    show = freq_table.iloc[ : 29, list(freq_table.columns).index("freq")]
    print(f"\nNumber of unique tokens: {len(unique_set)}"
          f"\nfrequency table for top 20 tokens outside of excluded classes:\n{show}")

### Execute for three datasets

In [16]:
get_token_freq(selected_set, "job_description")

100%|██████████| 225/225 [00:25<00:00,  8.92it/s]
100%|██████████| 225/225 [00:10<00:00, 22.49it/s]
100%|██████████| 7249/7249 [00:00<00:00, 35395.11it/s]



Number of unique tokens: 7249
frequency table for top 20 tokens outside of excluded classes:
compliance     1734
regulatory     1186
work           1020
experience      889
management      650
team            644
other           635
include         616
ensure          586
business        586
provide         573
regulation      552
ability         546
program         537
require         530
requirement     520
as              499
risk            468
support         466
skill           446
process         439
year            423
review          420
knowledge       418
quality         413
policy          411
employee        396
company         383
information     378
Name: freq, dtype: int64


In [17]:
get_token_freq(vanilla_set, "job_description")

100%|██████████| 2316/2316 [04:43<00:00,  8.18it/s]
100%|██████████| 2316/2316 [07:05<00:00,  5.45it/s]
100%|██████████| 24692/24692 [00:15<00:00, 1613.12it/s]



Number of unique tokens: 24692
frequency table for top 20 tokens outside of excluded classes:
work           12906
experience      9791
require         8826
include         8194
other           8011
provide         7445
ability         7394
as              6977
ensure          6843
management      6738
service         6578
team            6524
employee        6382
job             6122
compliance      5772
program         5660
patient         5523
perform         5501
care            5423
skill           5373
position        5146
maintain        5105
requirement     5038
need            4975
support         4962
process         4866
knowledge       4831
customer        4781
quality         4750
Name: freq, dtype: int64


In [18]:
get_token_freq(data, "job_description")

100%|██████████| 2541/2541 [04:55<00:00,  8.61it/s]
100%|██████████| 2541/2541 [07:44<00:00,  5.47it/s]
100%|██████████| 25894/25894 [00:14<00:00, 1733.12it/s]



Number of unique tokens: 25894
frequency table for top 20 tokens outside of excluded classes:
work           13926
experience     10680
require         9356
include         8810
other           8646
provide         8018
ability         7940
compliance      7506
as              7476
ensure          7429
management      7388
team            7168
service         6868
employee        6778
job             6481
program         6197
skill           5819
perform         5745
patient         5645
care            5586
requirement     5558
maintain        5464
position        5459
support         5428
process         5305
need            5287
knowledge       5249
quality         5163
regulatory      5148
Name: freq, dtype: int64


#### Short notes:
- Total number of a specific token in the total data set should be the sum of that in the other two smaller ones
- It is expected that "regulation", "compliance" are among the most frequently appeared words given that we actively search for these kinds of jobs