# Mapping

## 1. Packages

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
import re
import contractions
import string

from more_itertools import sort_together
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim import similarities
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords

## 2. Load data

In [None]:
# Labelled job postings; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/jobs_labelled.csv',
    index_col=0,
)

# Term lookup read from JSON. Later cells test `ka in term2ka[term]`, so each
# value is expected to hold the knowledge-area code(s) a term belongs to.
with open('data/reference.json', 'r') as json_file:
    term2ka = json.load(json_file)

# Knowledge-area (KA) codes, in the order used throughout the notebook.
knowledge_areas = [
    'AAA', 'AC', 'AB', 'C', 'CI', 'CPS', 'DSS', 'F', 'FMS', 'HF', 'HS',
    'LR', 'MAT', 'NS', 'OSV', 'PLT', 'POR', 'RMG', 'SOIM', 'SS', 'SSL', 'WAM',
]

# Knowledge groups that the KAs are bucketed into.
knowledge_groups = [
    'Introduction',
    'Human, Org., & Reg.',
    'Attacks & Defences',
    'Systems',
    'Software & Platform',
    'Infrastructure',
]

# KA code -> document text; starts empty and is filled in by a later cell.
ka2doc = dict.fromkeys(knowledge_areas, "")

# KA code -> name of the knowledge group it belongs to.
ka2group = {
    'AAA': "Systems",
    'AC': "Infrastructure",
    'AB': "Attacks & Defences",
    'C': "Systems",
    'CI': "Introduction",
    'CPS': "Infrastructure",
    'DSS': "Systems",
    'F': "Attacks & Defences",
    'FMS': "Systems",
    'HF': "Human, Org., & Reg.",
    'HS': "Infrastructure",
    'LR': "Human, Org., & Reg.",
    'MAT': "Attacks & Defences",
    'NS': "Infrastructure",
    'OSV': "Systems",
    'PLT': "Infrastructure",
    'POR': "Human, Org., & Reg.",
    'RMG': "Human, Org., & Reg.",
    'SOIM': "Attacks & Defences",
    'SS': "Software & Platform",
    'SSL': "Software & Platform",
    'WAM': "Software & Platform",
}

# Group name -> document text; starts empty and is filled in by a later cell.
group2doc = dict.fromkeys(knowledge_groups, "")

# KA code -> full KA title (used to relabel plot columns).
ka2long = {
    'AAA': 'Authentication, Authorisation & Accountability',
    'AC': 'Applied Cryptography',
    'AB': 'Adversarial Behaviours',
    'C': 'Cryptography',
    'CI': 'CyBOK Introduction',
    'CPS': 'Cyber-Physical Systems Security',
    'DSS': 'Distributed Systems Security',
    'F': 'Forensics',
    'FMS': 'Formal Methods for Security',
    'HF': 'Human Factors',
    'HS': 'Hardware Security',
    'LR': 'Law & Regulation',
    'MAT': 'Malware & Attack Technologies',
    'NS': 'Network Security',
    'OSV': 'Operating Systems & Virtualisation',
    'PLT': 'Physical Layer & Telecommunications Security',
    'POR': 'Privacy & Online Rights',
    'RMG': 'Risk Management & Governance',
    'SOIM': 'Security Operations & Incident Management',
    'SS': 'Software Security',
    'SSL': 'Secure Software Lifecycle',
    'WAM': 'Web & Mobile Security',
}

Colour palette for the data visualisations. Source: https://colorbrewer2.org/

In [None]:
# Twelve-colour qualitative palette shared by every chart in the notebook.
pal = [
    '#8dd3c7', '#ffffb3', '#bebada', '#fb8072',
    '#80b1d3', '#fdb462', '#b3de69', '#fccde5',
    '#d9d9d9', '#bc80bd', '#ccebc5', '#ffed6f',
]

# Preview the swatches, then make the palette the seaborn default.
sns.palplot(pal)
sns.set_theme(style="darkgrid", palette=pal)

#### Make new corpora

##### Each knowledge area (KA) as a document.

For *ka_tf-idf*.

In [None]:
# Build one space-separated "document" of terms per knowledge area.
# Each entry is rebuilt from scratch (assigned, not appended to) so that
# re-running this cell cannot duplicate terms and inflate term frequencies.
# NOTE(review): `ka in term2ka[term]` is list membership if the JSON values are
# lists, but a substring test if they are plain strings (e.g. 'C' in 'CI');
# confirm the shape of reference.json.
for ka in knowledge_areas:
    ka2doc[ka] = "".join(
        term + " " for term in term2ka if ka in term2ka[term]
    )

In [None]:
# Reverse lookup: document text -> KA code.
# Documents are used as dict keys, so if two KAs share identical text the
# KA that comes later in ka2doc wins.
doc2ka = {text: code for code, text in ka2doc.items()}

For *kg_tf-idf*.

In [None]:
# Build one space-separated "document" of terms per knowledge group by pooling
# the terms of every KA that belongs to the group (KA order, then term order).
# The text is collected in fresh lists and assigned at the end, so re-running
# this cell cannot duplicate terms and inflate term frequencies.
group_terms = {group: [] for group in group2doc}
for ka in knowledge_areas:
    group_terms[ka2group[ka]].extend(
        term + " " for term in term2ka if ka in term2ka[term]
    )

for group, parts in group_terms.items():
    group2doc[group] = "".join(parts)

In [None]:
# Reverse lookup: document text -> knowledge-group name.
# Documents are used as dict keys, so if two groups share identical text the
# group that comes later in group2doc wins.
doc2group = {text: name for name, text in group2doc.items()}

## 3. Building tf-idf representations

Utilities for building the tf-idf representations.

In [None]:
class TfidfMapper:
    """Plain container bundling the pieces of one fitted tf-idf representation.

    Attributes:
        tfidf: the fitted tf-idf model.
        dictionary: token <-> id dictionary the model was built with.
        corpus: bag-of-words vectors of the reference documents.
        documents: the reference documents themselves, in corpus order.
    """

    def __init__(self, tfidf, dictionary, corpus, documents):
        # Arguments are stored as given; nothing is copied or validated.
        self.documents = documents
        self.corpus = corpus
        self.dictionary = dictionary
        self.tfidf = tfidf

In [None]:
def preprocess(text):
    """Normalise free text before it is tokenized for tf-idf.

    Steps, in order: strip URLs, expand contractions, drop English stopwords,
    drop punctuation-only tokens, and convert to upper case.

    Args:
        text: the raw text to clean (a str).

    Returns:
        The cleaned, upper-cased text as a single string.
    """
    # remove url links (raw string: '\/' and '\S' are invalid str escapes)
    text = re.sub(r'https?://\S+', "", text)

    # expand contractions (e.g. I've > I have)
    text = contractions.fix(text)

    # remove stopwords; NLTK's list is all lower case, so compare on the
    # lower-cased token, otherwise "The", "I", "And", ... slip through
    stop_words = set(stopwords.words('english'))
    token_list = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    text = TreebankWordDetokenizer().detokenize(token_list)

    # remove punctuation: drop every token made up solely of punctuation
    # characters (a plain `token in string.punctuation` is a substring test
    # and misses tokens such as "...", "--" or the tokenizer's quote marks)
    token_list = [
        token for token in word_tokenize(text)
        if not all(char in string.punctuation for char in token)
    ]
    text = TreebankWordDetokenizer().detokenize(token_list)

    # convert to uppercase
    return text.upper()

In [None]:
def _build_tfidf(documents, calc_results):
    """Fit a tf-idf model over *documents* and optionally tabulate its weights.

    Shared implementation behind make_ka_tfidf and make_kg_tfidf.

    Args:
        documents: list of raw document strings.
        calc_results: when true, also list every (token, document, weight).

    Returns:
        (mapper, tfidf_results): a TfidfMapper holding the model, dictionary,
        corpus and the raw documents, plus a DataFrame with columns
        'token', 'document', 'tfidf' (empty when calc_results is false).
    """
    tokenized_documents = [
        word_tokenize(preprocess(document)) for document in documents
    ]

    dictionary = Dictionary(tokenized_documents)

    # bag-of-words vectors: one list of (token_id, count) pairs per document
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_documents]

    tfidf = TfidfModel(corpus)

    # The raw (un-preprocessed) documents are kept because the mapping
    # functions use them as lookup keys.
    mapper = TfidfMapper(tfidf, dictionary, corpus, documents)

    # Collect rows first and build the frame once; growing a DataFrame one
    # row at a time re-allocates on every insert.
    rows = []
    if calc_results:
        for document, bow in zip(documents, corpus):
            rows.extend(
                (dictionary[token_id], document, weight)
                for token_id, weight in tfidf[bow]
            )
    tfidf_results = pd.DataFrame(rows, columns=['token', 'document', 'tfidf'])

    return mapper, tfidf_results


def make_ka_tfidf(ka2doc, calc_results=True):
    """Build the tf-idf representation with one document per knowledge area.

    Args:
        ka2doc: mapping of KA code -> document text.
        calc_results: when true, also return the per-token weight table.

    Returns:
        (mapper, tfidf_results) as described in _build_tfidf.
    """
    # exclude the introduction KA
    documents = [ka2doc[ka] for ka in knowledge_areas if ka != 'CI']
    return _build_tfidf(documents, calc_results)


def make_kg_tfidf(group2doc, calc_results=True):
    """Build the tf-idf representation with one document per knowledge group.

    Args:
        group2doc: mapping of group name -> document text.
        calc_results: when true, also return the per-token weight table.

    Returns:
        (mapper, tfidf_results) as described in _build_tfidf.
    """
    # exclude the introductory group
    documents = [group2doc[kg] for kg in knowledge_groups if kg != 'Introduction']
    return _build_tfidf(documents, calc_results)
        

Sampling from both representations.

In [None]:
# Fit the per-knowledge-area model, then show 20 random rows of its weights.
mapper, results = make_ka_tfidf(ka2doc)
results.sample(n=20)

In [None]:
# Fit the per-knowledge-group model, then show 20 random rows of its weights.
kg_mapper, kg_results = make_kg_tfidf(group2doc)
kg_results.sample(n=20)

Mapping utilities for calculating similarity scores.

In [None]:
def _score_documents(mapper, query_doc):
    """Score *query_doc* against every reference document held by *mapper*.

    Args:
        mapper: a TfidfMapper (tfidf model, dictionary, corpus, documents).
        query_doc: raw query text; it is preprocessed here.

    Returns:
        A list of (reference document, similarity score) pairs ordered from
        the lowest score to the highest.
    """
    # Tokenize the query the same way the reference corpus was tokenized
    # (preprocess + word_tokenize) so that its tokens line up with the
    # dictionary entries.
    tokens = word_tokenize(preprocess(query_doc))

    # calc sims
    vec_bow = mapper.dictionary.doc2bow(tokens)
    vec_tfidf = mapper.tfidf[vec_bow]
    index = similarities.MatrixSimilarity(mapper.tfidf[mapper.corpus])
    sims = index[vec_tfidf]

    ranked = sorted(enumerate(sims), key=lambda item: item[1])
    return [(mapper.documents[position], score) for position, score in ranked]


def map(mapper, query_doc):
    """Map a text onto the knowledge areas.

    Note: this name shadows the builtin ``map`` for the rest of the notebook.

    Args:
        mapper: TfidfMapper built over the knowledge-area documents.
        query_doc: raw text to map (e.g. a job description).

    Returns:
        dict of KA code -> similarity score, inserted in ascending score order.
    """
    # map to kas
    return {
        doc2ka[document]: score
        for document, score in _score_documents(mapper, query_doc)
    }


def map_groups(mapper, query_doc):
    """Map a text onto the knowledge groups.

    Args:
        mapper: TfidfMapper built over the knowledge-group documents.
        query_doc: raw text to map (e.g. a job description).

    Returns:
        dict of group name -> similarity score, inserted in ascending score
        order.
    """
    # map to groups
    return {
        doc2group[document]: score
        for document, score in _score_documents(mapper, query_doc)
    }

## 4. Exploratory results

#### Illustrative mappings

In [None]:
# Postings whose title contains "Data Governance Specialist" (any case); show up to 12.
dgs_df = df[df['Title'].str.contains("Data Governance Specialist", case=False)]
dgs_df.head(12)

Note that the description is yet to be normalised (this happens as a part of the mapping).

In [None]:
# Take the first matching posting's description as the example and display it.
query_doc = dgs_df['Description'].iloc[0]
query_doc

In [None]:
# Score the example description against both representations.
dgs_ka2score = map(mapper, query_doc=query_doc)
dgs_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
def plot_mapping(ka2count, _color, _ax=None, trim=0):
    """Draw a horizontal bar chart of a mapping, largest value first.

    Args:
        ka2count: mapping of label -> numeric value (e.g. KA code -> score).
        _color: bar colour.
        _ax: axes to draw on; when None a new 11x7 figure is created.
        trim: if non-zero, only the first `trim` bars (after sorting) are kept.

    The chart is drawn exactly once, on the target axes, and the styling is
    applied to those same axes. No extra figure is opened when `_ax` is given.
    """
    if _ax is None:
        _, ax = plt.subplots(figsize=(11, 7))
    else:
        ax = _ax

    # defining data and chart labels (works for an empty mapping too)
    ranked = sorted(ka2count.items(), key=lambda item: item[1], reverse=True)
    if trim != 0:
        ranked = ranked[:trim]
    y = [label for label, _ in ranked]
    x = [value for _, value in ranked]

    axis_labels = ['no. hits', '']

    # make chart
    sns.barplot(ax=ax, x=x, y=y, color=_color)
    ax.tick_params(axis='both', labelsize=10, labelcolor='DimGrey')
    ax.spines['left'].set_color('DimGrey')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_color('DimGrey')
    ax.set_xlabel(axis_labels[0], fontsize=12, color='DimGrey', loc='left')
    ax.set_ylabel(axis_labels[1], fontsize=12, color='DimGrey', loc='top')


plot_mapping(dgs_ka2score, pal[1])

In [None]:
# Same posting, scored against the knowledge groups.
plot_mapping(dgs_group2score, _color=pal[1])

In [None]:
# Postings whose title contains "Full Stack Developer - Fully remote" (any case); show up to 12.
dev1_df = df[df['Title'].str.contains("Full Stack Developer - Fully remote", case=False)]
dev1_df.head(12)

In [None]:
# Score the first matching developer posting and chart its KA mapping.
query_doc = dev1_df['Description'].iloc[0]
dev1_ka2score = map(mapper, query_doc=query_doc)
dev1_group2score = map_groups(kg_mapper, query_doc=query_doc)
plot_mapping(dev1_ka2score, _color=pal[0])

In [None]:
# Same posting, scored against the knowledge groups.
plot_mapping(dev1_group2score, _color=pal[0])

In [None]:
# Postings whose title contains "IT Support" (any case); show up to 12.
it_df = df[df['Title'].str.contains("IT Support", case=False)]
it_df.head(12)

In [None]:
# Score the first matching IT-support posting against both representations.
query_doc = it_df['Description'].iloc[0]
it_ka2score = map(mapper, query_doc=query_doc)
it_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
# Postings whose title contains "Software Test Engineer" (any case); show up to 12.
qa_df = df[df['Title'].str.contains("Software Test Engineer", case=False)]
qa_df.head(12)

In [None]:
# Score the first matching test-engineer posting against both representations.
query_doc = qa_df['Description'].iloc[0]
qa_ka2score = map(mapper, query_doc=query_doc)
qa_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
# Postings whose title contains "Cloud DevOps" (any case); show up to 12.
devops_df = df[df['Title'].str.contains("Cloud DevOps", case=False)]
devops_df.head(12)

In [None]:
# Score the first matching DevOps posting against both representations.
query_doc = devops_df['Description'].iloc[0]
devops_ka2score = map(mapper, query_doc=query_doc)
devops_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
# Postings whose title contains "Senior Data Scientist" (any case); show up to 12.
data_df = df[df['Title'].str.contains("Senior Data Scientist", case=False)]
data_df.head(12)

In [None]:
# Score the first matching data-scientist posting against both representations.
query_doc = data_df['Description'].iloc[0]
data_ka2score = map(mapper, query_doc=query_doc)
data_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
# Postings whose title contains "IT Security Analyst" (any case); show up to 12.
sec_df = df[df['Title'].str.contains("IT Security Analyst", case=False)]
sec_df.head(12)

In [None]:
# Score the first matching security-analyst posting against both representations.
query_doc = sec_df['Description'].iloc[0]
sec_ka2score = map(mapper, query_doc=query_doc)
sec_group2score = map_groups(kg_mapper, query_doc=query_doc)

In [None]:
# Postings whose title contains "Agile Delivery Manager" (any case); show up to 12.
man_df = df[df['Title'].str.contains("Agile Delivery Manager", case=False)]
man_df.head(12)

In [None]:
# Score the first matching delivery-manager posting against both representations.
query_doc = man_df['Description'].iloc[0]
man_ka2score = map(mapper, query_doc=query_doc)
man_group2score = map_groups(kg_mapper, query_doc=query_doc)

Make radar plot.

In [None]:
def quick_normalise(a):
    """Min-max rescale the list *a* in place so its values span [0, 1].

    Args:
        a: mutable sequence of numbers; it is modified in place.

    Returns:
        None.

    An empty list is left untouched. When every value is equal there is no
    range to scale by, so all entries are set to 0.0 instead of dividing by
    zero.
    """
    if not a:
        return

    amin, amax = min(a), max(a)
    span = amax - amin
    for i, val in enumerate(a):
        a[i] = (val - amin) / span if span else 0.0

# Fix the axis order once (taken from the Dev mapping) and repeat the first
# category at the end so each polygon closes.
base_categories = list(dev1_group2score.keys())
categories = [*base_categories, base_categories[0]]


def _radar_series(group2score):
    """Return the normalised scores of *group2score* in `categories` order.

    Each score dict is ordered by its own ascending score, so the values must
    be looked up by category name; taking `.values()` directly would pair the
    scores with the wrong axes for every posting except the first.
    """
    series = [group2score[category] for category in categories]
    quick_normalise(series)
    return series


dev = _radar_series(dev1_group2score)
dgs = _radar_series(dgs_group2score)
it = _radar_series(it_group2score)
qa = _radar_series(qa_group2score)
devops = _radar_series(devops_group2score)
man = _radar_series(man_group2score)
sec = _radar_series(sec_group2score)
data = _radar_series(data_group2score)

fig = go.Figure(
    data=[
        go.Scatterpolar(r=dev, theta=categories, name='Dev', line_color=pal[0], line_width=5, marker_size=12),
        go.Scatterpolar(r=dgs, theta=categories, name='Other', line_color=pal[1], line_width=5, marker_size=12),
        go.Scatterpolar(r=it, theta=categories, name='IT', line_color=pal[7], line_width=5, marker_size=12),
        go.Scatterpolar(r=qa, theta=categories, name='QA', line_color=pal[6], line_width=5, marker_size=12),
        go.Scatterpolar(r=devops, theta=categories, name='DevOps', line_color=pal[3], line_width=5, marker_size=12),
        go.Scatterpolar(r=man, theta=categories, name='Man', line_color=pal[2], line_width=5, marker_size=12),
        go.Scatterpolar(r=sec, theta=categories, name='Security', line_color=pal[4], line_width=5, marker_size=12),
        go.Scatterpolar(r=data, theta=categories, name='Data', line_color=pal[5], line_width=5, marker_size=12),
    ],
    layout=go.Layout(
        title=go.layout.Title(text='Illustrative mappings'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

pyo.plot(fig)

Make subplot.

In [None]:
# 4x2 grid: one panel per illustrative posting.
fig, axes = plt.subplots(4, 2, figsize=(8.27, 16.69))

# number of top-scoring knowledge areas shown in each panel
trim = 6

# (panel title, KA scores, bar colour), listed row by row
panels = [
    ('Dev', dev1_ka2score, pal[0]),
    ('Other', dgs_ka2score, pal[1]),
    ('IT', it_ka2score, pal[7]),
    ('QA', qa_ka2score, pal[6]),
    ('DevOps', devops_ka2score, pal[3]),
    ('Manager', man_ka2score, pal[2]),
    ('Security', sec_ka2score, pal[4]),
    ('Data', data_ka2score, pal[5]),
]

for panel_ax, (panel_title, panel_scores, panel_colour) in zip(axes.flat, panels):
    panel_ax.set_title(panel_title)
    plot_mapping(panel_scores, panel_colour, _ax=panel_ax, trim=trim)

#### Plot avg. mappings (by type)

In [None]:
def map_type(job_type):
    """Collapse a raw job-type label into one of the plotted categories.

    Labels that are not listed in the table are returned unchanged. Matching
    is exact (case-sensitive).
    """
    # (raw labels, category they collapse into)
    groupings = (
        (('IR', 'Security analyst', 'Sys Admin'), 'Security'),
        (('Se', 'SE'), 'Dev'),
        (('Management',), 'Manager'),
        (('Business analyst',), 'Data'),
        (('Sales', 'Researcher', 'Design', 'Consultant'), 'Other'),
    )
    for raw_labels, category in groupings:
        if job_type in raw_labels:
            return category
    return job_type
    

# Collapse the raw type labels into the plotted categories.
df['Job Type'] = df['Type'].apply(map_type)

# One record per posting: its knowledge-group scores plus its job type.
data = [
    {**map_groups(kg_mapper, description), 'Job Type': job_type}
    for description, job_type in zip(df['Description'], df['Job Type'])
]

combined_df = pd.DataFrame.from_dict(data)

# Relabel any column that is a KA code with its long title (the group-name
# columns produced here are not keys of ka2long, so they keep their names).
combined_df = combined_df.rename(columns=ka2long)

# Long format: one row per (posting, column) pair, ready for seaborn.
melt_df = combined_df.melt(id_vars=['Job Type'])
melt_df.sample(5)

In [None]:
# Score distribution per knowledge group, split by job type.
plt.figure(figsize=(8.27, 11.69))

axis_labels = ['Similarity Score', '']
data_labels = []

# grey tick labels and a minimal frame (left/bottom spines only)
plt.xticks(fontsize=10, color='DimGrey')
plt.yticks(fontsize=10, color='DimGrey')
for side in ('left', 'bottom'):
    plt.gca().spines[side].set_color('DimGrey')
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

sns.boxenplot(
    data=melt_df,
    orient='h',
    width=.75,
    x='value',
    y='variable',
    hue='Job Type',
    showfliers=False,
    palette=[pal[0], pal[6], pal[3], pal[2], pal[4], pal[5], pal[1], pal[7]],
)

plt.xlabel(axis_labels[0], fontsize=12, color='DimGrey', loc='left')
plt.ylabel(axis_labels[1], fontsize=12, color='DimGrey', loc='top')

In [None]:
# One record per posting: its knowledge-area scores plus its job type.
data = [
    {**map(mapper, description), 'Job Type': job_type}
    for description, job_type in zip(df['Description'], df['Job Type'])
]

combined_df = pd.DataFrame.from_dict(data)

# Replace the KA codes in the column names with their long titles.
combined_df = combined_df.rename(columns=ka2long)

# Long format: one row per (posting, column) pair, ready for seaborn.
melt_df = combined_df.melt(id_vars=['Job Type'])
melt_df.sample(5)

In [None]:
# Score distribution per knowledge area, split by job type.
plt.figure(figsize=(8.27, 11.69))

axis_labels = ['Similarity Score', '']
data_labels = []

# grey tick labels and a minimal frame (left/bottom spines only)
plt.xticks(fontsize=10, color='DimGrey')
plt.yticks(fontsize=10, color='DimGrey')
for side in ('left', 'bottom'):
    plt.gca().spines[side].set_color('DimGrey')
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

sns.boxenplot(
    data=melt_df,
    orient='h',
    width=.75,
    x='value',
    y='variable',
    hue='Job Type',
    showfliers=False,
)

plt.xlabel(axis_labels[0], fontsize=12, color='DimGrey', loc='left')
plt.ylabel(axis_labels[1], fontsize=12, color='DimGrey', loc='top')