In [6]:
import os
import re
import time

import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
from nltk import ngrams
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Transformers and related libraries
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel

#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load the job ads dataset and ESCO skills ontology
I removed duplicates from the job ads. The dataset is still very big, so I won't be using all the entries.
<br>For the ESCO skills ontology, I will only use the preferred labels and treat all entries as skills, even though ESCO divides them into subcategories.

In [7]:
JOBS_FP = 'marketing_sample_for_trulia_com-real_estate__20190901_20191031__30k_data.csv'
ESCO_SKILLS_FP = 'skills_en.csv'


def _strip_parenthesised(label):
    """Remove every '(...)' group from a label and trim the surrounding whitespace."""
    return re.sub(r'\([^)]*\)', '', label).strip()


# Job ads: keep only the first row for each distinct description.
# The original row labels are kept (no index reset).
df = pd.read_csv(JOBS_FP)
df = df.drop_duplicates(subset=['Job Description'], keep='first')

# ESCO skills: derive a cleaned label and its whitespace-separated word count,
# then keep only the three columns used further down.
esco_df = pd.read_csv(ESCO_SKILLS_FP)
esco_df['label_cleaned'] = esco_df['preferredLabel'].map(_strip_parenthesised)
esco_df['word_cnt'] = [len(str(label).split()) for label in esco_df['label_cleaned']]
esco_df = pd.DataFrame(esco_df, columns=['label_cleaned', 'altLabels', 'word_cnt'])

In [3]:
# Sanity check: (number of ESCO skill rows, number of kept columns)
esco_df.shape

(13896, 3)

In [8]:
class EscoDataset(Dataset):
    """
    Torch dataset that tokenizes one text column of a DataFrame on demand.

    Parameters
    ----------
    df : DataFrame holding the texts.
    skill_col : name of the column whose values are tokenized.
    backbone : Hugging Face checkpoint name used to load the tokenizer.
    max_length : every item is padded/truncated to this many tokens
        (default 20, the value that used to be hard-coded). Raise it if
        labels longer than that must not be cut off.
    """

    def __init__(self, df, skill_col, backbone, max_length=20):
        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
        self.texts = df[skill_col].values.tolist()
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # The tokenizer returns tensors with a leading dimension of one for a
        # single text; drop it so the DataLoader can stack items into batches.
        return {key: value[0] for key, value in encoded.items()}

    
class ClsPool(nn.Module):
    """Pooling layer that keeps only the vector at the first token position."""

    def forward(self, x):
        # x is laid out as batch * num_tokens * num_embedding;
        # keep index 0 along the token axis for every batch item.
        first_token = 0
        return x[:, first_token, :]

    
class BertModel(nn.Module):
    """
    Encoder wrapper: runs a pretrained backbone and pools its last hidden
    state into one vector per input sequence.
    """

    def __init__(self, backbone):
        super().__init__()

        self.backbone_name = backbone
        self.backbone = AutoModel.from_pretrained(backbone)
        self.pool = ClsPool()

    def forward(self, x):
        # x is a mapping of tokenizer outputs, unpacked as keyword arguments
        hidden_states = self.backbone(**x)["last_hidden_state"]
        return self.pool(hidden_states)

## JobBERT
JobBERT is a BERT model pre-trained on job postings. It comes from a paper whose authors concluded that a domain-specific pre-trained model outperforms the non-adapted versions, and they published their model on [Huggingface](https://huggingface.co/jjzha/jobbert-base-cased). I will use it to create embeddings of ESCO skills, then embed the job postings and find relevant ESCO skills using vector similarity.

In [5]:
backbone = 'jjzha/jobbert-base-cased'
emb_label = 'jobbert'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenized ESCO labels, served in row order so embeddings line up with esco_df
ds = EscoDataset(esco_df, 'label_cleaned', backbone)
dl = DataLoader(ds, batch_size=32, shuffle=False)

# Encoder in inference mode on the selected device
model = BertModel(backbone)
model.to(device)
model.eval()

# Collect one CPU embedding tensor per ESCO skill
embs = []
with torch.no_grad():
    for batch in dl:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        embs.extend(model(batch).detach().cpu())

# Store the embeddings next to their labels
esco_df[emb_label] = embs

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def get_sentences(job):
    """
    Split a job description into sentences.

    The input may be raw HTML or plain text. Markup is removed with
    BeautifulSoup, the text is split with nltk's sentence tokenizer and each
    sentence is split again on new lines.

    Parameters
    ----------
    job : str (or bytes) job description; any other value (e.g. a NaN from
        the CSV) yields an empty list.

    Returns
    -------
    list of str : non-empty sentences with surrounding whitespace removed.
    """
    # A missing description has no sentences; avoid crashing the parser on it
    if not isinstance(job, (str, bytes)):
        return []
    soup = BeautifulSoup(job, 'html.parser')
    # Some ads use unicode bullet points. Strip them from the whole text so
    # bullets in list items and in plain-text ads are removed too, not only
    # those inside <p> tags.
    text = soup.get_text().replace("•", "")
    sentences = []
    for sent in sent_tokenize(text):
        for line in sent.split('\n'):
            line = line.strip()
            # Skip blank and whitespace-only fragments: they carry no skill
            # and would otherwise be embedded and matched like real sentences.
            if line:
                sentences.append(line)
    return sentences

def compute_similarity(vec, emb_type):
    """
    Baseline (one-by-one) similarity search over the ESCO skills.

    Scores `vec` against each embedding stored in the `emb_type` column of
    the global `esco_df` and returns `(position, similarity)` of the best
    match. On ties the first best match wins.
    """
    # Score the query against every ESCO embedding, one pair at a time
    scored = [
        (position, cosine_similarity(vec, skill_vec.reshape(1, -1)))
        for position, skill_vec in enumerate(esco_df[emb_type])
    ]
    # Pick the pair with the highest similarity
    best_position, best_score = max(scored, key=lambda pair: pair[1])
    return best_position, best_score.item()


def compute_similarity_opt(emb_vec, emb_type):
    """
    Vectorized similarity search for a single embedding.

    Stacks the embeddings from the `emb_type` column of the global `esco_df`
    into one matrix and compares `emb_vec` with all of them in a single
    matrix product. Returns `(position, similarity)` of the best match.
    """
    skill_matrix = torch.stack(list(esco_df[emb_type]))
    # Unit-length rows for the skills, unit-length column for the query
    unit_skills = torch.nn.functional.normalize(skill_matrix, p=2, dim=1)
    unit_query = torch.nn.functional.normalize(emb_vec.T, p=2, dim=0)
    # On unit vectors the dot product is the cosine similarity
    scores = unit_skills @ unit_query
    best = torch.max(scores, dim=0)
    return best.indices.item(), best.values.item()

def compute_similarity_mat(emb_mat, emb_type):
    """
    Vectorized similarity search for several embeddings at once.

    Parameters
    ----------
    emb_mat : list of 1-D tensors, one embedding per query text.
    emb_type : column of the global `esco_df` that holds the ESCO embeddings.

    Returns
    -------
    (indices, similarities) : two numpy arrays with one entry per query,
        giving the position of the most similar ESCO skill and its cosine
        similarity. Both arrays are empty when `emb_mat` is empty.
    """
    # torch.stack cannot handle an empty list; nothing to match means no results
    if len(emb_mat) == 0:
        return torch.empty(0, dtype=torch.long).numpy(), torch.empty(0).numpy()

    esco_vectors = torch.stack(list(esco_df[emb_type]))
    emb_vectors = torch.stack(emb_mat)
    # Normalize the stacked embeddings (rows) and the input vectors (columns)
    norm_esco_vectors = torch.nn.functional.normalize(esco_vectors, p=2, dim=1)
    norm_emb_vecs = torch.nn.functional.normalize(emb_vectors.T, p=2, dim=0)
    # Compute cosine similarities: one row per ESCO skill, one column per query
    cos_similarities = torch.matmul(norm_esco_vectors, norm_emb_vecs)
    # Best ESCO skill (and its similarity) for every query column
    max_similarities, max_indices = torch.max(cos_similarities, dim=0)
    return max_indices.numpy(), max_similarities.numpy()

In [7]:
def get_embedding(x, max_length=512):
    """
    Embed a text with the global `tokenizer` and `model`.

    Parameters
    ----------
    x : text to embed.
    max_length : inputs longer than this many tokens are truncated so a very
        long sentence cannot overflow the encoder. The default of 512 assumes
        a BERT-base style backbone - TODO confirm against the model config.

    Returns
    -------
    The model output, detached and moved to the CPU.
    """
    encoded = tokenizer(x, return_tensors='pt', truncation=True, max_length=max_length)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        return model(encoded).detach().cpu()

def process_sentence(sent):
    """Embed one sentence and return (position, similarity) of its closest ESCO skill."""
    return compute_similarity_opt(get_embedding(sent), emb_label)

# Tokenizer and encoder used by get_embedding for ad-hoc texts
tokenizer = AutoTokenizer.from_pretrained(backbone)
model = BertModel(backbone)
model.eval()
model.to(device)

# Used in performance optimization and output example
threshold = 0.8
job_sample = df['Job Description'].iloc[15]

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Performance optimization
By improving the similarity calculation (using tensor operations) on one random job sample, the processing time for all the sentences went down from around 160 seconds to 2 seconds. (A different sample showed improvement from 300 to 5 seconds. The time improvement value is not exact but the improvement is significant)

In [8]:
sentences = get_sentences(job_sample)

# Simple similarity: every sentence is compared with the ESCO skills one by one
sim_start_time = time.time()
for sent in sentences:
    emb = get_embedding(sent)
    idx, sim = compute_similarity(emb.numpy(), emb_label)
sim_end_time = time.time()

execution_time = sim_end_time - sim_start_time
print(f"One-by-one similarity execution time: {execution_time:.4f} seconds")

# Optimized similarity: one matrix product per sentence
sim_start_time = time.time()
for sent in sentences:
    emb = get_embedding(sent)
    idx, sim = compute_similarity_opt(emb, emb_label)
sim_end_time = time.time()

execution_time = sim_end_time - sim_start_time
print(f"Optimized similarity execution time: {execution_time:.4f} seconds")

One-by-one similarity execution time: 177.9822 seconds
Optimized similarity execution time: 1.3570 seconds


### Further optimization
I edited the method once again to compute the similarity between two matrices (a matrix of sentence embeddings and a matrix of ESCO embeddings) in one operation. This further reduced the processing time.

In [9]:
sentences = get_sentences(job_sample)

sim_start_time = time.time()
# Embed every sentence first, then match them all in a single call
sent_embs = [get_embedding(sent).squeeze() for sent in sentences]
idxs, sims = compute_similarity_mat(sent_embs, emb_label)
# Calculate job description processing time
sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")

Execution time: 0.7253 seconds


**Example with ESCO mapping outputs**

I am using a threshold parameter to filter out unlikely matches. We can see that the sentence-wise detection made some mistakes, but the mapped skills are not entirely irrelevant. I also display timings for both "fast" similarity calculation approaches.

In [10]:
sim_start_time = time.time()
sentences = get_sentences(job_sample)
# Best ESCO match per sentence, keeping only matches above the threshold
matches = ((sent, *process_sentence(sent)) for sent in sentences)
res = [
    (sent, esco_df.iloc[idx]['label_cleaned'], sim)
    for sent, idx, sim in matches
    if sim > threshold
]

sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")

for sentence, skill, score in res:
    print('=========================')
    print(f"sentence: {sentence}\nESCO skill:{skill}\nSimilarity:{score:.4f}")

Execution time: 1.6109 seconds
sentence: Contribute to & maintain open source projects
ESCO skill:operate open source software
Similarity:0.8464
sentence: Familiar with agile methodologies
ESCO skill:Agile development
Similarity:0.8328
sentence: Experience with design and development of backend services
ESCO skill:implement front-end website design
Similarity:0.8274
sentence: Experience with software testing methodologies
ESCO skill:levels of software testing
Similarity:0.8499
sentence: Contributions to open-source projects
ESCO skill:Open source model
Similarity:0.8590


In [11]:
# Display the sentences extracted from the sample job description
get_sentences(job_sample)

['Kochava builds real-time tracking and attribution analytics tools for connected devices; serving the world’s top brands and apps.',
 'We analyze millions of requests every single day and are ramping up at an extraordinary pace to serve billions of requests every day.',
 'The company is growing fast as we add new clients and services and we are looking to add talented, dedicated and innovative people who will strengthen our core team.',
 "We're looking for a Senior Software Engineer to function as a central member of Kochava’s XCHNG development team.",
 'The digital advertising blockchain market is challenging, dynamic, fun, and provides almost unlimited opportunities for personal and professional growth.',
 'We are looking for dedicated team players who are comfortable with self-direction, and inspired by the thrill of building creative solutions to challenging problems.',
 'Several of our programmers are self-taught, a degree is not a necessity.',
 'A love of programming and ability

In [12]:
sim_start_time = time.time()
sentences = get_sentences(job_sample)
sent_embs = [get_embedding(sent).squeeze() for sent in sentences]

# Match all sentences in one call, then keep those above the threshold
idxs, sims = compute_similarity_mat(sent_embs, emb_label)
res = [
    (sentences[i], esco_df.iloc[idx]['label_cleaned'], sim)
    for i, (idx, sim) in enumerate(zip(idxs, sims))
    if sim > threshold
]

sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")

for sentence, skill, score in res:
    print('=========================')
    print(f"sentence: {sentence}\nESCO skill:{skill}\nSimilarity:{score:.4f}")

Execution time: 0.4956 seconds
sentence: Contribute to & maintain open source projects
ESCO skill:operate open source software
Similarity:0.8464
sentence: Familiar with agile methodologies
ESCO skill:Agile development
Similarity:0.8328
sentence: Experience with design and development of backend services
ESCO skill:implement front-end website design
Similarity:0.8274
sentence: Experience with software testing methodologies
ESCO skill:levels of software testing
Similarity:0.8499
sentence: Contributions to open-source projects
ESCO skill:Open source model
Similarity:0.8590


I tried both JobBERT and EscoXLMR, but it seemed to me that EscoXLMR had some problems with representing found spans correctly. Moreover, I am already using JobBERT embeddings, so I opted for this model.

In [13]:
def get_classifiers(mtype):
    """
    Build the skill and knowledge token-classification pipelines.

    Parameters
    ----------
    mtype : "jobbert" or "xlmr", selecting which pair of checkpoints to load.

    Returns
    -------
    (token_skill_classifier, token_knowledge_classifier)

    Raises
    ------
    ValueError : if `mtype` is not one of the supported names.
    """
    # (skill checkpoint, knowledge checkpoint) for every supported model family
    checkpoints = {
        "jobbert": ("jjzha/jobbert_skill_extraction", "jjzha/jobbert_knowledge_extraction"),
        "xlmr": ("jjzha/escoxlmr_skill_extraction", "jjzha/escoxlmr_knowledge_extraction"),
    }
    # Validate before touching the device or downloading anything
    if not isinstance(mtype, str) or mtype not in checkpoints:
        raise ValueError(f"Unknown model name provided: {mtype!r}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    skill_checkpoint, knowledge_checkpoint = checkpoints[mtype]
    token_skill_classifier = pipeline(model=skill_checkpoint, aggregation_strategy="first", device=device)
    token_knowledge_classifier = pipeline(model=knowledge_checkpoint, aggregation_strategy="first", device=device)
    return token_skill_classifier, token_knowledge_classifier


def extract_skills(job, token_skill_classifier, token_knowledge_classifier, out_treshold=.8, sim_threshold=.8):
    """
    Extract skills from a job description with the token-classification
    models and map each detected span to its closest ESCO skill.

    Parameters
    ----------
    job : job description (HTML or plain text).
    token_skill_classifier, token_knowledge_classifier : classifiers passed
        through to `ner`.
    out_treshold : minimum classifier score for a detected span to be kept
        (the parameter name keeps its historical spelling).
    sim_threshold : minimum cosine similarity for an ESCO match to be kept.

    Returns
    -------
    (pred_labels, res) : positions of the matched ESCO skills, and
        (span text, ESCO label, similarity) tuples. Both lists are empty
        when no span passes `out_treshold`.
    """
    skill_embs = []
    skill_texts = []
    for sent in get_sentences(job):
        found = ner(sent, token_skill_classifier, token_knowledge_classifier)
        for entity in found['entities']:
            if entity['score'] > out_treshold:
                text = entity['word']
                skill_embs.append(get_embedding(text).squeeze())
                skill_texts.append(text)

    pred_labels = []
    res = []
    # Nothing detected: an empty list cannot be stacked into a matrix, so
    # return the empty result instead of crashing in the similarity step.
    if not skill_embs:
        return pred_labels, res

    idxs, sims = compute_similarity_mat(skill_embs, emb_label)
    for text, idx, sim in zip(skill_texts, idxs, sims):
        if sim > sim_threshold:
            pred_labels.append(idx)
            res.append((text, esco_df.iloc[idx]['label_cleaned'], sim))
    return pred_labels, res


def aggregate_span(results):
    """
    Merge neighbouring spans into one.

    A span is joined to the previous one when its "start" is exactly one
    position after the previous "end" (i.e. the two are separated by a single
    character). The merged span keeps the first span's other fields, joins
    the "word" values with a space and takes the later "end".

    Parameters
    ----------
    results : list of dicts with at least "word", "start" and "end" keys,
        in text order.

    Returns
    -------
    A new list of merged span dicts; the input dicts are left untouched.
    An empty input gives an empty list.
    """
    if not results:
        return []

    merged = []
    # Work on copies so the caller's dicts are not modified while merging
    current = dict(results[0])
    for result in results[1:]:
        if result["start"] == current["end"] + 1:
            current["word"] += " " + result["word"]
            current["end"] = result["end"]
        else:
            merged.append(current)
            current = dict(result)

    merged.append(current)
    return merged


def ner(text, token_skill_classifier, token_knowledge_classifier):
    """
    Run both token classifiers on `text` and collect their spans.

    Spans carrying an "entity_group" key are relabelled with an "entity" key
    ("Skill" or "Knowledge"), neighbouring spans are merged with
    `aggregate_span`, and the result is returned as
    {"text": text, "entities": skill spans followed by knowledge spans}.
    """
    def _relabel(spans, tag):
        # Replace the model's group label with our own entity tag
        for span in spans:
            if span.get("entity_group"):
                span["entity"] = tag
                del span["entity_group"]
        return spans

    skill_spans = _relabel(token_skill_classifier(text), "Skill")
    knowledge_spans = _relabel(token_knowledge_classifier(text), "Knowledge")

    entities = []
    for spans in (skill_spans, knowledge_spans):
        # Only merge when something was found
        entities.extend(aggregate_span(spans) if len(spans) > 0 else spans)
    return {"text": text, "entities": entities}

We can see that this approach catches many more skills, as it works at the token level. Most of them are also correct.

In [14]:
# Skill and knowledge classifiers (loaded outside the timed section)
tsc, tkc = get_classifiers("jobbert")

start_time = time.time()
_, res = extract_skills(job_sample, tsc, tkc)
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")
for span_text, skill, score in res:
    print('=========================')
    print(f"text: {span_text}\nESCO skill:{skill}\nSimilarity:{score:.4f}")

config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/jjzha/jobbert_knowledge_extraction/resolve/main/tokenizer.json: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out.
Trying to resume download...


tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\rayen\anaconda3\Lib\logging\__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\rayen\anaconda3\Lib\logging\__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\rayen\anaconda3\Lib\logging\__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\rayen\anaconda3\Lib\logging\__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<fro

Execution time: 1.4607 seconds
text: self - direction
ESCO skill:self-promote
Similarity:0.8803
text: building creative solutions to challenging problems
ESCO skill:create solutions to problems
Similarity:0.8554
text: work in a team
ESCO skill:work in a landscape team
Similarity:0.9260
text: Contribute to & maintain open source projects
ESCO skill:operate open source software
Similarity:0.8464
text: open source projects
ESCO skill:Open source model
Similarity:0.9168
text: software engineering
ESCO skill:computer engineering
Similarity:0.8377
text: leading a team
ESCO skill:lead a team
Similarity:0.9468
text: agile methodologies
ESCO skill:Agile development
Similarity:0.8723
text: design and development of backend services
ESCO skill:implement front-end website design
Similarity:0.8291
text: backend
ESCO skill:JavaScript
Similarity:0.8135
text: Java
ESCO skill:Java
Similarity:1.0000
text: Python
ESCO skill:Python
Similarity:1.0000
text: Javascript
ESCO skill:JavaScript
Similarity:0.9068

### Small labelled data subset
My task was to label a subset of data. I found the task harder than I expected, so I ended up manually labelling only 4 randomly selected job ads. For the labelling approach, I chose to use index names of ESCO skills as labels. This might mean that some skills that are present in the description will not be picked up, but the outputs will be more uniform.

Another approach I thought about was to label the data at span level with the help of a labelling tool, to make it faster and easier. I would then use just the parts of text detected as skills instead of ESCO terms. This might have helped to better extract concrete skills rather than generalized ontology terms.

In [15]:
def calculate_metrics(preds, labels):
    """
    Print precision, recall and F1-score pooled over all entries.

    `preds` maps an entry key to its predicted labels; `labels` is indexed
    with the same keys and gives the expected labels. True positives, false
    positives and false negatives are summed over every entry of `preds`
    before the scores are computed. Nothing is returned.
    """
    tp = fp = fn = 0
    for key, predicted in preds.items():
        target = labels[key]
        # Predictions found in the target are hits, the rest are false positives
        hits = sum(1 for label in predicted if label in target)
        tp += hits
        fp += len(predicted) - hits
        # Expected labels that were never predicted
        fn += sum(1 for label in target if label not in predicted)

    # Scores over the pooled counts; a zero denominator gives a score of 0
    predicted_total = tp + fp
    expected_total = tp + fn
    precision = tp / predicted_total if predicted_total > 0 else 0
    recall = tp / expected_total if expected_total > 0 else 0
    score_sum = precision + recall
    f1_score = 2 * (precision * recall) / score_sum if score_sum > 0 else 0

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}")

In [16]:
# Four manually labelled job ads, selected by their row labels in df.
# .copy() makes df_sample independent of df, so adding the 'labels' column
# cannot trigger a SettingWithCopyWarning or touch the full dataset.
df_sample = df.loc[[5, 211, 434, 6141]].copy()
# One list of ESCO skill index values per sampled ad, in the same order as above
df_sample['labels'] = [
    [7049, 4850, 5814, 6104, 2180, 8242, 4893, 13032, 3453, 11317, 2966, 13431, 3654, 8186, 6224, 6762],
    [233, 6498, 9743, 4922, 3673],
    [13557, 5734, 7203, 1166, 1121, 8793],
    [1370, 11127, 4544, 3338, 6670, 699, 6667, 521, 680, 6535]
]
df_sample[['Job Description', 'labels']].to_csv("df_sample_labeled.csv")

# Show each job title next to the readable names of its labelled skills
for _, s in df_sample.iterrows():
    esco_skills = [esco_df.loc[x, 'label_cleaned'] for x in s['labels']]
    print(s['Job Title'], '\n', esco_skills, "\n-----------------------------")

Cyber IT Risk & Strategy Senior Consultant 
 ['cyber security', 'cyber attack counter-measures', 'implement ICT risk management', 'risk management', 'perform business analysis', 'apply risk management processes', 'perform church service', 'DevOps', "assess risks of clients' assets", 'present reports', 'develop information security strategy', 'implement strategic planning', 'identify technological needs', 'consult with business clients', 'project management', 'advise client on technical possibilities'] 
-----------------------------
AV Systems Drawings AutoCAD Engineer 
 ['design electrical systems', 'create AutoCAD drawings', 'use CAD software', 'technical drawings', 'mathematics'] 
-----------------------------
Pre-owned Auto Sales Consultant 
 ['advise customers on motor vehicles', "identify customer's needs", 'satisfy customers', 'communication', 'Spanish', 'understand spoken Spanish'] 
-----------------------------
Senior Java Developer 
 ['Java', 'Python', 'Groovy', 'JavaScript', 

In [17]:
threshold = .8
preds = {}
res_log = {}
sim_start_time = time.time()
for index, s in df_sample.iterrows():
    sentences = get_sentences(s['Job Description'])
    sent_embs = [get_embedding(sent).squeeze() for sent in sentences]

    idxs, sims = compute_similarity_mat(sent_embs, emb_label)
    # Positions of the sentences whose best match clears the threshold
    kept = [i for i in range(len(idxs)) if sims[i] > threshold]
    pred_labels = [idxs[i] for i in kept]
    res = [(sentences[i], esco_df.iloc[idxs[i]]['label_cleaned'], sims[i]) for i in kept]
    # Save results (deduplicated)
    preds[index] = list(set(pred_labels))
    res_log[index] = list(set(res))
# Calculate job description processing time
sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")
calculate_metrics(preds, df_sample['labels'])

Execution time: 1.6414 seconds
Precision: 0.1538, Recall: 0.0541, F1-score: 0.0800


In [18]:
# Thresholds for the token-level approach: classifier score and ESCO similarity
out_treshold = .8
sim_treshold = .8
preds = {}
res_log = {}
start_time = time.time()
for index, s in df_sample.iterrows():
    # Pass the thresholds explicitly; otherwise extract_skills silently uses
    # its own defaults and editing the values above has no effect.
    pred_labels, res = extract_skills(
        s['Job Description'],
        tsc,
        tkc,
        out_treshold=out_treshold,
        sim_threshold=sim_treshold,
    )
    # Save results (deduplicated)
    preds[index] = list(set(pred_labels))
    res_log[index] = list(set(res))
# Calculate job description processing time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")
calculate_metrics(preds, df_sample['labels'])

Execution time: 4.7214 seconds
Precision: 0.3167, Recall: 0.5135, F1-score: 0.3918


## Testing on Nesta skill extractor library

The Nesta skill extractor library uses the same architecture and basically functions the same way; the only problem is that it captures vague and general topics, unlike our customized model.

In [22]:
pip install git+https://github.com/nestauk/ojd_daps_skills.git@dev

Collecting git+https://github.com/nestauk/ojd_daps_skills.git@dev
  Cloning https://github.com/nestauk/ojd_daps_skills.git (to revision dev) to /tmp/pip-req-build-qop_76cg
  Running command git clone --filter=blob:none --quiet https://github.com/nestauk/ojd_daps_skills.git /tmp/pip-req-build-qop_76cg
  Resolved https://github.com/nestauk/ojd_daps_skills.git to commit 6b94921f3173ed6686d574aad76567013cd24b89
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [23]:
# Download the `en_core_web_sm` spaCy package through a shell command.
# NOTE(review): `!python` may resolve to a different interpreter than the
# notebook kernel - confirm the package lands in the kernel's environment.
!python -m spacy download en_core_web_sm


[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl#egg=en_core_web_sm==3.4.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [24]:
from ojd_daps_skills.pipeline.extract_skills.extract_skills import ExtractSkills

# Two toy job adverts used as a quick smoke test of the extractor.
job_adverts = [
    "The job involves communication skills and maths skills",
    "The job involves Excel skills. You will also need good presentation skills",
]

# Instantiate the extractor with the toy taxonomy configuration file,
# then load the models it needs before any extraction call.
es = ExtractSkills(config_name="extract_skills_toy", local=True)
es.load()

# Extract skills from the list of toy adverts.
predicted_skills = es.get_skills(job_adverts)

[94;1;1m2024-04-16 16:39:00,583 - SkillsExtractor - INFO - Data folder downloaded from /opt/conda/lib/python3.10/site-packages/ojd_daps_skills_data (download_public_data.py:29)[0m
[94;1;1m2024-04-16 16:39:00,587 - SkillsExtractor - INFO - Loading the model from a local location (ner_spacy.py:507)[0m
[94;1;1m2024-04-16 16:39:00,588 - SkillsExtractor - INFO - Loading the model from /opt/conda/lib/python3.10/site-packages/ojd_daps_skills_data/outputs/models/ner_model/20230808/ (ner_spacy.py:510)[0m
[94;1;1m2024-04-16 16:39:03,780 - SkillsExtractor - INFO - Loading 'toy' taxonomy information (extract_skills.py:154)[0m


Downloading .gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [46]:
# Resume text used as the test document for the skill extractors.
# Must be a plain `str`: a trailing comma after the closing quotes would make this a 1-tuple.
job = """Education : related courses : 
- statistical analysis 
- machine learning 
- deep learning 
- cloud services  
- probability and statistics 
Experience : 
machine learning intern : 
● Identified business challenges and opportunities within the recruitment process and applied Natural Language
Processing techniques to develop an innovative Application Tracking System. This system efficiently ranked
candidates for specific job descriptions, using named entity recognition and word embedding, resulting in a
significant reduction in the time spent on candidate selection.
● Orchestrated a comprehensive data acquisition strategy to harvest and preprocess candidate application data
with precision.
● Collaborated closely with HR and engineering teams to deploy the system using FastAPI and Vue.js, ensuring a
user-friendly interface for recruiters, and precisely mapped out a plan for the next 2 upcoming versions and
enhancements.
Buisness developer at enactus fst el manar : 
●Contributed to product creation, market evaluation, and customer segmentation while driving
clients' behavior analysis.
● Collaborated effectively with the sales team to develop a robust sales strategy for optimal results in 2 different
projects : Moonray, and Student plus
Junior machine learning engineer at omdena : 
● Played a pivotal role in the successful development and deployment of a computer vision system using deep
learning techniques ( pytorch ) in Egyptian orphanages, effectively tackling the issue of constrained monitoring.
● Gained expertise in deploying accurate solutions with low computational capabilities using python.
● Helped improve the well-being of more than 220 children.
Data analyst : omdena : 
● Successfully extracted and delivered actionable insights from a diverse dataset as part of the Omdena initiative for Peru's Open Data Platform, enabling positive transformation in Lima and significantly enhancing the quality of life for its residents. 
● Applied Python techniques for data collection and preprocessing, and Power BI for visualization dashboards, yielding critical insights into aggression trends and contributing to targeted interventions in Lima.
IT consultant at optima junior entreprise:  
● Utilized Vue.js, Laravel, and MySQL to successfully aggregate diverse data sources, standardize data storage, and develop interactive dashboards, leading to enhanced data management efficiency, elevated data quality, and user-friendly data accessibility.
Data science unity manager engineers spark : 
● Contributed To building a training strategy In the field of Data science for college students. 
● Executed 7 training sessions which significantly elevated students' proficiency in data science, achieving a remarkable 60% knowledge enhancement, coupled with an impressive satisfaction rate exceeding 85%.
Projects : 
Student performance prediction and analysis : 
Utilized and combined various machine learning techniques such as bagging and boosting with Python to predict
the performance of high school students.
● Employed Local and global interpretation techniques with Python to conduct in-depth analysis of the
inter-relations among various educational factors for performance prediction.
● Conducted A/B testing on a cohort of 1000 high school students to assess the impact of different educational
interventions and refine performance prediction models. 
Sales analysis : 
This project analyzes sales data with 3 interactive and clear dashboards. It summarizes sales indicators, highlights customer preferences, and details product categories/specifications driving valuable insights to inform business decisions. 
Virtual accounting firm : 
The platform aims to automate a workflow using angular. Additionally, it allows for better tracking of the accounting firm's resources and ensures more accurate and transparent billing for clients. It operates as a virtual office. Therefore, the principle is to automate various tasks that are typically performed in a "physical" office."""
                   

In [42]:

# Run the Nesta extractor on `job`; per the saved log this also maps the found skills onto
# the 'toy' taxonomy. This rebinds `predicted_skills` from the earlier toy-advert example.
# NOTE(review): this cell's execution count (42) is lower than the `job` cell's (46), so the
# saved output may come from an earlier definition of `job` - re-run in order to confirm.
predicted_skills = es.extract_skills(job) # extract and map skills from the `job` text

[94;1;1m2024-04-16 16:53:54,162 - SkillsExtractor - INFO - Cleaned job skills (skill_ner_mapper.py:201)[0m
[94;1;1m2024-04-16 16:53:54,163 - SkillsExtractor - INFO - Mapping 28 skills to the 'toy' taxonomy (extract_skills.py:332)[0m
[94;1;1m2024-04-16 16:53:54,165 - SkillsExtractor - INFO - Getting embeddings for 5 texts ... (bert_vectorizer.py:37)[0m
[94;1;1m2024-04-16 16:53:54,288 - SkillsExtractor - INFO - Took 0.12240743637084961 seconds (bert_vectorizer.py:48)[0m
[94;1;1m2024-04-16 16:53:54,289 - SkillsExtractor - INFO - Getting embeddings for 28 texts ... (bert_vectorizer.py:37)[0m
[94;1;1m2024-04-16 16:53:54,303 - SkillsExtractor - INFO - Took 0.013159513473510742 seconds (bert_vectorizer.py:48)[0m
[94;1;1m2024-04-16 16:53:54,359 - SkillsExtractor - INFO - Mapped extracted skills onto 'toy' taxonomy (skill_ner_mapper.py:505)[0m


In [44]:
# Display the Nesta extractor result. In the saved output this is a list of dicts: empty when
# nothing was found, otherwise a 'SKILL' list of (text span, (taxonomy label, code)) tuples.
predicted_skills

[{},
 {},
 {'SKILL': [('machine learning', ('working with computers', 'S5'))]},
 {},
 {'SKILL': [('cloud services', ('working with computers', 'S5'))]},
 {},
 {},
 {'SKILL': [('machine learning intern', ('working with computers', 'S5'))]},
 {},
 {'SKILL': [('Processing techniques',
    ('communication, collaboration and creativity', 'S1'))]},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {'SKILL': [('product creation', ('working with computers', 'S5')),
   ('market evaluation',
    ('communication, collaboration and creativity', 'S1'))]},
 {},
 {},
 {'SKILL': [('Student',
    ('communication, collaboration and creativity', 'S1'))]},
 {},
 {},
 {},
 {'SKILL': [('python', ('working with computers', 'S5')),
   ('deploying accurate solutions with low computational capabilities',
    ('communication, collaboration and creativity', 'S1'))]},
 {},
 {},
 {},
 {'SKILL': [('data collection', ('working with computers', 'S5')),
   ('Python techniques', ('working with computers', 'S5')),
   ('prepr

In [49]:
# Run this notebook's own `extract_skills` on the same resume text for comparison.
# `extract_skills`, `tsc` and `tkc` are defined outside this cell; only the second
# element of the returned pair is kept (as `res`).
_, res = extract_skills(
    """Education : related courses : 
- statistical analysis 
- machine learning 
- deep learning 
- cloud services  
- probability and statistics 
Experience : 
machine learning intern : 
● Identified business challenges and opportunities within the recruitment process and applied Natural Language
Processing techniques to develop an innovative Application Tracking System. This system efficiently ranked
candidates for specific job descriptions, using named entity recognition and word embedding, resulting in a
significant reduction in the time spent on candidate selection.
● Orchestrated a comprehensive data acquisition strategy to harvest and preprocess candidate application data
with precision.
● Collaborated closely with HR and engineering teams to deploy the system using FastAPI and Vue.js, ensuring a
user-friendly interface for recruiters, and precisely mapped out a plan for the next 2 upcoming versions and
enhancements.
Buisness developer at enactus fst el manar : 
●Contributed to product creation, market evaluation, and customer segmentation while driving
clients' behavior analysis.
● Collaborated effectively with the sales team to develop a robust sales strategy for optimal results in 2 different
projects : Moonray, and Student plus
Junior machine learning engineer at omdena : 
● Played a pivotal role in the successful development and deployment of a computer vision system using deep
learning techniques ( pytorch ) in Egyptian orphanages, effectively tackling the issue of constrained monitoring.
● Gained expertise in deploying accurate solutions with low computational capabilities using python.
● Helped improve the well-being of more than 220 children.
Data analyst : omdena : 
● Successfully extracted and delivered actionable insights from a diverse dataset as part of the Omdena initiative for Peru's Open Data Platform, enabling positive transformation in Lima and significantly enhancing the quality of life for its residents. 
● Applied Python techniques for data collection and preprocessing, and Power BI for visualization dashboards, yielding critical insights into aggression trends and contributing to targeted interventions in Lima.
IT consultant at optima junior entreprise:  
● Utilized Vue.js, Laravel, and MySQL to successfully aggregate diverse data sources, standardize data storage, and develop interactive dashboards, leading to enhanced data management efficiency, elevated data quality, and user-friendly data accessibility.
Data science unity manager engineers spark : 
● Contributed To building a training strategy In the field of Data science for college students. 
● Executed 7 training sessions which significantly elevated students' proficiency in data science, achieving a remarkable 60% knowledge enhancement, coupled with an impressive satisfaction rate exceeding 85%.
Projects : 
Student performance prediction and analysis : 
Utilized and combined various machine learning techniques such as bagging and boosting with Python to predict
the performance of high school students.
● Employed Local and global interpretation techniques with Python to conduct in-depth analysis of the
inter-relations among various educational factors for performance prediction.
● Conducted A/B testing on a cohort of 1000 high school students to assess the impact of different educational
interventions and refine performance prediction models. 
Sales analysis : 
This project analyzes sales data with 3 interactive and clear dashboards. It summarizes sales indicators, highlights customer preferences, and details product categories/specifications driving valuable insights to inform business decisions. 
Virtual accounting firm : 
The platform aims to automate a workflow using angular. Additionally, it allows for better tracking of the accounting firm's resources and ensures more accurate and transparent billing for clients. It operates as a virtual office. Therefore, the principle is to automate various tasks that are typically performed in a "physical" office.
  """,
    tsc,
    tkc,
)


In [50]:
# Display the matches from the custom extractor. In the saved output each entry is a 3-tuple
# of two strings and a float - presumably (extracted phrase, matched ESCO label, similarity
# score); confirm against the definition of `extract_skills`.
res

[('statistical analysis', 'apply statistical analysis techniques', 0.8258099),
 ('machine learning', 'machine learning', 1.0000004),
 ('cloud services', 'develop with cloud services', 0.86818767),
 ('probability', 'boxing', 0.99760455),
 ('statistics', 'statistics', 0.99999964),
 ('Identified business challenges and opportunities',
  'identify new business opportunities',
  0.8265768),
 ('Processing techniques', 'photographic processing techniques', 0.8895103),
 ('develop', 'spelling', 0.96657646),
 ('deploy the system', 'install operating system', 0.80585754),
 ('ensuring a', 'ensure sanitation', 0.80705374),
 ('driving', 'Slovenian', 0.903703),
 ('develop a robust sales strategy for optimal results',
  'develop revenue generation strategies',
  0.8317915),
 ('development and deployment of a computer vision system',
  'design a media integration system',
  0.817845),
 ('computer vision system', 'develop computer vision system', 0.9435278),
 ('Python', 'Python', 1.0000001),
 ('data col