**1. Preprocessing Raw Data**

In [1]:
# One-time environment setup (uncomment and run once):
# %pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
# One-time environment setup (uncomment and run once): PyTorch wheels from the cu118 index.
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
import json
import re
import spacy
from tqdm import tqdm

# Load the small English spaCy pipeline once; later cells reuse this `nlp`
# object for sentence splitting (doc.sents) and for tokenisation.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the raw job postings. Fields are read with .get(), so a missing
# "title" / "keyword" / "description" key falls back to an empty string.
with open("jobs.json", "r", encoding="utf-8") as f:
    jobs_data = json.load(f)


def _normalize_description(raw_text):
    """Flatten a raw job description into one line of text.

    Line breaks become spaces, URLs and bracket characters are removed, and
    runs of whitespace are collapsed. A ``None`` description (JSON ``null``)
    is treated as empty text instead of raising ``AttributeError``.
    """
    text = (raw_text or "").replace("\n", " ")                   #Remove line breaks
    text = re.sub(r"http\S+|www\S+|[\[\]{}()]", "", text)        #Remove URLs and brackets
    return re.sub(r"\s{2,}", " ", text).strip()                  #Normalize spaces


def _clean_sentence(sentence):
    """Drop every character outside the whitelist, then re-collapse whitespace.

    NOTE(review): the whitelist removes '+' and '#', so "C++" and "C#" both
    become "C" -- confirm this is intended for the phrase matching that follows.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\s.,:/\-]", "", sentence)
    return re.sub(r"\s{2,}", " ", cleaned).strip()


# (text, metadata) pairs for nlp.pipe(as_tuples=True): the metadata travels
# with each document so title/keyword stay attached to the right description.
job_inputs = (
    (
        _normalize_description(job.get("description", "")),
        {"title": job.get("title", ""), "keyword": job.get("keyword", "")},
    )
    for job in jobs_data
)

cleaned_jobs = []

# nlp.pipe streams the documents through the pipeline in batches instead of
# invoking nlp() separately for every job.
for doc, meta in tqdm(nlp.pipe(job_inputs, as_tuples=True),
                      total=len(jobs_data), desc="Cleaning jobs"):
    #sentence splitting (fragments of 15 characters or fewer are discarded)
    raw_sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 15]

    #cleaning each sentence
    cleaned_sentences = []
    for sent in raw_sentences:
        cleaned = _clean_sentence(sent)
        # The length filter above runs before character stripping, so a
        # sentence made mostly of removed characters can end up empty here.
        if cleaned:
            cleaned_sentences.append(cleaned)

    cleaned_jobs.append({
        "title": meta["title"],
        "keyword": meta["keyword"],
        "sentences": cleaned_sentences
    })

with open("cleaned_jobs.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_jobs, f, indent=2)

**2. Phrase Extraction (Skills)**

In [None]:
# Read the curated phrase file and re-key it by entity label.
with open("final_categorized_phrases.json", "r", encoding="utf-8") as phrase_file:
    raw_phrases = json.load(phrase_file)

# (entity label, key in the JSON file). A category that is absent from the
# file falls back to an empty list.
_LABEL_TO_SOURCE_KEY = (
    ("SKILL", "SKILL_PHRASES"),
    ("TOOL", "TOOL_PHRASES"),
    ("SOFT_SKILL", "SOFT_SKILL_PHRASES"),
    ("FIELD", "FIELD_PHRASES"),
    ("LANG", "LANG_PHRASES"),
    ("CERT", "CERT_PHRASES"),
)

categorized_phrases = {
    label: raw_phrases.get(source_key, [])
    for label, source_key in _LABEL_TO_SOURCE_KEY
}

In [None]:
# Reload the preprocessed jobs that were written to cleaned_jobs.json.
with open("cleaned_jobs.json", "r", encoding="utf-8") as cleaned_file:
    job_data = json.loads(cleaned_file.read())

# Compiled boundary patterns keyed by normalised phrase. The same phrases are
# checked against every job, so each regex is built once and then reused.
_PHRASE_PATTERN_CACHE = {}


def is_phrase_in_text(phrase, text):
    """Return True if ``phrase`` occurs in ``text`` as a standalone term.

    Matching is case-insensitive (both sides are lower-cased) and the phrase
    must not be directly preceded or followed by a word character, so "java"
    does not match inside "javascript".

    Lookarounds are used for every phrase length. A plain ``\\b`` only matches
    next to a word character, so phrases that start or end with punctuation
    (e.g. ".net") could never match after a space; the lookaround form handles
    them and is equivalent to ``\\b`` for phrases that start and end with a
    word character. A side effect is that ".net" no longer matches inside
    "asp.net", because it is preceded by a word character there.

    Args:
        phrase: Phrase to look for; surrounding whitespace is ignored.
        text: Text to search.

    Returns:
        bool: True when the phrase is found. An empty or whitespace-only
        phrase never matches.
    """
    phrase_clean = phrase.strip().lower()
    if not phrase_clean:
        # An empty pattern between two lookarounds would match at any gap
        # between non-word characters and report a spurious hit.
        return False

    pattern = _PHRASE_PATTERN_CACHE.get(phrase_clean)
    if pattern is None:
        pattern = re.compile(r'(?<!\w)' + re.escape(phrase_clean) + r'(?!\w)')
        _PHRASE_PATTERN_CACHE[phrase_clean] = pattern

    return pattern.search(text.lower()) is not None


# For every job, record which catalogue phrases occur anywhere in its text.
results = []
for job in tqdm(job_data, desc="Processing jobs"):
    # All sentences of the job are searched as one space-joined string.
    job_text = " ".join(job.get("sentences", []))

    # Categories and phrases are visited in the order they appear in
    # categorized_phrases, so the entity list keeps that order.
    matched_entities = [
        {
            "text": phrase,
            "label": category
        }
        for category, phrases in categorized_phrases.items()
        for phrase in phrases
        if is_phrase_in_text(phrase, job_text)
    ]

    job_record = {
        "title": job.get("title", ""),
        "text": job_text,
        "entities": matched_entities
    }
    results.append(job_record)

with open("extracted_phrases.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("Done, extracted all releevant phrases to extracted_phrases.json")

Processing jobs: 100%|██████████| 6697/6697 [34:32<00:00,  3.23it/s]


Done, extracted all releevant phrases to extracted_phrases.json


**3. BIO-TAGGING**

In [None]:
# Inputs for BIO tagging: the cleaned sentences and the phrases matched per job.
with open("cleaned_jobs.json", "r", encoding="utf-8") as f:
    all_jobs = json.load(f)

with open("extracted_phrases.json", "r", encoding="utf-8") as f:
    phrase_data = json.load(f)

# Map each job title to the entities found for it.
# Titles are not unique, and a plain {title: entities} dict keeps only the
# last entry per title, so every other job with that title would be tagged
# with the wrong phrase list. Entities from all entries sharing a title are
# merged instead (first-seen order, duplicates dropped); the tagging step
# still filters them per sentence, so extra candidates cannot add false tags.
phrase_lookup = {}
_seen_entities_by_title = {}
for entry in phrase_data:
    entry_title = entry["title"]
    merged_entities = phrase_lookup.setdefault(entry_title, [])
    seen_keys = _seen_entities_by_title.setdefault(entry_title, set())
    for ent in entry["entities"]:
        ent_key = (ent["text"], ent["label"])
        if ent_key not in seen_keys:
            seen_keys.add(ent_key)
            merged_entities.append(ent)

# Compiled boundary patterns keyed by normalised phrase, so each regex is
# built once and reused for every sentence it is checked against.
_PHRASE_PATTERN_CACHE = {}


# NOTE(review): a function with this name is also defined in section 2; this
# later definition replaces it once the cell has run.
def is_phrase_in_text(phrase, text):
    """Return True if ``phrase`` occurs in ``text`` as a standalone term.

    Matching is case-insensitive (both sides are lower-cased) and the phrase
    must not be directly preceded or followed by a word character, so "java"
    does not match inside "javascript".

    Lookarounds are used for every phrase length. A plain ``\\b`` only matches
    next to a word character, so phrases that start or end with punctuation
    (e.g. ".net") could never match after a space; the lookaround form handles
    them and is equivalent to ``\\b`` for phrases that start and end with a
    word character. A side effect is that ".net" no longer matches inside
    "asp.net", because it is preceded by a word character there.

    Args:
        phrase: Phrase to look for; surrounding whitespace is ignored.
        text: Text to search.

    Returns:
        bool: True when the phrase is found. An empty or whitespace-only
        phrase never matches.
    """
    phrase = phrase.strip().lower()
    if not phrase:
        # An empty pattern between two lookarounds would match at any gap
        # between non-word characters and report a spurious hit.
        return False

    pattern = _PHRASE_PATTERN_CACHE.get(phrase)
    if pattern is None:
        pattern = re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)')
        _PHRASE_PATTERN_CACHE[phrase] = pattern

    return pattern.search(text.lower()) is not None

def tag_sentence(text, matched_phrases, tokenizer=None):
    """Tokenise ``text`` and assign a BIO label to every token.

    Each phrase is tokenised with the same tokenizer as the sentence and
    compared case-insensitively against the token sequence. Every occurrence
    of a phrase is labelled ``B-<label>`` on its first token and
    ``I-<label>`` on the rest; all other tokens stay ``"O"``.

    Differences from a naive first-match scan:
      * Phrases are tokenised like the sentence (not with ``str.split``), so
        a phrase the tokenizer splits on punctuation can still line up.
      * All occurrences are tagged, not only the first one.
      * Longer phrases are placed first and a span is skipped when any of its
        tokens is already labelled, so a short phrase cannot overwrite part
        of a longer one and leave a broken B-/I- sequence. Between phrases of
        equal length the entity listed last in ``matched_phrases`` wins.
      * Empty / whitespace-only phrases are ignored (an empty token list
        would otherwise "match" at position 0 and tag the first token).

    Args:
        text: Sentence to tag.
        matched_phrases: Iterable of dicts with a "text" (phrase) and a
            "label" (entity category) key.
        tokenizer: Optional callable mapping a string to an iterable of
            objects with a ``.text`` attribute. Defaults to ``nlp.make_doc``,
            which runs only the tokenizer of the notebook's spaCy pipeline;
            no other pipeline component is needed here.

    Returns:
        dict: ``{"tokens": [...], "labels": [...]}`` with lists of equal
        length.
    """
    if tokenizer is None:
        tokenizer = nlp.make_doc

    tokens = [token.text for token in tokenizer(text)]
    token_lowers = [t.lower() for t in tokens]
    labels = ["O"] * len(tokens)

    candidates = []
    for ent in matched_phrases:
        # Collapse whitespace first so the tokenizer emits no whitespace tokens.
        phrase = " ".join(ent["text"].lower().split())
        if not phrase:
            continue
        phrase_tokens = [tok.text for tok in tokenizer(phrase)]
        if phrase_tokens:
            candidates.append((phrase_tokens, ent["label"]))

    # Reverse, then stable-sort by length (longest first): among phrases of
    # equal length the one listed last is placed first and therefore wins.
    candidates.reverse()
    candidates.sort(key=lambda cand: len(cand[0]), reverse=True)

    for phrase_tokens, label in candidates:
        n = len(phrase_tokens)
        i = 0
        while i <= len(tokens) - n:
            span_is_free = all(lab == "O" for lab in labels[i:i + n])
            if span_is_free and token_lowers[i:i + n] == phrase_tokens:
                labels[i] = f"B-{label}"
                for j in range(1, n):
                    labels[i + j] = f"I-{label}"
                i += n  # continue after this span; occurrences never overlap
            else:
                i += 1

    return {
        "tokens": tokens,
        "labels": labels
    }

In [None]:
# BIO-tag every sentence of every job, using the entities recorded for the
# job's title in phrase_lookup.
tagged_jobs = []
for job in all_jobs:
    job_title, keyword, sentences = job["title"], job["keyword"], job["sentences"]
    matched_entities = phrase_lookup.get(job_title, [])

    # For each sentence, only the phrases that occur in that sentence are
    # passed on to tag_sentence.
    tagged_sents = [
        tag_sentence(
            sent,
            [e for e in matched_entities if is_phrase_in_text(e["text"], sent)],
        )
        for sent in sentences
    ]

    job_record = {
        "title": job_title,
        "keyword": keyword,
        "tagged_sentences": tagged_sents
    }
    tagged_jobs.append(job_record)

with open("bio_tagged_all_jobs.json", "w", encoding="utf-8") as f:
    json.dump(tagged_jobs, f, indent=2)