In [None]:
!kaggle datasets download -d mehyarmlaweh/ner-annotated-cvs

In [None]:
import zipfile

# Unpack the downloaded Kaggle archive into the local "dataset" directory.
dataset_name = "ner-annotated-cvs.zip"
with zipfile.ZipFile(dataset_name, mode="r") as archive:
    archive.extractall(path="dataset")

In [1]:
import warnings
import os
import spacy
import json
import random
import re
#import torch
from spacy.training.example import Example
from spacy.util import minibatch,compounding
#from sklearn.model_selection import train_test_split
#import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [None]:
# NOTE(review): machine-specific absolute path — point this at the local copy of
# the extracted dataset (the extraction cell unpacks the archive into ./dataset).
folder_path = "/Users/rocky/HSF/data preprocessing/dataset/ResumesJsonAnnotated/ResumesJsonAnnotated"

In [None]:
def read_data(folder_path,num_of_files):
    """Load a random sample of annotated-resume JSON files from a folder.

    Args:
        folder_path: Directory containing the ``*.json`` annotation files.
        num_of_files: Maximum number of files to load; if the folder holds
            fewer JSON files, all of them are loaded.

    Returns:
        A list with one parsed JSON object per sampled file, in shuffled order.
    """
    # Match on the real ".json" extension; a bare "json" suffix would also
    # pick up unrelated names such as "notesjson".
    # Sorting first makes the sample reproducible once `random` is seeded,
    # because os.listdir returns entries in arbitrary order.
    json_files = sorted(name for name in os.listdir(folder_path) if name.endswith(".json"))
    random.shuffle(json_files)

    data = []
    for filename in json_files[:num_of_files]:
        json_file_path = os.path.join(folder_path, filename)
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(json_file_path, "r", encoding="utf-8") as file:
            data.append(json.load(file))

    return data

In [None]:
# Load a random sample of at most 2000 annotated resume files.
all_data = read_data(folder_path,2000)

In [None]:
def filter_overlapping(entities):
    """Drop entity spans that overlap an earlier accepted span.

    Spans are visited in ascending (start, end) order and kept greedily: a
    span is kept only if it starts at or after the end of the last kept span.

    Args:
        entities: Iterable of (start, end, label) triples.

    Returns:
        List of non-overlapping (start, end, label) tuples in sorted order.
    """
    ordered = list(entities)
    ordered.sort(key=lambda span: (span[0], span[1]))

    kept = []
    boundary = -1  # end offset of the most recently kept span
    for span_start, span_end, span_label in ordered:
        if span_start < boundary:
            continue  # overlaps the previously kept span
        kept.append((span_start, span_end, span_label))
        boundary = span_end
    return kept

In [None]:
# Convert the raw annotation records into {"text", "entities"} training examples.
training_data = []

for data in all_data:
    raw_text = data["text"]
    text = raw_text.strip()
    # Annotation offsets index into the unstripped text, so removing leading
    # whitespace shifts every span left by that many characters.
    shift = len(raw_text) - len(raw_text.lstrip())
    annotations = data["annotations"]

    # Re-base each span onto the stripped text and clamp it to the text bounds
    # (a span may have reached into the whitespace that strip() removed).
    entities = [
        (
            max(int(start) - shift, 0),
            min(max(int(end) - shift, 0), len(text)),
            label,
        )
        for start, end, label in annotations
    ]
    entities = filter_overlapping(entities)

    training_data.append({"text":text,"entities":entities})

In [None]:
def clean_entity(data):
    """Trim leading/trailing whitespace from every entity span.

    Args:
        data: Iterable of dicts with a "text" string and an "entities"
            iterable of (start, end, label) triples.

    Returns:
        A new list with exactly one {"text", "entities"} dict per input
        example, in input order. Each entity is a [start, end, label] list
        whose boundaries no longer sit on whitespace; spans that are empty
        after trimming are dropped.
    """
    invalid_span_tokens = re.compile(r'\s')
    cleaned_data = []

    for dic in data:
        text = dic["text"]
        entities = dic["entities"]
        valid_entities = []
        for start,end,label in entities:
            valid_start = start
            valid_end = end

            # Move the start forward past any whitespace.
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            # Pull the end back past any whitespace (only for in-range ends).
            while valid_end > 1 and valid_end <= len(text) and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start,valid_end,label])
        # Emit each example once, after all of its entities were processed.
        # Appending inside the entity loop duplicated the example once per
        # entity and silently dropped examples that have no entities.
        cleaned_data.append({"text":text,'entities': valid_entities})
    return cleaned_data

In [None]:
# Trim whitespace off the entity boundaries. This rebinds training_data, so
# re-running the cell feeds the already-cleaned examples back in.
training_data = clean_entity(training_data)

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; used below to turn raw text into Doc objects.
nlp = spacy.blank("en")
# Collects the annotated Docs so they can be serialized together.
doc_bin = DocBin()

In [None]:
from spacy.util import filter_spans

# Turn every training example into a spaCy Doc with entity spans and collect
# the Docs in doc_bin. Every span gets the single label "Skill"; the label
# stored with the annotation is ignored.
for example in tqdm(training_data):
    doc = nlp.make_doc(example["text"])
    candidate_spans = (
        doc.char_span(begin, finish, label="Skill", alignment_mode="contract")
        for begin, finish, _ in example["entities"]
    )
    # Offsets that could not be turned into a span come back as None; skip them.
    aligned_spans = [span for span in candidate_spans if span is not None]

    doc.ents = filter_spans(aligned_spans)
    doc_bin.add(doc)

#doc_bin.to_disk("training_data.spacy")

In [None]:
import spacy
from spacy.tokens import DocBin
from pathlib import Path

def split_and_save_docbin(large_docbin, nlp, output_dir, docs_per_bin=20000):
    """Write the docs of one DocBin to disk as several smaller DocBin files.

    Args:
        large_docbin: DocBin whose docs should be redistributed.
        nlp: Pipeline whose vocab is used to deserialize the docs.
        output_dir: Target directory; created (with parents) if missing.
        docs_per_bin: Maximum number of docs per output file.

    Returns:
        List of the written file paths (``docs_0.spacy``, ``docs_1.spacy``,
        ...) in chunk order. Empty when the DocBin holds no docs.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Deserialize everything up front so the docs can be sliced by index.
    all_docs = list(large_docbin.get_docs(nlp.vocab))

    written = []
    for chunk_index, offset in enumerate(range(0, len(all_docs), docs_per_bin)):
        # One fresh DocBin per slice of at most docs_per_bin docs.
        chunk_bin = DocBin(docs=all_docs[offset:offset + docs_per_bin])
        chunk_path = target_dir / f"docs_{chunk_index}.spacy"
        chunk_bin.to_disk(chunk_path)
        written.append(chunk_path)

    return written


# Write doc_bin out as chunked .spacy files under ./split_docs, using the
# function's default chunk size.
output_directory = "split_docs"
saved_files = split_and_save_docbin(doc_bin, nlp, output_directory)
# Example usage:
"""
# Load your spaCy model
nlp = spacy.load("your_model")  # e.g., "en_core_web_sm"

# If you have your large DocBin in a variable called 'doc_bin':
output_directory = "split_docs"
saved_files = split_and_save_docbin(doc_bin, nlp, output_directory)

# Later, to load all the docs:
all_docs = []
for file in Path("split_docs").glob("*.spacy"):
    doc_bin = DocBin().from_disk(file)
    all_docs.extend(list(doc_bin.get_docs(nlp.vocab)))
"""

In [None]:
import warnings
import os
import spacy
import json
import random
import re
import torch
from spacy.training.example import Example
from spacy.util import minibatch,compounding
#from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
python -m spacy train config.cfg --output ./output --paths.train ./split_docs --paths.dev ./split_docs

In [None]:
!python -m spacy train config.cfg --output ./output --paths.train ./split_docs/docs_0.spacy --paths.dev ./split_docs/docs_1.spacy

In [None]:
import json
import re

In [None]:
# Load the annotated documents. JSON is UTF-8 by specification, so the encoding
# is set explicitly instead of relying on the platform default.
with open("ner.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

In [None]:
def filter_overlapping(entities):
    """Return the entity spans that do not overlap an earlier kept span.

    The spans are processed in ascending (start, end) order; a span survives
    only when its start is not before the end of the last surviving span.

    Args:
        entities: Iterable of (start, end, label) triples.

    Returns:
        List of (start, end, label) tuples, sorted and free of overlaps.
    """
    ordered = list(entities)
    ordered.sort(key=lambda span: (span[0], span[1]))

    survivors = []
    frontier = -1  # end offset of the last surviving span
    for begin, finish, tag in ordered:
        if begin < frontier:
            continue  # collides with the previous survivor
        survivors.append((begin, finish, tag))
        frontier = finish
    return survivors

In [None]:
# Convert the records loaded from ner.json into {"text", "entities"} examples.
training_data = []

for data in all_data:
    raw_text = data["document"]
    text = raw_text.strip()
    # Annotation offsets index into the unstripped document, so removing
    # leading whitespace shifts every span left by that many characters.
    shift = len(raw_text) - len(raw_text.lstrip())
    annotations = data["annotation"]

    # Re-base each span onto the stripped text and clamp it to the text bounds
    # (a span may have reached into the whitespace that strip() removed).
    entities = [
        (
            max(int(value['start']) - shift, 0),
            min(max(int(value['end']) - shift, 0), len(text)),
            value['label'],
        )
        for value in annotations
    ]
    entities = filter_overlapping(entities)

    training_data.append({"text":text,"entities":entities})

In [None]:
def clean_entity(data):
    """Trim leading/trailing whitespace from every entity span.

    Args:
        data: Iterable of dicts with a "text" string and an "entities"
            iterable of (start, end, label) triples.

    Returns:
        A new list with exactly one {"text", "entities"} dict per input
        example, in input order. Each entity is a [start, end, label] list
        whose boundaries no longer sit on whitespace; spans that are empty
        after trimming are dropped.
    """
    invalid_span_tokens = re.compile(r'\s')
    cleaned_data = []

    for dic in data:
        text = dic["text"]
        entities = dic["entities"]
        valid_entities = []
        for start,end,label in entities:
            valid_start = start
            valid_end = end

            # Move the start forward past any whitespace.
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            # Pull the end back past any whitespace (only for in-range ends).
            while valid_end > 1 and valid_end <= len(text) and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start,valid_end,label])
        # Emit each example once, after all of its entities were processed.
        # Appending inside the entity loop duplicated the example once per
        # entity and silently dropped examples that have no entities.
        cleaned_data.append({"text":text,'entities': valid_entities})
    return cleaned_data

In [None]:
# Trim whitespace off the entity boundaries. This rebinds training_data, so
# re-running the cell feeds the already-cleaned examples back in.
training_data = clean_entity(training_data)

In [None]:
from torch.utils.data import Dataset, random_split


# 80/20 train/validation split. random_split is not given a generator here and
# no seed is set in the visible cells, so the split may differ between runs.
n = len(training_data)
train_size = int(0.8*n)
train_data, val_data = random_split(training_data, [train_size,n-train_size ])

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; used below to turn raw text into Doc objects.
nlp = spacy.blank("en")
# Collects the annotated Docs so they can be serialized together.
doc_bin = DocBin()

In [None]:
from spacy.util import filter_spans

def add_examples_to_docbin(examples, target_bin):
    """Convert annotated examples to spaCy Docs and add them to a DocBin.

    Args:
        examples: Iterable of dicts with a "text" string and an "entities"
            iterable of (start, end, label) triples.
        target_bin: DocBin that receives one Doc per example.

    Returns:
        The same ``target_bin``, for convenience.
    """
    for training_example in tqdm(examples):
        text = training_example["text"]
        labels = training_example["entities"]
        doc = nlp.make_doc(text)
        ents = []
        for start,end,label in labels:
            span = doc.char_span(start,end,label=label,alignment_mode="contract")
            if span is None:
                # The offsets could not be turned into a span.
                print("Skipping entity")
            else:
                ents.append(span)

        # Overlapping spans are filtered out before assigning doc.ents.
        doc.ents = filter_spans(ents)
        target_bin.add(doc)
    return target_bin


# Validation docs go into doc_bin, which the next cell saves as val_data.spacy.
add_examples_to_docbin(val_data, doc_bin)

# The training command reads ./train_data.spacy, but only the validation split
# was ever converted. Build and save the training split from the same
# random_split result so both files come from one consistent split.
train_doc_bin = add_examples_to_docbin(train_data, DocBin())
train_doc_bin.to_disk("train_data.spacy")

#doc_bin.to_disk("training_data.spacy")

In [None]:
# Serialize the Docs collected in doc_bin; the training command below passes
# this file as --paths.dev.
doc_bin.to_disk("val_data.spacy")

In [2]:
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
python -m spacy train config.cfg --output ./output/ --paths.train ./train_data.spacy --paths.dev ./val_data.spacy

ℹ Saving to output directory: output
ℹ Using CPU
[1m
^C


In [9]:
import spacy
# Load the "model-best" pipeline from the ./output directory that the training
# command writes to.
ner_model = spacy.load("output/model-best")

In [8]:
job_description = """
Job Description – Senior Business Analyst / Data Analyst (10+ Years of Experience)



We are seeking an experienced Business Analyst / Data Analyst with 10+ years of expertise in handling BA/DA roles, particularly in data-driven customer transformations. The ideal candidate will have strong analytical skills, technical proficiency, and experience working in the banking domain.



Key Responsibilities:

Define and agree on API contracts with consumers.
Conduct data profiling and drive data definitions & data mapping.
Document feed specifications and ensure alignment with business requirements.
Write and refine user stories, capturing acceptance criteria and validating functional test scenarios in JIRA.
Utilize Confluence for documentation and collaboration.
Define and implement data quality measures, operational models, and exception management frameworks.
Collaborate with external and internal stakeholders, ensuring smooth communication and project alignment.
Leverage data-led customer transformation methodologies to drive business outcomes.


Technical Skills:

SQL & MongoDBI – Data extraction, transformation, and reporting.
Postman – API testing and validation.
Master Data Management (MDM) – Managing and governing enterprise data.
ETL Tools – Experience in working with ETL pipelines and data integration.
JIRA & Confluence – Agile project tracking and documentation.
Microsoft PowerPoint & Stakeholder Management – Presenting insights effectively.


Preferred Experience:

10+ years of experience in Business Analysis / Data Analysis.
Strong background in the banking sector.
Experience in defining and managing data quality operations."""

# Run the trained NER pipeline on the sample job description and list each
# detected entity together with its label.
doc = ner_model(job_description)

for entity in doc.ents:
    print(f"{entity.text} ---> {entity.label_}")

10+ Years ---> EXPERIENCE
Business Analyst ---> SKILLS
10+ years ---> EXPERIENCE
BA ---> SKILLS
banking ---> DOMAIN
data mapping ---> SKILLS
test ---> SKILLS
JIRA ---> SKILLS
Confluence ---> SKILLS
management ---> SKILLS
business ---> EDUCATION
SQL ---> SKILLS
Master Data Management (MDM ---> SKILLS
ETL Tools ---> SKILLS
ETL pipelines ---> SKILLS
data integration ---> SKILLS
JIRA ---> SKILLS
Confluence ---> SKILLS
Microsoft PowerPoint ---> SKILLS
Stakeholder Management ---> SKILLS
10+ years ---> EXPERIENCE
Business Analysis ---> SKILLS
Data Analysis ---> SKILLS
banking ---> DOMAIN
