# Importing The Libraries

In [1]:
import spacy
import pandas as pd
import json
import re
import requests

# Preparing The Data

In [2]:
def get_resume_text(url):
  response = requests.get(url)
  if response.status_code == 200:
    return response.text
  else:
    raise Exception(f"Failed to get resume text: {response.status_code}")

In [3]:
def clean_entity_span(text, start, end):
    # Strip leading and trailing whitespace from the entity span
    while start < end and text[start].isspace():
        start += 1
    while end > start and text[end - 1].isspace():
        end -= 1
    return start, end

In [4]:
# Read NDJSON file line by line
i = 0
j = 1
entities_data = []
# Read NDJSON file line by line
with open('data/Internify Resume dataset (4).ndjson', 'r', encoding='utf-8') as file:
    for line in file:
        resume = json.loads(line)
        url = resume['data_row']['row_data']
        resume_text = get_resume_text(url)
        print(f"Processing Resume: {j}/602")
        j += 1
        
        entity_list = []
        if 'projects' in resume:
            for project_id, project_info in resume['projects'].items():
                for label_info in project_info['labels']:
                    if 'annotations' in label_info:
                        for annotation in label_info['annotations']['objects']:
                            name = annotation['name']
                            resume_id = resume['data_row']['id']
                            location = annotation.get('location', '')
                            start = annotation['location']['start']
                            end = annotation['location']['end']
                            # entity_list.append((start, end, name))
                            # i += 1
                            # Clean the entity span
                            if start >= 0 and end >= start and end <= len(resume_text):
                                original_start, original_end = start, end
                                start, end = clean_entity_span(resume_text, start, end)
                            
                                # Validate the cleaned span
                                entity_text = resume_text[start:end]
                                if entity_text.strip() == entity_text and not re.match(r'^\s*$', entity_text):
                                    entity_list.append((start, end, name))
                                    i += 1
                                else:
                                    print(f"Invalid entity span detected and removed:")
                                    print(f"  Original span: '{resume_text[original_start:original_end]}'")
                                    print(f"  Cleaned span: '{entity_text}'")
                                    print(f"  Context: '{resume_text[max(0, start-10):min(len(resume_text), end+10)]}'")
                                    print("-----------------------------------------------------")
                            else:
                                print(f"Invalid entity span detected and removed:")
                                print(f"  Invalid start/end positions: start={start}, end={end}")
        
        entities_data.append({
            'text': resume_text,
            'entities': entity_list
        })
print(f"Number of entities processed: {i}")

Processing Resume: 1/602
Processing Resume: 2/602
Processing Resume: 3/602
Processing Resume: 4/602
Processing Resume: 5/602
Processing Resume: 6/602
Invalid entity span detected and removed:
  Original span: ''
  Cleaned span: ''
  Context: 'ilinx ISE V 11 Caden'
-----------------------------------------------------
Processing Resume: 7/602
Processing Resume: 8/602
Processing Resume: 9/602
Processing Resume: 10/602
Processing Resume: 11/602
Processing Resume: 12/602
Invalid entity span detected and removed:
  Original span: ''
  Cleaned span: ''
  Context: 'ills Java C Python A'
-----------------------------------------------------
Processing Resume: 13/602
Processing Resume: 14/602
Processing Resume: 15/602
Processing Resume: 16/602
Processing Resume: 17/602
Processing Resume: 18/602
Processing Resume: 19/602
Processing Resume: 20/602
Processing Resume: 21/602
Processing Resume: 22/602
Processing Resume: 23/602
Processing Resume: 24/602
Processing Resume: 25/602
Invalid entity span d

In [5]:
texts = []
annotations = []
for item in entities_data:
    texts.append(item['text'])
    annotations.append(item['entities'])
df = pd.DataFrame({'text': texts, 'annotations': annotations})
df

Unnamed: 0,text,annotations
0,CONSTRUCTION FOREMAN Summary Looking for a cha...,"[(3226, 3249, Education)]"
1,CONSTRUCTION WORKER Summary A motivated hard w...,"[(1372, 1397, Education)]"
2,CONSTRUCTION INSPECTOR Professional Summary Re...,"[(549, 561, Skills), (563, 573, Skills), (579,..."
3,LEAD CONSTRUCTION MANAGER REGIONAL QUALITY MAN...,"[(332, 348, Skills), (677, 685, Skills), (687,..."
4,CONSTRUCTION MANAGER Summary Energetic Constru...,"[(794, 810, Skills), (841, 860, Skills), (1000..."
...,...,...
597,## John Doe\n**(555) 123-4567 | johndoe@email....,"[(1987, 2025, Education)]"
598,**John Doe**\n(123) 456-7890 | john.doe@email....,"[(508, 510, Skills), (517, 520, Skills), (523,..."
599,## John Doe\n\n(555) 123-4567 | johndoe@email....,"[(567, 568, Skills), (571, 574, Skills), (577,..."
600,## John Doe\n\n**(555) 123-4567** | **johndoe@...,"[(531, 532, Skills), (1209, 1224, Skills)]"


# Preparing Final Data

In [6]:
df

Unnamed: 0,text,annotations
0,CONSTRUCTION FOREMAN Summary Looking for a cha...,"[(3226, 3249, Education)]"
1,CONSTRUCTION WORKER Summary A motivated hard w...,"[(1372, 1397, Education)]"
2,CONSTRUCTION INSPECTOR Professional Summary Re...,"[(549, 561, Skills), (563, 573, Skills), (579,..."
3,LEAD CONSTRUCTION MANAGER REGIONAL QUALITY MAN...,"[(332, 348, Skills), (677, 685, Skills), (687,..."
4,CONSTRUCTION MANAGER Summary Energetic Constru...,"[(794, 810, Skills), (841, 860, Skills), (1000..."
...,...,...
597,## John Doe\n**(555) 123-4567 | johndoe@email....,"[(1987, 2025, Education)]"
598,**John Doe**\n(123) 456-7890 | john.doe@email....,"[(508, 510, Skills), (517, 520, Skills), (523,..."
599,## John Doe\n\n(555) 123-4567 | johndoe@email....,"[(567, 568, Skills), (571, 574, Skills), (577,..."
600,## John Doe\n\n**(555) 123-4567** | **johndoe@...,"[(531, 532, Skills), (1209, 1224, Skills)]"


In [7]:
final_train_df = pd.DataFrame()

final_train_df['Resume Text'] = df['text'][:482]
final_train_df['entities'] = df['annotations'][:482]
final_train_df

Unnamed: 0,Resume Text,entities
0,CONSTRUCTION FOREMAN Summary Looking for a cha...,"[(3226, 3249, Education)]"
1,CONSTRUCTION WORKER Summary A motivated hard w...,"[(1372, 1397, Education)]"
2,CONSTRUCTION INSPECTOR Professional Summary Re...,"[(549, 561, Skills), (563, 573, Skills), (579,..."
3,LEAD CONSTRUCTION MANAGER REGIONAL QUALITY MAN...,"[(332, 348, Skills), (677, 685, Skills), (687,..."
4,CONSTRUCTION MANAGER Summary Energetic Constru...,"[(794, 810, Skills), (841, 860, Skills), (1000..."
...,...,...
477,## Sameer Batra\n\n**Contact:** sameer.batra@[...,"[(1629, 1669, Education), (1721, 1759, Educati..."
478,## John Smith\n\n### Contact\n\n* (123) 456-78...,"[(1822, 1863, Education)]"
479,## Sameer Batra\n\n**Contact:** sameer.batra@e...,"[(1838, 1878, Education), (2036, 2074, Educati..."
480,## Sameer Batra\n\n**Contact:** sameer.batra@e...,"[(1970, 2002, Education), (2005, 2007, Educati..."


In [8]:
final_test_df = pd.DataFrame()
final_test_df['Resume Text'] = df['text'][482:]
final_test_df['entities'] = df['annotations'][482:]
final_test_df

Unnamed: 0,Resume Text,entities
482,## Sameer Batra\n\n**Contact:** sameer.batra@e...,"[(1480, 1520, Education), (1571, 1609, Educati..."
483,## Sameer Batra\n\n**Contact:** sameer.batra@[...,"[(2040, 2080, Education), (2131, 2169, Educati..."
484,## Sameer Batra\n\n**Contact:** sameer.batra@e...,"[(1305, 1345, Education), (1470, 1508, Educati..."
485,## Sameer Batra\n\n**(555) 555-5555** | **same...,"[(1340, 1378, Education), (1565, 1601, Educati..."
486,## Sameer Batra\n\n**Contact:** sameer.batra@e...,"[(1509, 1549, Education), (1599, 1637, Educati..."
...,...,...
597,## John Doe\n**(555) 123-4567 | johndoe@email....,"[(1987, 2025, Education)]"
598,**John Doe**\n(123) 456-7890 | john.doe@email....,"[(508, 510, Skills), (517, 520, Skills), (523,..."
599,## John Doe\n\n(555) 123-4567 | johndoe@email....,"[(567, 568, Skills), (571, 574, Skills), (577,..."
600,## John Doe\n\n**(555) 123-4567** | **johndoe@...,"[(531, 532, Skills), (1209, 1224, Skills)]"


In [9]:
data = []
entities = []
train_data = []
i = 0
for index, row in final_train_df.iterrows():
  text = row['Resume Text']
  entities = row['entities']
  
  train_data.append([text, entities])

In [10]:
data = []
entities = []
test_data = []
for index, row in final_test_df.iterrows():
  text = row['Resume Text']
  entities = row['entities']
  
  test_data.append([text, entities])

In [11]:
from spacy.tokens import DocBin
doc_bin = DocBin()

## Preparing Train.spacy

In [12]:
from spacy.util import filter_spans

In [13]:
nlp = spacy.load("en_core_web_lg")
for text, annot in train_data:
    doc = nlp(text)
    ents = []
    print(f"Entities: {annot}")
    for start, end, label in annot:
        span = doc.char_span(start, end+1, label=label)
        if span is None:
            print('skipping entity')
        else:
            ents.append(span)
    filtered_entities = filter_spans(ents)
    doc.ents = filtered_entities
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")

Entities: [(3226, 3249, 'Education')]
Entities: [(1372, 1397, 'Education')]
Entities: [(549, 561, 'Skills'), (563, 573, 'Skills'), (579, 588, 'Skills'), (723, 729, 'Skills'), (731, 735, 'Skills'), (737, 753, 'Skills'), (755, 757, 'Skills'), (759, 774, 'Skills'), (776, 789, 'Skills'), (791, 817, 'Skills'), (6046, 6087, 'Education'), (6163, 6201, 'Education'), (6216, 6255, 'Education'), (6433, 6439, 'Skills'), (6465, 6477, 'Skills'), (6479, 6491, 'Skills')]
Entities: [(332, 348, 'Skills'), (677, 685, 'Skills'), (687, 695, 'Skills'), (774, 782, 'Skills'), (792, 809, 'Skills'), (848, 854, 'Skills'), (856, 858, 'Skills'), (860, 862, 'Skills'), (968, 987, 'Skills')]
Entities: [(794, 810, 'Skills'), (841, 860, 'Skills'), (1000, 1005, 'Skills')]
Entities: [(246, 260, 'Skills'), (648, 653, 'Skills'), (981, 996, 'Skills'), (998, 1001, 'Skills'), (1003, 1007, 'Skills'), (1009, 1018, 'Skills'), (1020, 1026, 'Skills'), (1028, 1036, 'Skills'), (1038, 1052, 'Skills'), (1054, 1058, 'Skills'), (1060, 1

In [14]:
for text, annot in test_data:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot:
        span = doc.char_span(start, end+1, label=label)
        if span is None:
            print('skipping entity')
        else:
            ents.append(span)
    filtered_entities = filter_spans(ents)
    doc.ents = filtered_entities
    doc_bin.add(doc)
    
doc_bin.to_disk("dev.spacy")

skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping entity
skipping

### Checking The Data

In [15]:
doc_bin = DocBin().from_disk("./train.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
for doc in docs:
    print(f"Text: {doc.text}")
    print("Entities:", [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])

Text: CONSTRUCTION FOREMAN Summary Looking for a challenging and rewarding future in a healthy and competitive organization and to expose my knowledge experience and potential through hard work and dedication I offer myself as an excellent team player with honesty dynamism and commitment Over 8 years of success in overseeing multi million dollar building construction and renovation projects from beginning through occupancy Proven track record of safely and effectively operating a wide range of construction equipment excavators backhoes loaders forklifts and mechanized trenchers Demonstrated ability to effectively manage a team of up to 10 construction workers performing a variety of activities Conversant with conducting project briefings and assigning daily work tasks to construction workers Particularly effective in working within allocated budgets Highlights CONSTRUCTION FOREMAN Project Scheduling Empowerment Regulatory Compliance Planning and Budgeting Crew Management Contract Negot

In [19]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [20]:
!python -m spacy debug data config.cfg --ignore-warnings --verbose --no-format

Data file validation
Pipeline can be initialized with data
Corpus is loadable
Training stats
Language: en
Training pipeline: tok2vec, ner
482 training docs
602 evaluation docs
It's recommended to use at least 2000 examples (minimum 100)
Vocab & Vectors
297148 total word(s) in the data (14396 unique)
10 most common words: '*' (21134), ',' (19747), 'and' (11595), '

' (8819), '.' (6561), 'to' (4998), '

' (4760), 'of' (4743), '-' (3905), 'the' (3324)
No word vectors present in the package
Named Entity Recognition
2 label(s)
0 missing value(s) (tokens with '-' label)
Labels in train data: 'Skills' (6398), 'Education' (595)
Good amount of examples for all labels
Examples without occurrences available for all labels
No entities consisting of or starting/ending with whitespace
Summary
5 checks passed


In [21]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4m[i] No output directory provided[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    157.90    0.14    0.11    0.18    0.00
  0     200       1477.38   8439.04   71.62   69.92   73.41    0.72
  0     400        532.49   2713.26   76.57   71.29   82.68    0.77
  1     600       7765.52   2326.50   80.32   79.50   81.14    0.80
  1     800        344.75   2066.32   80.15   79.74   80.57    0.80
  2    1000        446.73   1995.35   79.65   82.40   77.08    0.80
  2    1200        318.61   1844.24   77.10   85.92   69.92    0.77
  2    1400        331.27   1830.59   81.86   83.13   80.63    0.82
  3    1600        488.65   1615.58   80.59   77.30   84.17    0.81
  3    1800       3658.68   1697.66   78.8