This is a test version of the NER module for a project that extracts important information from text. This version works on an English dataset.


# Installing the spaCy English model (en_core_web_sm)

In [None]:
# Download the small English spaCy pipeline package (en_core_web_sm).
# NOTE(review): the code visible in this notebook builds its pipeline with
# spacy.blank("en") and later loads "en_pipeline"; it never loads
# en_core_web_sm -- confirm whether this download is still needed.
! python -m spacy download en_core_web_sm

# Imports

In [None]:
import json
import os
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import spacy
print(spacy.__version__)
import re
import random

spacy.prefer_gpu()

# Download datasets

In [None]:
%%capture
# Install the opendatasets package used below to fetch the Kaggle dataset.
# The %%capture cell magic above hides pip's output; it has to stay on the
# first line of the cell.
!pip install opendatasets

In [None]:
import opendatasets as od

# Kaggle dataset with annotated resumes used as the NER training corpus.
dataset_url = "https://www.kaggle.com/datasets/harsh907/resume-entities-for-ner-2"
# NOTE(review): presumably this downloads into a local folder named after the
# dataset (the loader below reads ./resume-entities-for-ner-2/...) and may ask
# for Kaggle credentials -- confirm against the opendatasets documentation.
od.download(dataset_url)

# Data preprocessing

In [None]:
def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from annotated entity spans.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            pairs. ``end`` is treated as the index of the last character of
            the entity (the code inspects ``text[end]``).

    Returns:
        A new list of ``[text, {"entities": [[start, end, label], ...]}]``
        items in which every emitted ``end`` is one past the last kept
        character. Entities with an empty label, and entities whose trimmed
        start and end coincide, are left out.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        # An empty text cannot contain any entity, and indexing into it
        # would fail, so its annotations are ignored altogether.
        entities = annotations['entities'] if text else []
        for ent in entities:
            if len(ent) == 0 or len(ent[2]) == 0:
                continue
            label = ent[2]
            valid_start = ent[0]
            # Clamp the end to the last valid index: annotations whose end
            # points at or beyond len(text) used to raise IndexError below.
            valid_end = min(ent[1], len(text) - 1)
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end]):
                valid_end -= 1
            # NOTE(review): kept from the original -- a span that consists of
            # whitespace only ends up with start > end and is swapped rather
            # than dropped; confirm this is the intended handling.
            if valid_start > valid_end:
                valid_start, valid_end = valid_end, valid_start
            if valid_end != valid_start:
                valid_entities.append([valid_start, valid_end + 1, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

def validate_overlap(ALL_DATA):
    """Remove entities that overlap an earlier entity of the same record.

    Entities are visited in their stored order; the first one is always kept
    and each later one is kept only if it does not overlap any entity kept so
    far. Two spans overlap when the start of either one falls inside the
    other (``start <= x < end``).

    Args:
        ALL_DATA: List of records whose last element is a dict with an
            ``'entities'`` list of ``(start, end, label)`` items.

    Returns:
        The same ``ALL_DATA`` object; each ``'entities'`` list is updated in
        place.
    """
    for record in ALL_DATA:
        entities = record[-1]['entities']
        kept = []
        for ent in entities:
            start, end = ent[0], ent[1]
            # Check both directions: the candidate may begin inside a kept
            # span, or a kept span may begin inside the candidate.
            clashes = any(
                other[0] <= start < other[1] or start <= other[0] < end
                for other in kept
            )
            if not clashes:
                kept.append(ent)
        # Build the survivors first and assign once: popping while
        # enumerating made the loop skip the entity after each removal.
        entities[:] = kept
    return ALL_DATA

def convert_doccano_to_spacy(doccano_JSON_FilePath):
    """Read a JSON-lines annotation export and convert it to spaCy tuples.

    Each non-blank line must be a JSON object with a ``content`` string and
    an optional ``annotation`` list. Every annotation contributes one
    ``(start, end, label)`` tuple taken from its first ``points`` entry and
    its first ``label``. Annotations without a label are ignored, and records
    without any annotation are not included in the result.

    Args:
        doccano_JSON_FilePath: Path of the JSON-lines file to read.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` if the file could not be read or parsed (the error is
        logged).
    """
    try:
        training_data = []
        # JSON text is UTF-8 by specification; relying on the platform
        # default encoding breaks on non-ASCII content on some systems.
        with open(doccano_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank or trailing empty line must not abort the import.
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = data.get('annotation', [])
                if not entities:
                    continue
                spacy_ents = []
                for ent in entities:
                    if ent and ent["label"]:
                        points = ent["points"][0]
                        start = points["start"]
                        end = points["end"]
                        label = ent["label"][0]  # Assuming one label per entity
                        spacy_ents.append((start, end, label))
                training_data.append((text, {"entities": spacy_ents}))
        return training_data
    except Exception as e:
        # Lazy %-formatting also works when the path is not a plain str.
        logging.exception("Unable to process %s\nerror = %s", doccano_JSON_FilePath, e)
        return None


In [None]:
# Load the annotated resumes, clean the entity spans and shuffle the records.
DATA_FILE_PATH = "./resume-entities-for-ner-2/train_Data1.json"
ALL_DATA = convert_doccano_to_spacy(DATA_FILE_PATH)
# convert_doccano_to_spacy logs the error and returns None on failure; stop
# here with a clear message instead of failing later inside the cleaning step.
if ALL_DATA is None:
    raise RuntimeError("Failed to load training data from " + DATA_FILE_PATH)
ALL_DATA = trim_entity_spans(ALL_DATA)  # strip whitespace around entity spans
ALL_DATA = validate_overlap(ALL_DATA)  # drop entities overlapping earlier ones
random.shuffle(ALL_DATA)  # in-place shuffle; no seed is set, so order varies per run
print(len(ALL_DATA))

# Train

In [None]:
import pandas as pd
from spacy.util import filter_spans
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

# Convert the cleaned annotations into spaCy Doc objects and serialise them
# into a DocBin file that `spacy train` can read.
nlp = spacy.blank("en")  # blank English pipeline; only its tokenizer is used here
db = DocBin()  # collects the annotated docs

c = 0  # entities that could not be aligned to token boundaries at all
for text, annot in tqdm(ALL_DATA):  # data in previous format
    doc = nlp.make_doc(text)  # tokenise only
    ents = []
    for start, end, label in annot["entities"]:  # character offsets
        span = doc.char_span(start, end, label=label, alignment_mode="strict")
        if span is None:
            # The offsets do not coincide with token boundaries: widen the
            # range to the nearest space on each side, then let spaCy
            # contract it to the tokens lying completely inside.
            s = doc.text
            start -= len(s[:start].rsplit(" ", 1)[-1])
            end += len(s[end:].split(" ", 1)[0])
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
        # A span recovered by the fallback must be kept as well; previously
        # only spans that aligned strictly were appended, so every recovered
        # span was silently discarded.
        if span is None:
            c += 1
        else:
            ents.append(span)
    # doc.ents rejects overlapping spans; filter_spans removes overlaps.
    doc.ents = filter_spans(ents)
    db.add(doc)

db.to_disk("./train.spacy")  # save the docbin object
print(c)

# Config

In [None]:
# Generate a training config (config.cfg) for an English pipeline that contains
# only the "ner" component, using the "efficiency" preset.
# NOTE(review): -F is presumably the short form of --force (overwrite an
# existing config.cfg) -- confirm against the spaCy CLI documentation.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency -F

In [None]:
# Train the pipeline described by config.cfg and write the results to ./output_.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# scores reported during training are measured on the training data itself;
# consider a held-out split.
# --gpu-id 0 selects GPU 0 and --training.max_epochs 20 overrides the epoch
# limit from the config.
! python -m spacy train config.cfg --output ./output_ --paths.train ./train.spacy --paths.dev ./train.spacy  --gpu-id 0 --training.max_epochs 20

In [None]:
# Package the best checkpoint as an installable Python package and install it.
!mkdir base_model
!python -m spacy package output_/model-best/ base_model/
# NOTE(review): the archive path assumes the generated package is named
# "en_pipeline" with version 0.0.0 -- confirm against the output of the
# packaging step above.
!pip install base_model/en_pipeline-0.0.0/dist/en_pipeline-0.0.0.tar.gz

# Test

In [None]:
text = """Abhishek Jha\nApplication Development Associate - Accenture\n\nBengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a\n\n• To work for an organization which provides me the opportunity to improve my skills\nand knowledge for my individual and company's growth in best possible ways.\n\nWilling to relocate to: Bangalore, Karnataka\n\nWORK EXPERIENCE\n\nApplication Development Associate\n\nAccenture -\n\nNovember 2017 to Present\n\nRole: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries\nfor the Bot which will be triggered based on given input. Also, Training the bot for different possible\nutterances (Both positive and negative), which will be given as\ninput by the user.\n\nEDUCATION\n\nB.E in Information science and engineering\n\nB.v.b college of engineering and technology -  Hubli, Karnataka\n\nAugust 2013 to June 2017\n\n12th in Mathematics\n\nWoodbine modern school\n\nApril 2011 to March 2013\n\n10th\n\nKendriya Vidyalaya\n\nApril 2001 to March 2011\n\nSKILLS\n\nC (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTechnical Skills\n\nhttps://www.indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a?isid=rex-download&ikw=download-top&co=IN\n\n\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player"""
# Load the installed "en_pipeline" package (this rebinds the notebook-level
# name `nlp`), run it on the sample resume text defined above and render the
# predicted entities with displaCy.
nlp = spacy.load("en_pipeline")
doc = nlp(text)
spacy.displacy.render(doc, style="ent")
