In [1]:
import spacy
import pickle
import fitz
import warnings
import spacy
from spacy import displacy
from spacy.training import Example
import pickle
import random
import json
import re
from sklearn.metrics import classification_report
# Suppress only the warnings whose message starts with spaCy's "[W030]" code;
# every other warning is still shown.
warnings.filterwarnings("ignore", message=r"\[W030\]")

# NER training function

In [85]:
# Start from a blank English pipeline; train_model() below adds an 'ner'
# component to this module-level object and trains it.
nlp = spacy.blank('en')

def train_model(train_data, n_iter=10, drop=0.2):
    """Train the NER component of the module-level ``nlp`` pipeline.

    Adds an ``'ner'`` pipe to ``nlp`` when it is missing, registers every
    label found in ``train_data``, then runs ``n_iter`` passes of
    one-example-at-a-time updates with all other pipes disabled.

    Args:
        train_data: Sequence of ``(text, annotations)`` pairs. ``annotations``
            is a dict whose ``'entities'`` value is a list of three-item
            entries; the third item is used as the entity label. The caller's
            sequence is left in its original order (a private copy is
            shuffled).
        n_iter: Number of passes over the data. Defaults to 10.
        drop: Dropout rate forwarded to ``nlp.update``. Defaults to 0.2.

    Returns:
        None. Progress, the per-iteration loss dict, and a summary of any
        skipped examples are printed.
    """
    if 'ner' not in nlp.pipe_names:
        # Add the NER pipeline to the NLP model by name
        nlp.add_pipe('ner', last=True)

    # Get the NER component from the pipeline
    ner = nlp.get_pipe('ner')

    # Work on a copy: shuffling the caller's list in place would silently
    # change what indices such as train_data[0] refer to in later cells.
    examples = list(train_data)

    # Register every label that appears in the annotations.
    for _, annotations in examples:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # Disable every other pipe so only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        # NOTE(review): spaCy v3 documents nlp.initialize() as the successor of
        # begin_training() -- confirm the installed version before switching.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            skipped = 0
            first_error = None
            for text, annotations in examples:
                try:
                    # Create an Example object for each training instance
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    # Update the model with the Example object
                    nlp.update(
                        [example],  # Batch of Example objects
                        drop=drop,  # Dropout - make it harder to memorize data
                        sgd=optimizer,  # Callable to update weights
                        losses=losses)
                except Exception as e:
                    # Best effort: a bad document (e.g. conflicting entity
                    # spans) is skipped, but counted so the loss is not
                    # silently computed on less data than expected.
                    skipped += 1
                    if first_error is None:
                        first_error = e

            print(losses)
            if skipped:
                print(f"Skipped {skipped} of {len(examples)} examples; "
                      f"first error: {first_error}")

# First NER model

## Loading the dataset

In [6]:
# Peek at the pickled training set to confirm its layout before training.
# NOTE: pickle.load can execute code embedded in the file -- only open trusted data.
input_file = 'train_data.pkl'
try:
    with open(input_file, 'rb') as f:
        data = pickle.load(f)
        if not (isinstance(data, list) and len(data) > 0):
            print("Error: The data structure in the .pkl file is empty or not a list.")
        else:
            # NER training expects each item to be a tuple, so show its type first.
            print("Format of the first item:", type(data[0]))
            print("First item:", data[0])
except FileNotFoundError:
    print("Error: The input file does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")


Format of the first item: <class 'tuple'>
First item: ('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://ww

In [7]:
# Load the first training set. The context manager closes the file handle
# that a bare pickle.load(open(...)) would leave open.
# NOTE: pickle.load can execute code embedded in the file -- only open trusted data.
with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

## Training the first model

In [6]:
# Train the first model on the pickled training data loaded above.
try:
    if train_data:
        train_model(train_data)
    else:
        print("Training data is empty.")
except Exception as e:
    # The data is already in memory here, so a failure comes from training,
    # not from loading.
    print(f"Error training the model: {e}")

Starting iteration 0
{'ner': 6828.171925347045}
Starting iteration 1
{'ner': 4395.715847712157}
Starting iteration 2
{'ner': 2507.2566905149843}
Starting iteration 3
{'ner': 2284.291106134359}
Starting iteration 4
{'ner': 2497.4110026419935}
Starting iteration 5
{'ner': 1630.607526862903}
Starting iteration 6
{'ner': 1423.8682003389397}
Starting iteration 7
{'ner': 1426.2483074225695}
Starting iteration 8
{'ner': 1304.9306828203803}
Starting iteration 9
{'ner': 1206.07932385505}


## Saving the first model

In [7]:
# Save the pipeline trained above under 'nlp_ner_model'; a later cell reloads
# it from that path with spacy.load.
nlp.to_disk('nlp_ner_model')

# Second NER model

## Loading the dataset

In [8]:
#loading the data for json file format
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines export into spaCy-style training tuples.

    Each non-blank line of the file is parsed as one JSON record with a
    ``'content'`` string and an ``'annotation'`` list (or ``null``). Newlines
    in the content are replaced by spaces. For every annotation the first
    entry of ``'points'`` supplies inclusive ``start``/``end`` offsets and the
    annotated ``text``; leading and trailing whitespace of that text is
    trimmed off the offsets, and the end is converted to an exclusive offset.

    Args:
        dataturks_JSON_FilePath: Path to the UTF-8 encoded JSON-lines file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    training_data = []
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue
            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]
                    if not labels:
                        continue

                    # The offsets depend only on the point, so compute them
                    # once rather than once per label.
                    point_text = point['text']
                    leading_ws = len(point_text) - len(point_text.lstrip())
                    trailing_ws = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + leading_ws
                    point_end = point['end'] - trailing_ws
                    # Dataturks end offsets are inclusive; +1 makes them exclusive.
                    entities.extend(
                        (point_start, point_end + 1, label) for label in labels)
            training_data.append((text, {"entities": entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from every entity span.

    Args:
        data: Items of the form ``(text, {'entities': [(start, end, label),
            ...]})`` where ``end`` is an exclusive character offset.

    Returns:
        A new list of ``[text, {'entities': [[start, end, label], ...]}]``
        items with each span narrowed to exclude surrounding whitespace.
        Offsets are clamped to the bounds of ``text``, and spans that end up
        empty (whitespace-only, zero-length, or entirely out of range) are
        dropped instead of being returned with start > end.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            # Clamp first so an offset past the text cannot raise IndexError.
            valid_start = max(start, 0)
            valid_end = min(end, len(text))
            # Each scan stops at the other boundary, so the two offsets can
            # meet but never cross.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_start >= valid_end:
                # Nothing but whitespace (or nothing at all) was annotated.
                continue
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [9]:
# Convert the Dataturks export to spaCy tuples, trim whitespace from the entity
# spans, then display one converted record as a spot check.
dataturks_path = "Entity Recognition in Resumes.json"
train_data2 = trim_entity_spans(convert_dataturks_to_spacy(dataturks_path))
train_data2[200]

['Chaban kumar Debbarma Tripura - Email me on Indeed: indeed.com/r/Chaban-kumar-Debbarma/bf721c55fb380d19  Willing to relocate to: Agartala, Tripura - Tripura  WORK EXPERIENCE  Microsoft  -  June 2018 to December 2018  I want full time jobs  EDUCATION  10th  School  https://www.indeed.com/r/Chaban-kumar-Debbarma/bf721c55fb380d19?isid=rex-download&ikw=download-top&co=IN',
 {'entities': [[277, 328, 'Email Address'],
   [257, 263, 'College Name'],
   [251, 255, 'Degree'],
   [175, 184, 'Companies worked at'],
   [139, 146, 'Location'],
   [52, 103, 'Email Address'],
   [22, 29, 'Location'],
   [0, 21, 'Name']]}]

## Training the model

In [65]:
# Train the second model on the converted Dataturks data.
try:
    if train_data2:
        train_model(train_data2)
    else:
        print("Training data is empty.")
except Exception as e:
    # The data is already in memory here, so a failure comes from training,
    # not from loading.
    print(f"Error training the model: {e}")

Starting iteration 0
{'ner': 9497.58259765291}
Starting iteration 1
{'ner': 4623.628019280945}
Starting iteration 2
{'ner': 3009.4312399461774}
Starting iteration 3
{'ner': 2555.4946249874865}
Starting iteration 4
{'ner': 2311.659011917309}
Starting iteration 5
{'ner': 2130.24619241971}
Starting iteration 6
{'ner': 2022.2324279438317}
Starting iteration 7
{'ner': 1742.5072192930722}
Starting iteration 8
{'ner': 1747.1796054833994}
Starting iteration 9
{'ner': 1567.4053748861297}


## Saving the model

In [66]:
# Save the pipeline under 'nlp_ner_model2'; a later cell reloads it from that
# path with spacy.load.
# NOTE(review): `nlp` is the same module-level object that was trained for the
# first model, so labels registered for the first dataset may still be present
# in this snapshot -- confirm that is intended.
nlp.to_disk('nlp_ner_model2')

# Testing and comparing both models using the dataset

In [10]:
# Reload the first model from the path it was saved to ('nlp_ner_model').
nlp_model = spacy.load('nlp_ner_model')

In [11]:
# Reload the second model from the path it was saved to ('nlp_ner_model2').
nlp_model2 = spacy.load('nlp_ner_model2')

## First model results

In [12]:
# Run the first model on the text of one train_data2 record and print each
# predicted entity as "<LABEL, upper-cased and padded to 40 chars>-<entity text>".
doc = nlp_model(train_data2[15][0])
for ent in doc.ents:
    print(f"{ent.label_.upper():{40}}-{ent.text}")

NAME                                    -Darshan G.
COMPANIES WORKED AT                     -Oracle
LOCATION                                -Bengaluru
EMAIL ADDRESS                           -indeed.com/r/Darshan-G/025a61a82c6a8c5a
DESIGNATION                             -Financial Analyst
COMPANIES WORKED AT                     -Oracle
COMPANIES WORKED AT                     -Accenture
DEGREE                                  -MBA in Finance
COLLEGE NAME                            -Adhichunchanagiri Institute Of Technology
GRADUATION YEAR                         -2013
COLLEGE NAME                            -B B M in Education
SKILLS                                  -Excel (Less than 1 year), MS Excel (Less than 1 year), Tally (Less than 1 year)


In [13]:
# Show the entities of `doc` (the first model's prediction above) with displaCy's
# entity style, rendered for Jupyter.
displacy.render(doc, style = "ent",jupyter = True)

## Second model results

In [14]:
# Run the second model on the text of the first train_data2 record and print
# each predicted entity as "<LABEL, upper-cased and padded to 30 chars>-<entity text>".
doc = nlp_model2(train_data2[0][0])
for ent in doc.ents:
    print(f"{ent.label_.upper():{30}}-{ent.text}")

NAME                          -Abhishek Jha
DESIGNATION                   -Application Development Associate
COMPANIES WORKED AT           -Accenture
LOCATION                      -Bengaluru
EMAIL ADDRESS                 -indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a
LOCATION                      -Bangalore
DESIGNATION                   -Application Development Associate
COMPANIES WORKED AT           -Accenture
DEGREE                        -B.E in Information science and engineering
COLLEGE NAME                  -B.v.b college of engineering and technology
COLLEGE NAME                  -Woodbine modern school
DEGREE                        -10th
COLLEGE NAME                  -Kendriya Vidyalaya
SKILLS                        -C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year), Database Management System (Less than 1 year), Java (Less than 1 year)  ADDITIONAL INFORMATION  Technical Skills
SKILLS                        -• Programming language: C, C++, J

In [15]:
# Show the entities of `doc` (the second model's prediction above) with displaCy's
# entity style, rendered for Jupyter.
displacy.render(doc, style = "ent",jupyter = True)

# Testing and comparing both models on pdf resumes

## Upload the pdf and convert it to text

In [16]:
# Extract the PDF's text page by page, then flatten it to a single line so both
# models receive the resume as one string.
fname = 'Alice Clark CV.pdf'
# The context manager closes the PDF once its text has been read; the original
# left the document open.
with fitz.open(fname) as doc:
    text = "".join(page.get_text() for page in doc)
# Replace every newline with a space (runs of newlines become runs of spaces).
tx = " ".join(text.split('\n'))
print(tx)

Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, bro

## First model results

In [17]:
# Run the first model on the flattened PDF text `tx` and print each predicted
# entity as "<LABEL, upper-cased and padded to 30 chars>- <entity text>".
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
COMPANIES WORKED AT           - Microsoft
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COLLEGE NAME                  - Indian Institute of Technology – Mumbai
GRADUATION YEAR               - 2001


In [18]:
# Show the first model's entities for the PDF text with displaCy's entity style,
# rendered for Jupyter.
displacy.render(doc, style = "ent",jupyter = True)

## Second model results

In [19]:
# Run the second model on the flattened PDF text `tx` and print each predicted
# entity as "<LABEL, upper-cased and padded to 30 chars>- <entity text>".
doc = nlp_model2(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
EMAIL ADDRESS                 - •
SKILLS                        - Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft
LOCATION                      - Bangalore
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COLLEGE NAME                  - Indian Institute of Technology
SKILLS                        - Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the 

In [20]:
# Show the second model's entities for the PDF text with displaCy's entity style,
# rendered for Jupyter.
displacy.render(doc, style = "ent",jupyter = True)