In [1]:
import spacy
import pickle
import random
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
train_data = pickle.load(open('./data/training/train_data.pkl', 'rb'))


In [4]:
train_data[0]

('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [5]:
nlp = spacy.blank('en')

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last = True)
    
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
            
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in tqdm(range(20)):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            index = 0
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
                
            print(losses)
    
    

In [6]:
train_model(train_data)

  0%|                                                    | 0/20 [00:00<?, ?it/s]

Statring iteration 0


  5%|██▏                                         | 1/20 [00:16<05:10, 16.33s/it]

{'ner': 13052.597557786865}
Statring iteration 1


 10%|████▍                                       | 2/20 [00:32<04:50, 16.13s/it]

{'ner': 12646.69761198669}
Statring iteration 2


 15%|██████▌                                     | 3/20 [00:47<04:29, 15.84s/it]

{'ner': 8846.171866216398}
Statring iteration 3


 20%|████████▊                                   | 4/20 [01:03<04:15, 15.96s/it]

{'ner': 8281.585663212973}
Statring iteration 4


 25%|███████████                                 | 5/20 [01:19<03:58, 15.92s/it]

{'ner': 8158.390096849009}
Statring iteration 5


 30%|█████████████▏                              | 6/20 [01:35<03:42, 15.91s/it]

{'ner': 5425.709286715244}
Statring iteration 6


 35%|███████████████▍                            | 7/20 [01:51<03:25, 15.78s/it]

{'ner': 5332.411704513117}
Statring iteration 7


 40%|█████████████████▌                          | 8/20 [02:06<03:08, 15.70s/it]

{'ner': 5409.625052748773}
Statring iteration 8


 45%|███████████████████▊                        | 9/20 [02:22<02:51, 15.61s/it]

{'ner': 5001.851437900312}
Statring iteration 9


 50%|█████████████████████▌                     | 10/20 [02:37<02:35, 15.54s/it]

{'ner': 4299.680079252118}
Statring iteration 10


 55%|███████████████████████▋                   | 11/20 [02:53<02:19, 15.55s/it]

{'ner': 5820.755321802668}
Statring iteration 11


 60%|█████████████████████████▊                 | 12/20 [03:08<02:04, 15.52s/it]

{'ner': 6358.013331718032}
Statring iteration 12


 65%|███████████████████████████▉               | 13/20 [03:24<01:49, 15.58s/it]

{'ner': 3607.334208724863}
Statring iteration 13


 70%|██████████████████████████████             | 14/20 [03:39<01:33, 15.57s/it]

{'ner': 4462.896693939768}
Statring iteration 14


 75%|████████████████████████████████▎          | 15/20 [03:55<01:17, 15.54s/it]

{'ner': 3641.8460098285586}
Statring iteration 15


 80%|██████████████████████████████████▍        | 16/20 [04:11<01:02, 15.62s/it]

{'ner': 3378.6877964951987}
Statring iteration 16


 85%|████████████████████████████████████▌      | 17/20 [04:26<00:46, 15.58s/it]

{'ner': 4806.548780069955}
Statring iteration 17


 90%|██████████████████████████████████████▋    | 18/20 [04:42<00:31, 15.59s/it]

{'ner': 4166.407776096502}
Statring iteration 18


 95%|████████████████████████████████████████▊  | 19/20 [04:58<00:15, 15.92s/it]

{'ner': 3581.88026194414}
Statring iteration 19


100%|███████████████████████████████████████████| 20/20 [05:15<00:00, 15.78s/it]

{'ner': 3630.5932952886938}





### To Save the Model

In [7]:
nlp.to_disk('nlp_model')

### To Load the Model

In [8]:
nlp_model = spacy.load('nlp_model')

### Test Model

In [9]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Sweety Garg
LOCATION                      - Bengaluru
EMAIL ADDRESS                 - indeed.com/r/Sweety-Garg/9f2d2afa546d730d
DESIGNATION                   - Technical consultant
DEGREE                        - B.Tech in Engineering
COLLEGE NAME                  - SRM University
GRADUATION YEAR               - 2013
DEGREE                        - Engineering
SKILLS                        - TECHNICAL SKILLS: ❖ Expertise in Microsoft Office Power Point, Microsoft Office Word.  ❖ Basic knowledge about C and C++ Programming.


### Test with the Resume

In [10]:
import sys, fitz

fname = './data/test/Alice Clark CV.pdf'
doc = fitz.open(fname)



In [11]:
doc

Document('./data/test/Alice Clark CV.pdf')

In [12]:
text = ""
for page in doc:
    text = text + str(page.get_text())

tx = " ".join(text.split('\n'))
print(tx)

Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, bro

In [13]:
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
COMPANIES WORKED AT           - Microsoft
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft
LOCATION                      - Bangalore
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COLLEGE NAME                  - Indian Institute of Technology
SKILLS                        - SKILLS  Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer

In [None]:
# https://www.youtube.com/watch?v=WpaioLNsoGI
# https://spacy.io/usage/training#quicstart
# https://kgptalkie.com/resume-and-cv-summarization/
# https://github.com/laxmimerit/CV-Parsing-using-Spacy-3