In [1]:
import spacy

In [2]:
import pickle
import random


In [3]:
train_data = pickle.load(open('train_data.pkl', 'rb'))
train_data[0]


('Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [30]:
nlp = spacy.blank('en')

def train_model(train_data):
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last = True)
    
    for _, annotation in train_data:
        for ent in annotation['entities']:
            ner.add_label(ent[2])
            
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            index = 0
            for text, annotations in train_data:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=0.2,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as e:
                    pass
                
            print(losses)
    

In [31]:
train_model(train_data)

Statring iteration 0
{'ner': 12849.667007756303}
Statring iteration 1
{'ner': 9135.672787442774}
Statring iteration 2
{'ner': 12218.261943330564}
Statring iteration 3
{'ner': 5962.759234057488}
Statring iteration 4
{'ner': 6984.376047604001}
Statring iteration 5
{'ner': 7464.385567539543}
Statring iteration 6
{'ner': 5660.690796557983}
Statring iteration 7
{'ner': 4766.868107130604}
Statring iteration 8
{'ner': 6485.084220011915}
Statring iteration 9
{'ner': 5143.012780885376}


In [32]:
nlp.to_disk('nlp_model')

In [33]:
nlp_model = spacy.load('nlp_model')

In [34]:
train_data[0]

("Kasturika Borah Team Member - Cisco  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Kasturika- Borah/9e71468914b38ee8  • Software Engineer with overall 3+ years of experience in Network Monitoring system tool (EM7, Quicksilver) Database tool (SQL, Maria DB) and reporting tool (Splunk) in all the releases. • Relevant experience as a Test engineer for the releases includes Functional testing as well as regression testing. Testing includes writing test cases, execute them and raise bugs. • Relevant 1+ years of experience in handling releases for EM7 with proper documentation, Power pack creation and Tar creation for Sprint releases. • Creating Splunk reports from last 6 months. • Competent technical person involved in requirement gathering, analysis, design and coding. • Experience in coding Python, SQL, and XML as per the requirement. • Have knowledge in Event generating using traps and Syslog's generator. • Exposure to Agile methodologies using Scrum Works framework, even han

In [35]:
doc = nlp_model(train_data[0][0])
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Kasturika Borah
DESIGNATION                   - Team Member
COMPANIES WORKED AT           - Cisco
LOCATION                      - Bengaluru
EMAIL ADDRESS                 - indeed.com/r/Kasturika- Borah/9e71468914b38ee8
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Cisco
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Cisco
COMPANIES WORKED AT           - Cisco
COMPANIES WORKED AT           - Cisco
COLLEGE NAME                  - Compucom Insitute of Information Technology
COLLEGE NAME                  - rajasthan University
SKILLS                        - Database (3 years), Python (3 years), Splunk (Less than 1 year), SQL (3 years), xml (3 years)


In [36]:
!pip install PyMuPDF



In [38]:
import sys, fitz
fname = 'Alice Clark CV.pdf'
doc = fitz.open(fname)
text = ""
for page in doc:
    text = text + str(page.getText())

tx = " ".join(text.split('\n'))
print(tx)

Alice Clark  AI / Machine Learning    Delhi, India Email me on Indeed  •  20+ years of experience in data handling, design, and development  •  Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to  data warehousing and business intelligence  •  Database: Experience in database designing, scalability, back-up and recovery, writing and  optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes.  Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure,  Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake  analytics(U-SQL)  Willing to relocate anywhere    WORK EXPERIENCE  Software Engineer  Microsoft – Bangalore, Karnataka  January 2000 to Present  1. Microsoft Rewards Live dashboards:  Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping  online. Microsoft Rewards members can earn points when searching with Bing, bro

In [39]:
doc = nlp_model(tx)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

NAME                          - Alice Clark
LOCATION                      - Delhi
DESIGNATION                   - Software Engineer
COMPANIES WORKED AT           - Microsoft – Bangalore
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
COMPANIES WORKED AT           - Microsoft
DEGREE                        - Indian Institute of Technology – Mumbai
SKILLS                        - Machine Learning, Natural Language Processing, and Big Data Handling    ADDITIONAL INFORMATION  Professional Skills  • Excellent analytical, problem solving, communication, knowledge transfer and interpersonal  skills with ability to interact with individuals at all the levels  • Quick learner and maintains cordial relationship with project manager and team members and  good performer both in team and independent job environments  • Positive attitude towards superiors &amp; peers  • Supervised j