In [1]:
!pip install spacy[transformers]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.7-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.5 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.1 MB/s 
[?25hCollecting transformers<4.21.0,>=3.4.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 52.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 1

In [2]:
# Mount Google Drive at /content/gdrive; the dataset and model paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [12]:
import re
import json
import spacy
import random
from tqdm import tqdm
from spacy.tokens import DocBin
from spacy.util import filter_spans

In [4]:
# Project root on the mounted Drive; the dataset and model directories used in
# this notebook are located beneath it.
ROOT_DIR = "/content/gdrive/MyDrive/ResumeRanker"

In [5]:
def _snap_span_to_tokens(text, start, end):
    """Align a character span to whitespace-delimited token boundaries.

    Surrounding whitespace is trimmed and a span that begins or ends in the
    middle of a token is widened to cover the whole token.

    :param text: the document the offsets refer to
    :param start: inclusive start offset
    :param end: exclusive end offset
    :return: ``(start, end)`` or ``None`` when no non-whitespace text is left
    """
    is_space = re.compile(r'\s').match
    size = len(text)

    # Clamp first so an annotation that overshoots the text cannot index out of range.
    start = min(max(start, 0), size)
    end = min(max(end, 0), size)

    # Start: skip leading whitespace, then walk back to the first character of the token.
    while start < size and is_space(text[start]):
        start += 1
    while 0 < start < size and not is_space(text[start - 1]):
        start -= 1

    # End: drop trailing whitespace, then walk forward to the end of the token.
    while end > 0 and is_space(text[end - 1]):
        end -= 1
    while 0 < end < size and not is_space(text[end]):
        end += 1

    return (start, end) if start < end else None


def extract_data_from_json_spacy3(filepath, target_labels=("College Name", "Degree")):
    """Convert a line-delimited JSON annotation export into spaCy-style records.

    Every non-blank line of ``filepath`` must be a JSON object with a
    ``content`` string and an ``annotation`` list (or null). Each annotation
    carries a ``label`` (a string or a list whose first item is used) and a
    ``points`` list; only the first point is read, through its ``start``,
    ``end`` (treated as inclusive) and ``text`` fields.

    Newlines in ``content`` are replaced by single spaces, which keeps all
    character offsets valid. Whitespace around an annotated text is excluded
    from its span and spans are aligned to whitespace-delimited tokens.

    :param filepath: path to the UTF-8 encoded JSON-lines file
    :param target_labels: labels to keep; everything else is ignored
    :return: list of ``{'text': str, 'entities': [(start, end, label), ...]}``
             with exclusive ``end`` offsets
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    if isinstance(target_labels, str):
        # A bare string would otherwise be treated as a collection of characters.
        target_labels = (target_labels,)
    wanted_labels = set(target_labels)

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines (for example a trailing newline) are not records.
        lines = [line for line in f if line.strip()]

    dataset = []
    for line in tqdm(lines, desc='Extracting Data    '):
        data = json.loads(line)
        # One-for-one replacement, so the annotated offsets still line up.
        text = data['content'].replace("\n", " ")
        entities = []
        for annotation in data.get('annotation') or []:
            points = annotation.get('points')
            if not points:
                continue
            point = points[0]

            labels = annotation['label']
            if isinstance(labels, list):
                if not labels:
                    continue
                label = labels[0]
            else:
                label = labels
            if label not in wanted_labels:
                continue

            # Exclude whitespace that the annotator selected around the entity.
            point_text = point['text']
            lspace = len(point_text) - len(point_text.lstrip())
            rspace = len(point_text) - len(point_text.rstrip())
            point_start = point['start'] + lspace
            point_end = point['end'] - rspace
            # The source 'end' is inclusive; downstream code expects exclusive ends.
            entities.append((point_start, point_end + 1, label))
        dataset.append((text, {"entities": entities}))

    cleaned_data = []
    for text, annotations in tqdm(dataset, desc='Processing Entities'):
        valid_entities = []
        for start, end, label in annotations['entities']:
            snapped = _snap_span_to_tokens(text, start, end)
            if snapped is None:
                # Nothing but whitespace was annotated; such a span cannot become an entity.
                continue
            valid_entities.append((snapped[0], snapped[1], label))
        cleaned_data.append({'text': text, 'entities': valid_entities})
    return cleaned_data

In [6]:
# Build the dataset path from ROOT_DIR so the project root is defined in one place.
json_file_path = f"{ROOT_DIR}/Dataset/Entity Recognition in Resumes.json"
# Parse the annotated resumes into {'text', 'entities'} records.
training_data = extract_data_from_json_spacy3(json_file_path)

Extracting Data    : 100%|██████████| 220/220 [00:00<00:00, 12211.30it/s]
Processing Entities: 100%|██████████| 220/220 [00:00<00:00, 65405.93it/s]


In [8]:
# Spot-check one converted record: its text and the (start, end, label) entity tuples.
training_data[2]

{'entities': [(3421, 3458, 'College Name'), (3381, 3419, 'Degree')],
 'text': "Akhil Yadav Polemaina Hyderabad, Telangana - Email me on Indeed: indeed.com/r/Akhil-Yadav-Polemaina/ f6931801c51c63b1  ● Senior System Engineer at Infosys with 3.2 years of experience in software development and Maintenance. ● Maintained data processing using mainframe technology for multiple front end applications of Walmart Retail Link platform and ensured on-time deliverables. ● Worked on automating the uses cases to reduce manual effort in solving repeating incidents using Service Now orchestration. ● Possess good analytical, logical ability and systematic approach to problem analysis, strong debugging and troubleshooting skills. ● Good exposure to Retail domain.  Willing to relocate to: hyderbad, Telangana  WORK EXPERIENCE  Senior Systems Engineer  Infosys Limited -  Hyderabad, Telangana -  January 2015 to Present  ● Working on all the Major and Minor Enhancement requests as part of Maintenance and Supp

In [9]:
nlp = spacy.blank("en") # blank English pipeline; used below to turn raw text into Doc objects
doc_bin = DocBin() # container that collects the annotated Docs before they are written to disk

In [13]:
# Start from an empty DocBin each time this cell runs; reusing the container
# created in an earlier cell would append the same documents again on a re-run.
doc_bin = DocBin()

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the offsets cannot be mapped onto tokens.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # Entities on a Doc must not overlap, so overlapping spans are filtered out first.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("/content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER_CLG/training_data.spacy") # save the docbin object

100%|██████████| 220/220 [00:02<00:00, 108.95it/s]


In [14]:
cd /content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER_CLG

/content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER_CLG


In [16]:
# Expand base_config.cfg into a complete training configuration saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [17]:
# Train on GPU 0 with config.cfg and write the output to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# scores reported during training are measured on data the model has seen;
# consider holding out a separate dev set.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-07-14 07:17:51,465] [INFO] Set up nlp object from config
[2022-07-14 07:17:51,476] [INFO] Pipeline: ['transformer', 'ner']
[2022-07-14 07:17:51,481] [INFO] Created vocabulary
[2022-07-14 07:17:51,482] [INFO] Finished initializing nlp object
Downloading: 100% 481/481 [00:00<00:00, 498kB/s]
Downloading: 100% 878k/878k [00:00<00:00, 8.44MB/s]
Downloading: 100% 446k/446k [00:00<00:00, 3.93MB/s]
Downloading: 100% 1.29M/1.29M [00:00<00:00, 11.1MB/s]
Downloading: 100% 478M/478M [00:07<00:00, 63.6MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequ

In [18]:
# Load the trained pipeline from the model-best directory; this rebinds `nlp`,
# which until now referred to the blank pipeline.
nlp = spacy.load("/content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER_CLG/model-best")

In [19]:
# Run the loaded pipeline over the first two training records and list every
# entity it finds next to its upper-cased label.
for example in training_data[:2]:
    text = example["text"]
    print("Data :")
    print(text)
    flattened = text.replace('\n', ' ')
    doc = nlp(flattened)
    for ent in doc.ents:
        print(f'{ent.label_.upper():{20}} - {ent.text}')

Data :
Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  

In [20]:
text = '''
Bhanu Prakash Pebbeti 

ML/DL Enthusiast | Implementation based learner | Looking for an opportunity to expand my
learning, knowledge and skills which help me in achieving greater practical excellence and
contribute to the success of the organization. 

pebbetibhanu2017@gmail.com 

+91 6303733897 

Hyderabad, Telangana, India 

www.hackerrank.com/bhanuprakash_b12 

linkedin.com/in/bhanu-prakash-pebbeti-700b80191 

github.com/BhanuPrakashPebbeti 

EDUCATION 

ELECTRONICS AND COMMUNICATION
ENGINEERING | B.TECH 
National Institute of Technology Calicut 
2019 - Present,  

CGPA-8.72/10(till 5th sem) 

SKILLS 

Python 

ML 

AI 

DL 

WORK EXPERIENCE 

INTERMEDIATE 
Narayana Junior College,Hyderabad 
2017 - 2019,  

Percentage-97.7% 

Member at AI Club NITC (11/2020 - Present)
One of the member at AI Club NITC, aimed at high quality
Artiﬁcial Intelligence research and developing Artiﬁcial
Intelligence systems for real world applications. 

SECONDARY HIGH SCHOOL-SSC 
Shivappa High School,Hyderabad 
2017,  

GPA-9.5/10 

Computer Vision Engineer at Intelligent
Mobility Labs (06/2021 - Present) 
Research Lab focused on Self Driving Technology and
Autonomous Mobile Robots. 

PROJECTS 

Automation  of  Cleaning  Cervical  dataset  using  deep
learning techniques (01/2021 - 05/2021) 

Used Supervised contrastive learning to remove outliers and boost
our classiﬁer performance. 

Multi Task Learning(MTL) for Self Driving Technology
 (05/2021 - Present)

Worked on Perception stack for Indian Road Conditions which
includes Semantic segmentation, Depth Estimation and Object
detection using MTL. 

Reinforcement Learning to solve Games

Worked on models like Reinforce, Sarsa, Q-Learning, DQN, Deuling
DQN to solve games like Balancing Pendulum, CartPole, Lunar
Lander from OpenAI Gym and custom made environments like Flappy
Bird. 

Image Generation using VQVAE

Used VQVAE to learn discreate representations of the images and
then a gpt prior is trained on top of these representations to
generate new images. 

CERTIFICATIONS 

Applied Data Science With Python
Specialization (08/2020)
Coursera-University of Michigan 

Neural Networks and Deep Learning (08/2020)

Coursera-deeplearning.ai 

Python for Everybody Specialization (05/2020)

Coursera-University of Michigan 

LANGUAGES 

English 
Fluent 

Telugu 
Native 

Sudoko Solver

Application made using python which solves sudoko puzzles with a
simple Graphical user interface made using pygame. 

INTERESTS 

Reading blogs 

Playing Sports(cricket) 
'''

In [21]:
# Flatten the sample resume onto one line, run the pipeline and list the
# recognised entities with their upper-cased labels.
single_line_text = text.replace('\n', ' ')
doc = nlp(single_line_text)
for ent in doc.ents:
    print(f'{ent.label_.upper():{20}} - {ent.text}')

DEGREE               - ELECTRONICS AND COMMUNICATION ENGINEERING
DEGREE               - | B.TECH
COLLEGE NAME         - National Institute of Technology Calicut


In [23]:
%%capture
# Install the parsers used by the text-extraction helpers defined below;
# %%capture keeps pip's output out of the notebook.
# NOTE(review): `pdfminer` on PyPI is the legacy distribution; `pdfminer.six`
# is the maintained fork and provides the same `pdfminer` import — confirm
# which one is intended.
!pip install docx2txt
!pip install pdfminer

In [28]:
import io
import re
import nltk
import pandas as pd
from dateutil import parser
import nltk
import docx2txt
from datetime import datetime
from dateutil import relativedelta
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pickle


def _iter_pdf_page_texts(pdf_stream):
    """Yield the extracted text of each page read from an open binary PDF stream.

    Stops silently when the stream cannot be parsed (``PDFSyntaxError``).
    """
    # One resource manager per document lets shared resources be reused across pages.
    resource_manager = PDFResourceManager()
    try:
        for page in PDFPage.get_pages(pdf_stream, caching=True, check_extractable=True):
            page_buffer = io.StringIO()
            converter = TextConverter(
                resource_manager,
                page_buffer,
                laparams=LAParams(),
            )
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                page_text = page_buffer.getvalue()
            finally:
                # Release the handles before yielding, so they are closed even if
                # processing fails or the caller stops iterating early.
                converter.close()
                page_buffer.close()
            yield page_text
    except PDFSyntaxError:
        return


def extract_text_from_pdf(pdf_path):
    """
    Helper function to extract the plain text from .pdf files
    :param pdf_path: path of a local PDF file, or an ``io.BytesIO`` that
        already holds the PDF bytes (e.g. a downloaded file)
    :return: generator yielding one string of extracted text per page;
        yields nothing further once a ``PDFSyntaxError`` is encountered
    """
    if isinstance(pdf_path, io.BytesIO):
        # in-memory pdf: read straight from the buffer
        yield from _iter_pdf_page_texts(pdf_path)
    else:
        # local pdf file: keep it open only while pages are being read
        with open(pdf_path, "rb") as fh:
            yield from _iter_pdf_page_texts(fh)


def get_number_of_pages(file_name):
    """
    Count the pages of a PDF
    :param file_name: ``io.BytesIO`` holding the PDF bytes, or the path of a
        local file
    :return: number of pages; ``None`` when a local path does not end in
        ``.pdf`` or when the PDF raises ``PDFSyntaxError``
    """

    def _page_total(stream):
        # Consume the page iterator and tally how many pages it produced.
        return sum(
            1
            for _ in PDFPage.get_pages(stream, caching=True, check_extractable=True)
        )

    try:
        # in-memory pdf
        if isinstance(file_name, io.BytesIO):
            return _page_total(file_name)
        # local file: anything that is not a .pdf is not counted
        if not file_name.endswith(".pdf"):
            return None
        with open(file_name, "rb") as fh:
            total = _page_total(fh)
        return total
    except PDFSyntaxError:
        return None


def extract_text_from_docx(doc_path):
    """
    Helper function to extract plain text from .docx files
    :param doc_path: path to .docx file to be extracted
    :return: the non-empty lines of the document joined by single spaces,
        with tabs turned into spaces; a single space if docx2txt raises
        ``KeyError``
    """
    try:
        raw_text = docx2txt.process(doc_path)
        kept_lines = []
        for raw_line in raw_text.split("\n"):
            if not raw_line:
                # empty lines carry no text
                continue
            kept_lines.append(raw_line.replace("\t", " "))
        return " ".join(kept_lines)
    except KeyError:
        return " "


def extract_text(file_path, extension):
    """
    Wrapper function to detect the file extension and call text
    extraction function accordingly
    :param file_path: path of file of which text is to be extracted
    :param extension: extension of the file including the leading dot
        (``.pdf``, ``.docx`` or ``.txt``); matched case-insensitively
    :return: extracted text, or an empty string for any other extension
    """
    # Normalise so that ".PDF" or a missing extension do not need special handling.
    extension = (extension or "").lower()
    if extension == ".pdf":
        # Every page is prefixed with a space so adjacent pages do not run together.
        return "".join(" " + page for page in extract_text_from_pdf(file_path))
    if extension == ".docx":
        return extract_text_from_docx(file_path)
    if extension == ".txt":
        with open(file_path, "r") as file:
            # Turn line breaks into spaces; removing them outright would fuse the
            # last word of a line with the first word of the next one.
            return file.read().replace("\n", " ")
    return ""

In [31]:
# Extract the text of a sample PDF resume; this rebinds `text`, replacing the
# pasted sample resume used earlier.
text = extract_text("/content/vidhi.pdf",".pdf")

In [32]:
# Inspect the raw extracted text.
# NOTE(review): the rendered output includes personal contact details — clear it before sharing.
text

' github.com/vidhsss/\n(+91) 8447202370\nvidhijain.contact@gmail.com\n\nlinkedin.com/in/vidhijain23\nPortfolio\nI am highly inclined towards emerging technologies with passion and knowledge towards machine learning, computer vision, data science,\nand data structures. My out-of-the-box ideas, hardworking nature, and management skills can be an asset.As a sophomore, I started working\nas a research student which gave me the ability to explore different fields and innovate with deep learning and machine learning.\n\nVIDHI JAIN\n\nEDUCATION\nBachelor of Technology, Electrical, Netaji Subhas Institute of Technology, GPA: 8.62/10.00\nHigher secondary, The Heritage School, Percentage: 93.2/100\nEXPERIENCE\nResearch Student\nNetaji Subhas Institute of Technology , Delhi\n• Working under Dr. Deepak Kumar Sharma, Department of Information Technology, NSUT, introduced an efficient power saving model for\n\nAugust 2019— present\nApril 2017— May 2019\n\nMarch 2021-present\n\nsmart homes using time

In [33]:
# Flatten the extracted PDF text onto one line, run the pipeline and list the
# recognised entities with their upper-cased labels.
single_line_text = text.replace('\n', ' ')
doc = nlp(single_line_text)
for ent in doc.ents:
    print(f'{ent.label_.upper():{20}} - {ent.text}')

DEGREE               - Bachelor of Technology, Electrical
COLLEGE NAME         - Netaji Subhas Institute of Technology
COLLEGE NAME         - Netaji Subhas Institute of Technology
DEGREE               - e-commerce
