In [1]:
!pip install spacy[transformers]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Mount Google Drive at /content/gdrive; the dataset and model paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import json
import random
import re

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

In [5]:
# Project folder on the mounted Drive; the dataset and model directories
# referenced in this notebook are located beneath it.
ROOT_DIR = "/content/gdrive/MyDrive/ResumeRanker"

In [6]:
def extract_data_from_json_spacy3(filepath):
    """Read a JSON-lines annotation file and return cleaned entity spans.

    Every non-blank line of ``filepath`` is parsed as one JSON record with a
    ``content`` string and an ``annotation`` list (or ``null``). For each
    annotation the first entry of ``points`` supplies ``start``, ``end`` and
    ``text``; ``label`` may be a string or a list, in which case its first
    element is used. ``end`` is treated as inclusive (1 is added to obtain an
    exclusive offset).

    Newlines in the content are replaced by single spaces, which keeps all
    character offsets unchanged. Spans are first shrunk by the leading and
    trailing whitespace of the annotated text and then snapped outwards /
    inwards so that they start and end on whitespace-delimited word
    boundaries.

    Annotations that cannot yield a usable span are dropped: empty label
    lists, empty ``points`` lists, whitespace-only annotated text and spans
    that end up empty or inverted.

    Args:
        filepath: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        A list with one ``{'text': str, 'entities': [(start, end, label), ...]}``
        dict per record, where ``end`` is exclusive.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or annotation lacks an expected key.
    """
    dataset = []
    # Explicit encoding: the texts contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at EOF) are not records and
        # would make json.loads fail.
        lines = [line for line in f if line.strip()]

    for line in tqdm(lines,desc='Extracting Data    '):
        data = json.loads(line)
        # One-for-one replacement, so annotation offsets stay valid.
        text = data['content'].replace("\n", " ")
        entities = []
        for annotation in data['annotation'] or []:
            points = annotation['points']
            if not points:
                continue
            point = points[0]

            labels = annotation['label']
            if isinstance(labels, list):
                if not labels:
                    continue
                label = labels[0]
            else:
                label = labels

            point_text = point['text']
            if not point_text.strip():
                # A selection made only of whitespace cannot be an entity.
                continue

            # Exclude whitespace the annotator selected around the entity.
            lspace = len(point_text) - len(point_text.lstrip())
            rspace = len(point_text) - len(point_text.rstrip())
            point_start = point['start'] + lspace
            point_end = point['end'] - rspace
            # +1 turns the inclusive end offset into an exclusive one.
            entities.append((point_start, point_end + 1 , label))
        dataset.append((text, {"entities" : entities}))

    invalid_span_tokens = re.compile(r'\s')
    is_space = invalid_span_tokens.match

    cleaned_data = []
    for text, annotations in tqdm(dataset,desc='Processing Entities'):
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Move the start forward off whitespace, or backward to the
            # beginning of the word it falls into.
            while 0 < valid_start < len(text):
                if is_space(text[valid_start]):
                    valid_start += 1
                elif not is_space(text[valid_start - 1]):
                    valid_start -= 1
                else:
                    break
            # Move the (exclusive) end backward off whitespace, or forward to
            # the end of the word it falls into.
            while 1 < valid_end < len(text):
                if is_space(text[valid_end - 1]):
                    valid_end -= 1
                elif not is_space(text[valid_end]):
                    valid_end += 1
                else:
                    break
            # Empty or inverted spans can never be aligned to tokens.
            if valid_start < valid_end:
                valid_entities.append((valid_start, valid_end, label))
        cleaned_data.append({'text':text,'entities': valid_entities})
    return cleaned_data

In [7]:
# Annotated resumes (one JSON record per line), located under the project folder.
json_file_path = f"{ROOT_DIR}/Dataset/Entity Recognition in Resumes.json"
training_data = extract_data_from_json_spacy3(json_file_path)

Extracting Data    : 100%|██████████| 220/220 [00:00<00:00, 10334.04it/s]
Processing Entities: 100%|██████████| 220/220 [00:00<00:00, 12040.80it/s]


In [7]:
# Blank English pipeline (no trained components); the following cell uses it
# through nlp.make_doc to turn each text into a Doc.
nlp = spacy.blank("en")
# Container that collects the annotated Docs before they are written to disk.
doc_bin = DocBin()

In [8]:
# Start from an empty DocBin so that re-running this cell does not append
# every document a second time.
doc_bin = DocBin()

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # char_span returns None when the character offsets cannot be mapped
        # to a token span under the chosen alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # doc.ents rejects overlapping spans, so overlaps are filtered out first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

# Save the DocBin next to the training config, under the project folder.
doc_bin.to_disk(f"{ROOT_DIR}/Models/SPACY3_NER/training_data.spacy")

 71%|███████▏  | 157/220 [00:01<00:00, 127.49it/s]

Skipping entity


100%|██████████| 220/220 [00:02<00:00, 105.36it/s]


In [9]:
cd /content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER

/content/gdrive/MyDrive/ResumeRanker/Models/SPACY3_NER


In [10]:
# Expand base_config.cfg into a complete training config saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# Train the pipeline from config.cfg on GPU 0 and write the output into the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# evaluation during training is measured on the training data itself —
# consider a held-out dev set.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-06-20 09:24:32,070] [INFO] Set up nlp object from config
[2022-06-20 09:24:32,079] [INFO] Pipeline: ['transformer', 'ner']
[2022-06-20 09:24:32,083] [INFO] Created vocabulary
[2022-06-20 09:24:32,084] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSeq

In [8]:
# Load the model-best pipeline saved in the training output folder.
nlp = spacy.load(f"{ROOT_DIR}/Models/SPACY3_NER/model-best")

In [9]:
# Run the loaded pipeline on the first two training texts and list the
# entities it finds, one "LABEL - text" line per entity.
for example in training_data[:2]:
    text = example["text"]
    print("Data :", text, sep="\n")
    doc = nlp(text.replace('\n', ' '))
    for ent in doc.ents:
        label_column = ent.label_.upper().ljust(20)
        print(f'{label_column} - {ent.text}')

Data :
Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  

In [16]:
text = '''
Bhanu Prakash Pebbeti 

ML/DL Enthusiast | Implementation based learner | Looking for an opportunity to expand my
learning, knowledge and skills which help me in achieving greater practical excellence and
contribute to the success of the organization. 

pebbetibhanu2017@gmail.com 

+91 6303733897 

Hyderabad, Telangana, India 

www.hackerrank.com/bhanuprakash_b12 

linkedin.com/in/bhanu-prakash-pebbeti-700b80191 

github.com/BhanuPrakashPebbeti 

EDUCATION 

ELECTRONICS AND COMMUNICATION
ENGINEERING | B.TECH 
National Institute of Technology Calicut 
2019 - Present,  

CGPA-8.72/10(till 5th sem) 

SKILLS 

Python 

ML 

AI 

DL 

WORK EXPERIENCE 

INTERMEDIATE 
Narayana Junior College,Hyderabad 
2017 - 2019,  

Percentage-97.7% 

Member at AI Club NITC (11/2020 - Present)
One of the member at AI Club NITC, aimed at high quality
Artiﬁcial Intelligence research and developing Artiﬁcial
Intelligence systems for real world applications. 

SECONDARY HIGH SCHOOL-SSC 
Shivappa High School,Hyderabad 
2017,  

GPA-9.5/10 

Computer Vision Engineer at Intelligent
Mobility Labs (06/2021 - Present) 
Research Lab focused on Self Driving Technology and
Autonomous Mobile Robots. 

PROJECTS 

Automation  of  Cleaning  Cervical  dataset  using  deep
learning techniques (01/2021 - 05/2021) 

Used Supervised contrastive learning to remove outliers and boost
our classiﬁer performance. 

Multi Task Learning(MTL) for Self Driving Technology
 (05/2021 - Present)

Worked on Perception stack for Indian Road Conditions which
includes Semantic segmentation, Depth Estimation and Object
detection using MTL. 

Reinforcement Learning to solve Games

Worked on models like Reinforce, Sarsa, Q-Learning, DQN, Deuling
DQN to solve games like Balancing Pendulum, CartPole, Lunar
Lander from OpenAI Gym and custom made environments like Flappy
Bird. 

Image Generation using VQVAE

Used VQVAE to learn discreate representations of the images and
then a gpt prior is trained on top of these representations to
generate new images. 

CERTIFICATIONS 

Applied Data Science With Python
Specialization (08/2020)
Coursera-University of Michigan 

Neural Networks and Deep Learning (08/2020)

Coursera-deeplearning.ai 

Python for Everybody Specialization (05/2020)

Coursera-University of Michigan 

LANGUAGES 

English 
Fluent 

Telugu 
Native 

Sudoko Solver

Application made using python which solves sudoko puzzles with a
simple Graphical user interface made using pygame. 

INTERESTS 

Reading blogs 

Playing Sports(cricket) 
'''

In [17]:
# Flatten the line breaks of the sample text to spaces, run the pipeline on
# it and print each entity as "LABEL - text".
doc = nlp(text.replace('\n', ' '))
for ent in doc.ents:
    print(ent.label_.upper().ljust(20), ent.text, sep=' - ')

NAME                 - Bhanu Prakash Pebbeti
COLLEGE NAME         - National Institute of Technology Calicut
SKILLS               - Python   ML   AI   DL
COLLEGE NAME         - Narayana Junior College,Hyderabad
COLLEGE NAME         - Coursera-University of Michigan
COLLEGE NAME         - Coursera-University of Michigan
