# Import Libraries


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

# Install Spacy

In [3]:
!pip install spacy==2.1.4

Collecting spacy==2.1.4
  Downloading spacy-2.1.4-cp37-cp37m-manylinux1_x86_64.whl (29.8 MB)
[K     |████████████████████████████████| 29.8 MB 1.6 MB/s 
Collecting blis<0.3.0,>=0.2.2
  Downloading blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 35.6 MB/s 
Collecting plac<1.0.0,>=0.9.6
  Downloading plac-0.9.6-py2.py3-none-any.whl (20 kB)
Collecting thinc<7.1.0,>=7.0.2
  Downloading thinc-7.0.8-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 41.5 MB/s 
Collecting jsonschema<3.1.0,>=2.6.0
  Downloading jsonschema-3.0.2-py2.py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 3.0 MB/s 
[?25hCollecting preshed<2.1.0,>=2.0.1
  Downloading preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82 kB)
[K     |████████████████████████████████| 82 kB 360 kB/s 
Installing collected packages: preshed, plac, blis, thinc, jsonschema, spacy
  Attempting uninstall: preshed
    Found exi

# Create functions to convert Json file to Spacy format

In [4]:
# import logging
import json
import re

# JSON formatting functions
def json_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]

    
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()

    #For each json line
    for line in lines:
        data = json.loads(line)

        #add content key in data dictionary
        text = data['content'].replace("\n", " ")
        entities = []

        #add annotation key in dictionary
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    #find the left and right white spaces and remove them
                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())

                    #move the pointer for white spaces
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff

                    #add the updates locations of the entities
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

def trim_entity_spans(data: list) -> list:
    #Removes leading and trailing white spaces from entity spans.
    #Returns The cleaned data.

    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end

            # remove the whitespaces in the entity spans
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

# Apply above conversion and cleaning on the training data

In [5]:
data = trim_entity_spans(json_to_spacy("/content/Entity Recognition in Resumes.json"))

data[0]


["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

# Cleaning the Data - Remove overlapping entities

In [6]:
def clean_entities(training_data):
    
    clean_data = []
    for text, annotation in training_data:
        
        entities = annotation.get('entities')
        entities_copy = entities.copy()
        
        # append entity only if it is longer than its overlapping entity
        i = 0
        for entity in entities_copy:
            j = 0
            for overlapping_entity in entities_copy:
                # Skip self
                if i != j:
                    e_start, e_end, oe_start, oe_end = entity[0], entity[1], overlapping_entity[0], overlapping_entity[1]
                    # Delete any entity that overlaps, keep if longer
                    if ((e_start >= oe_start and e_start <= oe_end) \
                    or (e_end <= oe_end and e_end >= oe_start)) \
                    and ((e_end - e_start) <= (oe_end - oe_start)):
                        entities.remove(entity)
                j += 1
            i += 1
        clean_data.append((text, {'entities': entities}))
                
    return clean_data

data = clean_entities(data)
data[0]

("Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

### TRAINING NER SPACY MODEL

In [7]:
import random
import math

#splitting the train and test set

def train_test_split(data, test_size, random_state):

    random.Random(random_state).shuffle(data)
    test_idx = len(data) - math.floor(test_size * len(data))
    train_set = data[0: test_idx]
    test_set = data[test_idx: ]

    return train_set, test_set

In [8]:
train_data, test_data = train_test_split(data, test_size = 0.1, random_state = 42)

In [9]:
import spacy
import ast

f = open("/content/skill.txt", "r")
l = f.read()
patterns = l
res = ast.literal_eval(patterns)

def train_spacy():
    
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy

    # adding 'parser', 'ner','tagger' and 'entity_ruler' pipeline components
    if 'ner' not in nlp.pipe_names and 'parser' not in nlp.pipe_names and 'tagger' not in nlp.pipe_names:

        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser, last = True)

        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, last = True)

        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

        ruler = nlp.create_pipe('entity_ruler')
        nlp.add_pipe(ruler, last = True)
        ruler.add_patterns(res)
        
        
    # add labels
    for _, annotations in train_data:
         for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    # get names of other pipes to disable them during training

    #other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    #with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(20):
        print("Starting iteration " + str(itn))
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)
    return nlp

In [10]:
nlp = train_spacy()

Starting iteration 0
{'ner': 23276.257017462623, 'tagger': 5722.872280407645, 'parser': 0.0}
Starting iteration 1
{'parser': 0.0, 'ner': 19873.066430215393, 'tagger': 7.107822704142755e-16}
Starting iteration 2
{'parser': 0.0, 'ner': 15277.770656268653, 'tagger': 7.044494409078274e-16}
Starting iteration 3
{'ner': 13618.23676392248, 'tagger': 6.556077972019632e-16, 'parser': 0.0}
Starting iteration 4
{'ner': 12067.997216046933, 'parser': 0.0, 'tagger': 6.805490417826164e-16}
Starting iteration 5
{'parser': 0.0, 'ner': 11755.930661153074, 'tagger': 9.418288909510104e-16}
Starting iteration 6
{'parser': 0.0, 'ner': 11472.101195391251, 'tagger': 6.601206457719352e-16}
Starting iteration 7
{'tagger': 6.356842108881723e-16, 'parser': 0.0, 'ner': 9128.631876792022}
Starting iteration 8
{'tagger': 6.453534942428895e-16, 'parser': 0.0, 'ner': 9050.064341428717}
Starting iteration 9
{'ner': 9575.540671202074, 'tagger': 6.63278779416483e-16, 'parser': 0.0}
Starting iteration 10
{'tagger': 6.3509

In [11]:
# from google.colab import drive
# drive.mount('/content/drive')

In [12]:
#nlp.to_disk('/content/drive/My Drive/my_model')

In [13]:
import spacy

In [14]:
import pickle

pickle_out = open("nlp.pkl","wb")
pickle.dump(nlp,pickle_out)
pickle_out.close()

In [15]:
# nlp = spacy.load('/content/drive/My Drive/my_model')

# Finding accuracy of the model on the test set


In [16]:
from spacy.gold import GoldParse
from itertools import groupby

def doc_to_bilou(nlp, text):
    
    doc = nlp(text)
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    entities = []
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start, _ = group[0]
        word, last, _ = group[-1]
        end = last + len(word)
        
        entities.append((
                start,
                end,
                entity
            ))

    gold = GoldParse(nlp(text), entities = entities)
    pred_ents = gold.ner
    
    return pred_ents

y_test = []
y_pred = []

for text, annots in test_data:
    
    gold = GoldParse(nlp.make_doc(text), entities = annots.get("entities"))
    ents = gold.ner
    pred_ents = doc_to_bilou(nlp, text)
    
    y_test.append(ents)
    y_pred.append(pred_ents)
    
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)
    
report, accuracy = ner_report(y_test, y_pred)
print(report)

                       precision    recall  f1-score   support

                    -       0.00      0.00      0.00       142
       B-College Name       0.78      0.66      0.71        32
       I-College Name       0.78      0.63      0.70        63
       L-College Name       0.74      0.62      0.68        32
       U-College Name       0.00      0.00      0.00         1
B-Companies worked at       0.56      0.67      0.61        30
I-Companies worked at       0.50      0.50      0.50         4
L-Companies worked at       0.56      0.67      0.61        30
U-Companies worked at       0.34      0.59      0.43        41
             B-Degree       1.00      0.83      0.91        24
             I-Degree       1.00      0.92      0.96        66
             L-Degree       1.00      0.83      0.91        24
             U-Degree       0.40      0.67      0.50         3
        B-Designation       0.73      0.68      0.70        47
        I-Designation       0.76      0.47      0.58  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
print(accuracy)

0.8718145639879914


### Testing data

In [22]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 12.5 MB/s 
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.19.6


In [24]:
import sys, fitz
fname = '/content/Resume_Raghav_2.5.pdf'
doc = fitz.open(fname)
doc

Document('/content/Resume_Raghav_2.5.pdf')

In [25]:
text = ''
for page in doc:
    text = text + str(page.get_text())
tx = " ".join(text.split('\n'))
tx

'Raghav Rastogi  Cherokee Street, Boston • 6177496312    Education    Master of Science in Data Science  Northeastern University, Khoury College of Computer Sciences, Boston, MA  • Graduate Teaching Assistant : Foundations of Data Science  • Coursework : Algorithms, Database Management, NLP, Supervised Machine Learning  Expected May 2023  GPA: 4.0  • Project: Project: Forecast depression, finding key attributes in survey data and predicting depression  based on specific demographic information with an error of around 10%  B.Tech : Electrical and Electronics Engineering  Vellore Institute of Technology, Bangalore, India  • Research project on Smart Charging Schemes For Electric Vehicles To Reduce Its Impact On Grid  • Relevant Coursework : Introduction to programming, Data Structures, Probability and Statistics,  Multivariable calculus, Linear Algebra  Skills and Certifications  • Programming Languages: Python, R, SQL, HTML,CSS  May 2018  GPA: 8.82  • Tools and Framework: TensorFlow, Ke

In [26]:
doc = nlp(tx)
print(doc)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

Raghav Rastogi  Cherokee Street, Boston • 6177496312    Education    Master of Science in Data Science  Northeastern University, Khoury College of Computer Sciences, Boston, MA  • Graduate Teaching Assistant : Foundations of Data Science  • Coursework : Algorithms, Database Management, NLP, Supervised Machine Learning  Expected May 2023  GPA: 4.0  • Project: Project: Forecast depression, finding key attributes in survey data and predicting depression  based on specific demographic information with an error of around 10%  B.Tech : Electrical and Electronics Engineering  Vellore Institute of Technology, Bangalore, India  • Research project on Smart Charging Schemes For Electric Vehicles To Reduce Its Impact On Grid  • Relevant Coursework : Introduction to programming, Data Structures, Probability and Statistics,  Multivariable calculus, Linear Algebra  Skills and Certifications  • Programming Languages: Python, R, SQL, HTML,CSS  May 2018  GPA: 8.82  • Tools and Framework: TensorFlow, Ker

### Extract Skills


In [27]:
def extract_spacy():
    
    nlp1 = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'entity_ruler' not in nlp1.pipe_names:

        ruler = nlp1.create_pipe('entity_ruler')
        nlp1.add_pipe(ruler, last = True)
        ruler.add_patterns(res) 

    return nlp1

    

In [28]:
extract = extract_spacy()

In [29]:
doc = extract(tx)
print(doc)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

Raghav Rastogi  Cherokee Street, Boston • 6177496312    Education    Master of Science in Data Science  Northeastern University, Khoury College of Computer Sciences, Boston, MA  • Graduate Teaching Assistant : Foundations of Data Science  • Coursework : Algorithms, Database Management, NLP, Supervised Machine Learning  Expected May 2023  GPA: 4.0  • Project: Project: Forecast depression, finding key attributes in survey data and predicting depression  based on specific demographic information with an error of around 10%  B.Tech : Electrical and Electronics Engineering  Vellore Institute of Technology, Bangalore, India  • Research project on Smart Charging Schemes For Electric Vehicles To Reduce Its Impact On Grid  • Relevant Coursework : Introduction to programming, Data Structures, Probability and Statistics,  Multivariable calculus, Linear Algebra  Skills and Certifications  • Programming Languages: Python, R, SQL, HTML,CSS  May 2018  GPA: 8.82  • Tools and Framework: TensorFlow, Ker

### Job Description Similarity

In [30]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [31]:

# This is a function to extract skills from the resume-parser dataset to train Doc2Vec model
def get_skills(data):
    temp_skill = []
    #generate a 2-D list skills and name from the resume
    for j in data:
        if j['label'] == ['Skills']:
            temp_skill.append(j['points'][0]['text'])
        if j['label'] == ['Name']:
            temp_name = j['points'][0]['text']
        else:
            temp_name = "No name"

    #clean the list of skills
    for i,j in enumerate(temp_skill):
        j = j.replace("•","")
        j = j.replace('\n',",")
        j = re.sub("[^A-Za-z0-9+-, ]","",j)
        j = j.split(',')
        j = [x for x in j if x!= '']
        temp_skill[i] = j
        
    temp_s = []

    #Convert the 2-D list into a 1-D list
    for j in temp_skill:
        for i in j:
            temp_s.append(i)

    
    return (temp_name, temp_s)


In [32]:
skills = {}
df = pd.read_json('/content/Entity Recognition in Resumes.json', lines = True)
data = df["annotation"]
#Create a dictionary of skills with key as name and value as the list of skills
for i in data:
    name,skill = get_skills(i)
    skills[name] = skill


Cosine Similarity function


In [33]:
#This is a function to calculate cosine similarity between 2 vectors
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

Doc2Vec


In [34]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# This is a function to train the doc2vec model using the skills dictionary
def doc2vec_similarity_train(skills):
    tokenized_dict = {}

    # Here we create a tokenized dictionary with key as name and values as tokens from the skills
    for n,s in skills.items():
        tokenized_list = []
        for i in s:
            x = word_tokenize(i.lower())
            for j in x:
                tokenized_list.append(j)
        tokenized_dict[n] = tokenized_list
    
    # This tags each document (tokens of skills) to feed to the Doc2Vec model as input
    tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_dict.values())]

    # we create the doc2Vec model that outputs a vector of length 40. 
    # Window size = 3 for the continours bag of words
    # We don't count words with count less than 1 
    # and we train for 100 epochs
    model = Doc2Vec(vector_size=40,window = 3, min_count = 1, epochs = 100)
    
    #build and train the model
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    return model

#this is a function to test the job requirement and resume skills and output similarity
def doc2vec_similarity_test(model, req, skill):
    tokenized_req = []
    tokenized_skill = []
    #tokenizes the job requirements
    for i in req:
        x = word_tokenize(i.lower())
        for j in x:
                tokenized_req.append(j)
    #tokenizes the resume skills
    for i in skill:
        x = word_tokenize(i.lower())
        for j in x:
                tokenized_skill.append(j)
    
    #calculates vector for resume skills
    skill_vector = model.infer_vector(tokenized_skill)
    #calculates vector for job requirement skills
    req_vector = model.infer_vector(tokenized_req)
    
    #calculates similarity
    similarity = cosine(skill_vector, req_vector)
    
    return similarity
    

In [35]:
#train the model with the skills dictionary
model = doc2vec_similarity_train(skills)


Sentence Transformers


In [36]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 2.6 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 6.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 34.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 34.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.

In [37]:
from sentence_transformers import SentenceTransformer

#this is a function to calculate similarity using sentence-bert

def sentence_bert_similarity(req, skill):
    #Here we use and import pre trained sentence bert model 
    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    
    # here we are converting the skills into a single string
    req = ",".join(req)
    skill = ",".join(skill)
    
    #calculating the vectors for resume and job description
    skill_vec = sbert_model.encode([skill])[0]
    req_vec = sbert_model.encode([req])[0]
    
    #calculate similarity
    similarity = cosine(skill_vec, req_vec)
    
    return similarity

In [38]:
#get user skills from trained NER model
user_skills = [i.text for i in doc.ents]

In [39]:
#function to clean and preprocess job description 
def clean_job(description):
  description = description.lower()
  description= description.replace("•"," ")
  description = description.replace('\n',",")
  description = re.sub("[^A-Za-z0-9,]"," ",description)

  return description



Data Science Job description

In [40]:
#job description for data science role
job_req = "To qualify you must have a 1. Masters degree in a quantitative discipline (Biomedical Informatics, Computer Science, Machine Learning, Applied Statistics, Mathematics or similar field, Proficiency in at least one programming language (Python, R) and machine learning tools (scikit learn, R), Knowledge of predictive modeling and machine learning concepts, including design, development, evaluation, deployment and scaling to large datasets, Familiarity with computing models for big data Hadoop / MapReduce, Spark etc., Knowledge of databases (Relational / SQL, NOSQL, MongoDB, etc.), Good grasp of software engineering principles. Experience in integrating modern software architectures, Knowledge and some experience in operational aspects of software development and deployment, including automation, testing, virtualization and container technology, Knowledge of clinical and operational aspects of healthcare delivery, Excellent written and oral communication skills for a variety of audiences, Preferred Qualifications, PhD degree in a quantitative field (Biomedical Informatics, Computer Science, Machine Learning, Applied Statistics, Mathematics or similar field) + 2 years experience, Demonstrated skills in design and implementation of complex machine learning models, Demonstrated knowledge of software engineering and operational skills through prior projects."

job_req = clean_job(job_req)
 
print(job_req)

# run the ner model on the job description and extract the skills
doc = extract(job_req)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

job_skills = [i.text for i in doc.ents]


#similarity using sentence_bert
print(sentence_bert_similarity(job_skills, user_skills))

#similarity using doc2vec
print(doc2vec_similarity_test(model, job_skills,user_skills))


to qualify you must have a 1  masters degree in a quantitative discipline  biomedical informatics, computer science, machine learning, applied statistics, mathematics or similar field, proficiency in at least one programming language  python, r  and machine learning tools  scikit learn, r , knowledge of predictive modeling and machine learning concepts, including design, development, evaluation, deployment and scaling to large datasets, familiarity with computing models for big data hadoop   mapreduce, spark etc , knowledge of databases  relational   sql, nosql, mongodb, etc  , good grasp of software engineering principles  experience in integrating modern software architectures, knowledge and some experience in operational aspects of software development and deployment, including automation, testing, virtualization and container technology, knowledge of clinical and operational aspects of healthcare delivery, excellent written and oral communication skills for a variety of audiences, 

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

0.84415627
0.5684721


Software Engineering Job Description


In [41]:
#job description for software engineer role
job_req = "Computer Science, Engineering or related degree with a minimum GPA of 3.0 or higher Software engineering skills and experience with some of the following technologies: Java, .NET, Node.js, Python, Angular, React, AWS, Azure, GCP, SQL or mobile Knowledge of common data structures and algorithmsStrong problem-solving and software triage skills with the ability to work cross-functionally in a fast-paced and rapidly changing work environment Strong analytical and interpersonal communication skills"

job_req = clean_job(job_req)

print(job_req)

# run the ner model on the job description and extract the skills
doc = extract(job_req)
for ent in doc.ents:
    print(f'{ent.label_.upper():{30}}- {ent.text}')

job_skills = [i.text for i in doc.ents]

#similarity using sentence_bert
print(sentence_bert_similarity(job_skills, user_skills))

#similarity using doc2vec
print(doc2vec_similarity_test(model, job_skills,user_skills))

computer science, engineering or related degree with a minimum gpa of 3 0 or higher software engineering skills and experience with some of the following technologies  java,  net, node js, python, angular, react, aws, azure, gcp, sql or mobile knowledge of common data structures and algorithmsstrong problem solving and software triage skills with the ability to work cross functionally in a fast paced and rapidly changing work environment strong analytical and interpersonal communication skills
SKILL                         - computer science
SKILL                         - engineering
SKILL                         - software engineering
SKILL                         - java
SKILL                         - node js
SKILL                         - python
SKILL                         - angular
SKILL                         - react
SKILL                         - azure
SKILL                         - mobile
SKILL                         - data structures
SKILL                         - soft