<a href="https://colab.research.google.com/github/sachitmunjal/Resume-Parser/blob/main/Project_Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RESUME** **PARSER**

In [None]:
# Installing the spacy library
!pip install spacy==2.1.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing the required libraries and functions

import json
import spacy
import random
import math
import logging
import pandas as pd

# Importing functions from spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer

# Importing functions from sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

Splitting the dataset into training and testing data

In [None]:
# Read the annotated resumes (one JSON object per line).
# Blank lines are dropped and surrounding whitespace/newlines removed so that
# every record can be written back on its own line after shuffling.
with open('/content/Entity Recognition in Resumes.json', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Taking 70% of the data for training and 30% for testing.
# random_state is fixed so that re-running the notebook produces the same split.
train, test = train_test_split(lines, test_size=0.3, random_state=42)

# Creating training_data.json (exactly one record per line)
with open("training_data.json", "w", encoding='utf-8') as outfile:
    for obj in train:
        outfile.write(obj + "\n")

# Creating testing_data.json (exactly one record per line)
with open("testing_data.json", "w", encoding='utf-8') as outfile:
    for obj in test:
        outfile.write(obj + "\n")

Training the NER model on the dataset

In [None]:
# Function to convert the resumes to a format that the spacy library can process.
def convert_json_to_py(JSON_FilePath):
    """Convert a JSON-lines annotation file into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON object holding a
    'content' string and an 'annotation' list. Each annotation contributes one
    entity per label, using the first entry of its 'points' list.

    Args:
        JSON_FilePath: path of the JSON-lines file to read.

    Returns:
        A list of (text, {"entities": [(start, end, label), ...]}) tuples, or
        None if the file could not be read or parsed (the error is logged).
    """
    try:
        # training_data stores one (text, annotations) tuple per resume
        training_data = []

        with open(JSON_FilePath, 'r', encoding='utf-8') as file:
            for line in file:
                # A blank line is not a record; skipping it keeps one stray
                # newline from invalidating the whole file.
                if not line.strip():
                    continue

                # Converting json object to dictionary
                data = json.loads(line)
                text = data['content']
                entities = []

                # A record without annotations (missing or null) yields no entities
                for annotation in data.get('annotation') or []:
                    point = annotation['points'][0]
                    labels = annotation['label']

                    # A single label may be stored as a bare value instead of a list
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # json indices are both inclusive [start, end] but spacy is not [start, end)
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data

    # Any failure is logged and reported to the caller as None
    except Exception as e:
        # str() keeps the logging call itself from failing on non-string paths
        logging.exception("Unable to process " + str(JSON_FilePath) + "\n" + "error = " + str(e))
        return None

# Training and evaluating the NER model

In [None]:
# Training the model and evaluating it on the held-out resumes
def _biluo_label(tag):
    """Return the entity label carried by a BILUO tag such as 'B-Name'.

    Returns None for tags that carry no label ('O', '-', or a missing tag).
    """
    if tag and len(tag) > 2 and tag[1] == '-':
        return tag[2:]
    return None


def _write_entities(path, doc):
    """Write the entities predicted for `doc` to `path`, grouped by label."""
    grouped = {}
    for ent in doc.ents:
        grouped.setdefault(ent.label_, set()).add(ent.text)

    # The context manager guarantees the file is flushed and closed;
    # sorting makes the file content deterministic between runs.
    with open(path, "w") as out:
        for label in sorted(grouped):
            out.write("\n\n")
            out.write(label + ":" + "\n")
            for value in sorted(grouped[label]):
                out.write(value.replace('\n', '') + "\n")


def _accumulate_scores(nlp, doc, text, annotations, totals):
    """Add the token-level scores of one test document to `totals`.

    For every label predicted in `doc`, the tokens are scored as a binary
    "label" vs "Not label" problem against the gold annotations, and the
    weighted precision/recall/f-score and the accuracy are added to
    totals[label] = [precision, recall, fscore, accuracy, n_documents].
    """
    if not doc.ents:
        return

    # The gold parse depends only on the document, so build it once
    gold = GoldParse(nlp.make_doc(text), entities=annotations.get("entities"))
    gold_labels = [_biluo_label(tag) for tag in gold.ner]

    seen = set()
    for ent in doc.ents:
        label = ent.label_
        if label in seen:
            continue
        seen.add(label)

        negative = 'Not ' + label
        # Exact label comparison: a substring test would count
        # "College Name" tokens as "Name".
        y_true = [label if gold_label == label else negative for gold_label in gold_labels]
        y_pred = [label if token.ent_type_ == label else negative for token in doc]

        precision, recall, fscore, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted')
        accuracy = accuracy_score(y_true, y_pred)

        entry = totals.setdefault(label, [0.0, 0.0, 0.0, 0.0, 0])
        entry[0] += precision
        entry[1] += recall
        entry[2] += fscore
        entry[3] += accuracy
        entry[4] += 1


def train_spacy(n_iter=10, dropout=0.2,
                train_path="/content/training_data.json",
                test_path="/content/testing_data.json"):
    """Train a blank English NER model and evaluate it on the test resumes.

    Training prints the iteration number and the loss dictionary for every
    iteration. Evaluation writes the entities predicted for the i-th test
    resume to 'resume<i>.txt' and finally prints, per entity label, the
    accuracy, precision, recall and f-score averaged over all test resumes in
    which the label was predicted.

    Args:
        n_iter: number of passes over the training data.
        dropout: dropout rate passed to nlp.update.
        train_path: JSON-lines file with the training resumes.
        test_path: JSON-lines file with the test resumes.

    Raises:
        ValueError: if the training or the testing file could not be loaded.
    """
    # extracting train data from json by using json to py converter function
    TRAIN_DATA = convert_json_to_py(train_path)
    if TRAIN_DATA is None:
        raise ValueError("Could not load training data from " + str(train_path))

    # blank English pipeline; the ner component is added if it is not present
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # each entity is a (start, end, label) tuple; register every label with the ner component
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # disable every other pipe during training so that only ner is updated
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        # begin_training initialises the model and returns the optimizer
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            # shuffle so the examples are seen in a different order each iteration
            random.shuffle(TRAIN_DATA)
            losses = {}
            skipped = 0
            # text -> input, annotations -> expected output
            for text, annotations in TRAIN_DATA:
                try:
                    nlp.update(
                        [text],  # batch of texts
                        [annotations],  # batch of annotations
                        drop=dropout,  # dropout - make it harder to memorise data
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
                except Exception as error:
                    # Best effort: an example spaCy cannot process is skipped,
                    # but it is counted so the loss of data stays visible.
                    skipped += 1
                    logging.debug("Skipping a training example: %s", error)
            if skipped:
                logging.warning(
                    "Iteration %d: skipped %d training example(s) that could not be processed",
                    itn, skipped)
            print(losses)

    # test the model and evaluate it
    examples = convert_json_to_py(test_path)
    if examples is None:
        raise ValueError("Could not load testing data from " + str(test_path))

    # label -> [precision, recall, fscore, accuracy, n_documents], summed over ALL test documents
    totals = {}
    for index, (text, annot) in enumerate(examples):
        # only the text is given to the model; the predicted entities are in doc.ents
        doc_to_test = nlp(text)
        _write_entities("resume" + str(index) + ".txt", doc_to_test)
        _accumulate_scores(nlp, doc_to_test, text, annot, totals)

    for label, (p_sum, r_sum, f_sum, a_sum, n_docs) in totals.items():
        print("\n For Entity " + label + "\n")
        print("Accuracy : " + str((a_sum / n_docs) * 100) + "%")
        print("Precision : " + str(p_sum / n_docs))
        print("Recall : " + str(r_sum / n_docs))
        print("F-score : " + str(f_sum / n_docs))

In [None]:
# Train the NER model, then evaluate it on the test resumes:
# writes one resumeN.txt file per test resume and prints per-entity metrics.
train_spacy()

Starting iteration 0
{'ner': 18894.05935380143}
Starting iteration 1
{'ner': 11471.082485291567}
Starting iteration 2
{'ner': 10527.813256252683}
Starting iteration 3
{'ner': 8654.825265563624}
Starting iteration 4
{'ner': 10377.452781661481}
Starting iteration 5
{'ner': 6542.791312447571}
Starting iteration 6
{'ner': 6374.255223468657}
Starting iteration 7
{'ner': 5950.349536865915}
Starting iteration 8
{'ner': 5791.696513799835}
Starting iteration 9
{'ner': 6087.136466322255}

 For Entity Name

Accuracy : 96.15384615384616%
Precision : 0.963076923076923
Recall : 0.9615384615384616
F-score : 0.955520669806384

 For Entity Location

Accuracy : 96.15384615384616%
Precision : 0.9630467571644042
Recall : 0.9615384615384616
F-score : 0.9523076923076922

 For Entity Email Address

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Designation

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Companies worked at

Accuracy : 100.0%
Precision : 1

# Matching the resumes with the job description

In [None]:
# Function to match the resume with job description provided
def resumeMatching(percent_matching, n_resumes=None, job_description_path='job_description.txt'):
    """Score every parsed resume against the job description.

    The i-th resume is read from 'resume<i>.txt'. The resume and the job
    description are turned into word-count vectors and their cosine
    similarity, as a percentage rounded to two decimals, is appended to
    `percent_matching`.

    Args:
        percent_matching: list that receives one score per resume (mutated in place).
        n_resumes: number of resume files to score; defaults to len(test),
            where `test` is the notebook-level test split.
        job_description_path: text file holding the job description.

    Returns:
        The same `percent_matching` list, with the new scores appended.
    """
    if n_resumes is None:
        n_resumes = len(test)
    if n_resumes == 0:
        return percent_matching

    # The job description is the same for every resume, so it is read once
    with open(job_description_path) as job_file:
        job_description = job_file.read()

    # Iterating through all the resumes in test
    for i in range(n_resumes):
        with open('resume' + str(i) + '.txt') as resume_file:
            resume = resume_file.read()

        # Word counts of the two texts, in a shared vocabulary
        count_matrix = CountVectorizer().fit_transform([resume, job_description])

        # Entry [0][1] is the similarity between the resume and the job description
        matchPercentage = cosine_similarity(count_matrix)[0][1] * 100
        percent_matching.append(round(matchPercentage, 2))  # round to two decimal
    return percent_matching

In [None]:
percent_matching = resumeMatching([])
# Creating a dataframe to show the result. The column is named at construction
# so that matching_percent itself carries the "% Matching" header
# (DataFrame.rename returns a new frame and leaves the original untouched).
matching_percent = pd.DataFrame({"% Matching": percent_matching})
matching_percent