In [1]:
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

In [2]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, encoding="utf8") as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

In [3]:
################### Train Spacy NER.###########
def train_spacy():

    TRAIN_DATA = convert_dataturks_to_spacy("traindata.json")
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    #test the model and evaluate it
    examples = convert_dataturks_to_spacy("testdata.json")
    tp=0
    tr=0
    tf=0

    ta=0
    c=0        
    for text,annot in examples:

        f=open("resume"+str(c)+".txt","w", encoding='utf-8')
        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):

            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[0,0,0,0,0,0]
        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]  
            if(d[ent.label_][0]==0):
                #f.write("For Entity "+ent.label_+"\n")   
                #f.write(classification_report(y_true, y_pred)+"\n")
                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                d[ent.label_][0]=1
                d[ent.label_][1]+=p
                d[ent.label_][2]+=r
                d[ent.label_][3]+=f
                d[ent.label_][4]+=a
                d[ent.label_][5]+=1
        c+=1
    for i in d:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
        print("Precision : "+str(d[i][1]/d[i][5]))
        print("Recall : "+str(d[i][2]/d[i][5]))
        print("F-score : "+str(d[i][3]/d[i][5]))

In [4]:
train_spacy()

Statring iteration 0
{'ner': 6457.626186404118}
Statring iteration 1
{'ner': 3922.7332458961623}
Statring iteration 2
{'ner': 3380.6075062233976}
Statring iteration 3
{'ner': 2935.8592867321286}
Statring iteration 4
{'ner': 2591.141894535311}
Statring iteration 5
{'ner': 2285.3888324927107}
Statring iteration 6
{'ner': 2152.1570752364855}
Statring iteration 7
{'ner': 1923.178305102996}
Statring iteration 8
{'ner': 1746.272026233502}
Statring iteration 9
{'ner': 1649.7589684202035}


  'recall', 'true', average, warn_for)



 For Entity Name

Accuracy : 99.83805668016194%
Precision : 0.9983831936194594
Recall : 0.9983805668016195
F-score : 0.9981113185060555

 For Entity Location

Accuracy : 99.27125506072875%
Precision : 0.9927657005623397
Recall : 0.9927125506072875
F-score : 0.9897446574315648

 For Entity Email Address

Accuracy : 99.43319838056681%
Precision : 1.0
Recall : 0.994331983805668
F-score : 0.9971579374746244

 For Entity Companies worked at

Accuracy : 98.78542510121457%
Precision : 1.0
Recall : 0.9878542510121457
F-score : 0.9938900203665988

 For Entity Designation

Accuracy : 99.83805668016194%
Precision : 1.0
Recall : 0.9983805668016195
F-score : 0.9991896272285252

 For Entity College Name

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0

 For Entity Skills

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0


In [5]:
import csv
import re
import spacy
import sys 
from importlib import reload
reload(sys)
import pandas as pd
#sys.setdefaultencoding('utf-8')
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from spacy.lang.xx import MultiLanguage
import os
import sys, getopt
import numpy as np
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen

In [6]:
#Function converting pdf to string
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    
    
    infile = open(fname, 'rb')
    
    
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close
    return text


In [136]:
"""

def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = file("resume.pdf", 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close

    with open('1.txt', 'w') as pdf_file:
        pdf_file.write(text)

    return text
    
"""    

'\n\ndef convert(fname, pages=None):\n    if not pages:\n        pagenums = set()\n    else:\n        pagenums = set(pages)\n\n    output = StringIO()\n    manager = PDFResourceManager()\n    converter = TextConverter(manager, output, laparams=LAParams())\n    interpreter = PDFPageInterpreter(manager, converter)\n\n    infile = file("resume.pdf", \'rb\')\n    for page in PDFPage.get_pages(infile, pagenums):\n        interpreter.process_page(page)\n    infile.close()\n    converter.close()\n    text = output.getvalue()\n    output.close\n\n    with open(\'1.txt\', \'w\') as pdf_file:\n        pdf_file.write(text)\n\n    return text\n    \n'

In [7]:
#Function to extract names from the string using spacy
def extract_name(string):
    r1 = str(string)
    nlp = MultiLanguage()
    doc = nlp(r1)
    for ent in doc.ents:
        if(ent.label_ == 'PER'):
            print(ent.text)
            break

In [8]:
#Function to extract Phone Numbers from string using regular expressions
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

In [9]:
#Function to extract Email address from a string using regular expressions
def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

In [10]:
#Converting pdf to string
resume_string = convert("resume.pdf")
resume_string1 = resume_string
#Removing commas in the resume for an effecient check
resume_string = resume_string.replace(',',' ')

In [11]:
#Converting all the charachters in lower case
resume_string = resume_string.lower()

In [12]:
#Information Extraction Function
def extract_information(string):
    string.replace (" ", "+")
    query = string
    soup = BeautifulSoup(urlopen("https://en.wikipedia.org/wiki/" + query), "html.parser")
    #creates soup and opens URL for Google. Begins search with site:wikipedia.com so only wikipedia
    #links show up. Uses html parser.
    for item in soup.find_all('div', attrs={'id' : "mw-content-text"}):
        print(item.find('p').get_text())
        print('\n')

In [13]:
with open('techatt.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    your_listatt = list(reader)

In [14]:
with open('techskill.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    your_list = list(reader)

In [15]:
with open('nontechnicalskills.csv', 'rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    your_list1 = list(reader)

In [32]:
#Sets are used as it has a a constant time for lookup hence the overall the time for the total code will not exceed O(n)
s = set(your_list[0])
s1 = your_list
s2 = your_listatt
skillindex = []
skills = []
skillsatt = []
print('\n')
extract_name(resume_string1)
print('\n')

y = extract_phone_numbers(resume_string) #Phone Number as in resume
y1 = []







In [18]:
for i in range(len(y)):
    if(len(y[i])>9):
        y1.append(y[i])

In [19]:
print(y1)
print('\n')
print('Email id is')
print(extract_email_addresses(resume_string))

['9933995693']


Email id is
['ashaywalke@iitkgp.ac.in']


In [20]:
for word in resume_string.split(" "):
    if word in s:
        skills.append(word)

In [22]:
skills1 = list(set(skills))
print('\n')
print('\n')
np_a1 = np.array(your_list)







In [23]:
for i in range(len(skills1)):
    item_index = np.where(np_a1==skills1[i])
    skillindex.append(item_index[1][0])


In [24]:
nlen = len(skillindex)

In [25]:
print("Following are his/her Technical Skills\n\n")

for i in range(nlen):
    print(skills1[i])
    print(s2[0][skillindex[i]])
    print('\n')

Following are his/her Technical Skills


algorithms
Algorithms and Design Patterns


numpy
Mathematical Functions Library


c
Programming Language


python
Programmig Language


opencv
Computer Vision


scikit-learn
Machine Learning


matplotlib
Data Visualization


c++
Programming Language


matlab
Programming Language




In [26]:
#Sets are used as it has a a constant time for lookup hence the overall the time for the total code will not exceed O(n)
s1 = set(your_list1[0])
nontechskills = []

In [27]:
for word in resume_string.split(" "):
    if word in s1:
        nontechskills.append(word)

In [30]:
nontechskills = set(nontechskills)
print('\n')

print("Following are his/her Non Technical Skills")
list5 = list(nontechskills)
print('\n')
print(list5)



Following are his/her Non Technical Skills


[]
