##### Reference https://promptapi.com/blog/article/build-your-own-resume-parser-using-python-and-nlp
##### API for skills https://api.emsidata.com/

## Importing libraries

In [166]:
# !pip install pdfminer.six
# !pip install docx2txt
# !pip install pdf2image

In [1]:
import os

In [2]:
import docx2txt
# import nltk

In [3]:
import re

In [4]:
import pytesseract
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to override it; the fallback is
# the machine-specific Windows install path this notebook was written with.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", "D:/Program Files/Tesseract-OCR/tesseract"
)

In [4]:
from pdfminer.high_level import extract_text
# from pdf2image import convert_from_path

In [5]:
import requests
import json

## downloading required nltk packages

In [6]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

## Extract text from docx

In [7]:
def extract_text_from_docx(docx_path):
    """Return the text that ``docx2txt.process`` yields for ``docx_path``.

    Tab characters in the result are replaced with single spaces. When
    ``docx2txt.process`` returns a falsy value (e.g. an empty string),
    ``None`` is returned instead.
    """
    raw_text = docx2txt.process(docx_path)
    # Falsy result -> signal "no text" with None rather than an empty string.
    return raw_text.replace('\t', ' ') if raw_text else None

## Extract text from image

In [8]:
def extract_text_from_image(img_path):
    """Return the text that ``pytesseract.image_to_string`` yields for ``img_path``.

    Tab characters in the result are replaced with single spaces. A falsy
    result (e.g. an empty string) is reported as ``None``.
    """
    ocr_text = pytesseract.image_to_string(img_path)
    if not ocr_text:
        # Nothing recognised: report "no text" as None.
        return None
    return ocr_text.replace('\t', ' ')

## Extract text from pdf

In [9]:
def extract_text_from_pdf(pdf_path):
    """Return whatever pdfminer's ``extract_text`` produces for ``pdf_path``.

    The result is passed through unchanged (no tab replacement and no
    ``None`` substitution for empty text).
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text

In [10]:

# def find_gender(name):
#     url = "https://api.genderize.io/?name="
#     n = name
#     r = requests.get(url+n).json()
#     gender =  r["gender"]
#     if gender:
#         return gender
#     else:
#         return "Cannot identify gender"
    

## Extract phone numbers

In [11]:
def extract_phone_number(resume_text):
    """Return the first plausible phone number found in ``resume_text``.

    A candidate is an optional leading ``+`` or ``(``, a non-zero digit, at
    least eight characters drawn from digits, spaces, dots, dashes and
    parentheses, and a final digit. Candidates of 16 characters or more are
    rejected.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields ``None``.

    Returns:
        The first candidate shorter than 16 characters, or ``None`` when no
        candidate qualifies.
    """
    if not resume_text:
        return None

    PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')

    # Inspect every candidate, not just the first one: an over-long digit run
    # earlier in the text (an ID number, say) must not hide a valid phone
    # number that appears later.
    for match in PHONE_REG.finditer(resume_text):
        number = match.group()
        if len(number) < 16:
            return number
    return None


## Extract name (taken from the file name)

In [12]:
def get_name(path):
    """Return the base file name of ``path`` up to (not including) its first dot.

    Everything after the first ``.`` is dropped, so ``"john.doe.pdf"`` yields
    ``"john"`` and a name with a leading dot yields an empty string.
    """
    base_name = os.path.basename(path)
    # partition() splits at the first '.', and the head is what we keep.
    stem, _, _ = base_name.partition('.')
    return stem


## Get file extension

In [13]:
def get_ext(path):
    """Return the extension of ``path``'s base name, including the leading dot.

    Only the last suffix is returned (``"a.tar.gz"`` -> ``".gz"``); a name
    without an extension yields an empty string.
    """
    file_name = os.path.basename(path)
    _, extension = os.path.splitext(file_name)
    return extension

## Extract emails

In [14]:
def extract_emails(resume_text):
    """Return every e-mail-like string found in ``resume_text``.

    Matching is case-insensitive, so addresses containing uppercase letters
    (``John.Doe@Example.COM``) are returned whole rather than being skipped
    or truncated.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of matched addresses in order of appearance (possibly empty).
    """
    if not resume_text:
        return []

    # IGNORECASE widens the lowercase-only character classes to both cases.
    EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)
    return EMAIL_REG.findall(resume_text)

## Extract skills


In [15]:
def extract_skills(input_text):
    """Find skills from a small built-in list inside ``input_text``.

    The text is tokenized with NLTK, stop words and non-alphabetic tokens are
    dropped, and every remaining token, bigram and trigram is looked up
    case-insensitively in the skills list.

    Presumably needs the NLTK ``stopwords`` and ``punkt`` resources to be
    downloaded first (see the commented-out download cell) -- TODO confirm.

    Args:
        input_text: Text to scan for skills.

    Returns:
        A set of the matched tokens/n-grams, spelled as they appear in the text.
    """
    # The notebook's module-level `import nltk` is commented out, so import it
    # here to keep this function usable on its own.
    import nltk

    # All entries are lowercase because candidates are lowercased before the
    # lookup; a mixed-case entry such as 'English' could never match.
    SKILLS_DB = {
        'machine learning',
        'data science',
        'python',
        'word',
        'excel',
        'english',
    }

    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)

    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w not in stop_words]

    # remove the punctuation -- filter the already-filtered list so the
    # stop-word removal above is not thrown away
    filtered_tokens = [w for w in filtered_tokens if w.isalpha()]

    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))

    # a set keeps each matched spelling only once
    found_skills = set()

    # single tokens first, then the multi-word candidates
    for candidate in filtered_tokens + bigrams_trigrams:
        if candidate.lower() in SKILLS_DB:
            found_skills.add(candidate)

    return found_skills


## Extract skills v2.0

In [16]:
## Limited use, requires creating account after limit
def extract_skill_list(text):
    """Extract skill names from ``text`` using the Emsi skills extraction API.

    First requests an OAuth client-credentials token from
    ``auth.emsicloud.com``, then posts ``text`` to the ``extract`` endpoint
    with a confidence threshold of 0.6.

    Credentials are read from the ``EMSI_CLIENT_ID`` / ``EMSI_CLIENT_SECRET``
    environment variables when set.

    Args:
        text: Text to analyse.

    Returns:
        A list of skill names on success. If the token request returns no
        ``access_token``, or the extraction response does not have the
        expected ``data`` layout, a hint is printed and the decoded JSON of
        the failing response is returned instead.
    """
    # change the id and secret when exceed limit
    # NOTE(review): the literal fallbacks are the credentials that were
    # previously embedded here; they should be rotated and the fallbacks
    # removed so that no secret lives in the notebook.
    CLIENT_ID = os.environ.get("EMSI_CLIENT_ID", "zmjv32jntsjkv9tg")
    CLIENT_SECRET = os.environ.get("EMSI_CLIENT_SECRET", "rh9xc2Za")

    # Give up on a hung connection instead of blocking the notebook forever.
    timeout_seconds = 30

    url = "https://auth.emsicloud.com/connect/token"
    # Passing a dict lets requests form-encode the values properly.
    payload = {
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "grant_type": "client_credentials",
        "scope": "emsi_open",
    }
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    res = requests.post(url, data=payload, headers=headers, timeout=timeout_seconds).json()

    token = res.get("access_token") if isinstance(res, dict) else None
    if not token:
        # Authentication failed (bad or exhausted credentials).
        print("Please create a new account and get the id and secret")
        return res

    url = "https://emsiservices.com/skills/versions/latest/extract"

    data = {"text": text,
            "confidenceThreshold": 0.6}
    headers = {
        'Authorization': f"Bearer {token}",
        'Content-Type': "application/json"
        }

    response = requests.post(url, data=json.dumps(data), headers=headers, timeout=timeout_seconds)

    skill_json = response.json()
    try:
        return [entry["skill"]["name"] for entry in skill_json["data"]]
    except (KeyError, TypeError):
        # The response did not contain the expected data/skill/name layout.
        print("Please create a new account and get the id and secret")
        return skill_json

## Display result

In [17]:
def show_result(text, path=None):
    """Print the name, e-mail, phone number and skills found for a resume.

    Args:
        text: Extracted resume text. If it is ``None``, empty or only
            whitespace, a notice is printed and nothing else is extracted.
        path: Optional path of the source file; when given, the name derived
            from it by ``get_name`` is printed first.

    Returns:
        None. All results are printed.
    """
    print("\n")
    if path is not None:
        name = get_name(path)
        print(f"Name : {name}\n")

    # Without text the extractors below would fail or waste an API call.
    if not text or not text.strip():
        print("No text could be extracted")
        return

    skills = extract_skill_list(text)
    phone_number = extract_phone_number(text)
    emails = extract_emails(text)
    # A resume may contain no e-mail address at all; show None in that case
    # instead of raising IndexError.
    first_email = emails[0] if emails else None

    print(f"Email : {first_email} \n")
    print(f"Phone number : {phone_number}\n")
    print(f"skills : {skills}")

## Auto-detect file type, extract text, and display results

In [18]:
def auto_extract_text(path):
    """Pick an extractor from the file extension and print the parsed results.

    ``.doc``/``.docx`` go to ``extract_text_from_docx``, ``.jpg``/``.jpeg``/
    ``.png`` to ``extract_text_from_image`` and ``.pdf`` to
    ``extract_text_from_pdf``. The extracted text is handed to
    ``show_result``.

    Args:
        path: Path of the resume file.

    Returns:
        None. A message is printed and nothing is extracted when the path
        does not exist or the extension is not supported.
    """
    if not os.path.exists(path):
        print("File do not exist")
        return

    # Lowercase so that ".PDF" or ".JPG" are recognised as well.
    ext = get_ext(path).lower()
    # NOTE(review): '.doc' is routed to the docx extractor -- confirm that
    # docx2txt can actually read legacy .doc files.
    doc_ext_list = ['.doc', '.docx']
    image_ext_list = ['.jpg', '.jpeg', '.png']

    if ext in doc_ext_list:
        print("File type : Document")
        text = extract_text_from_docx(path)
    elif ext in image_ext_list:
        print("File type : Image")
        text = extract_text_from_image(path)
    elif ext == ".pdf":
        print("File type : PDF")
        print("**Note : Some data might me missing while extracting from using PDF, please review the output")
        text = extract_text_from_pdf(path)
    else:
        # Do not run the extraction pipeline on a placeholder string.
        print(f"Unsupported file type : {ext}")
        return

    show_result(text, path)


In [115]:
# Demo: run the pipeline on a sample Word resume (relative path).
auto_extract_text("Input/Resume/doc/resume.docx")

File type : Document

Name : resume

Email : maria123@gmail.com 

Phone number : (555)-555-5555

skills : ['Software Design', 'Relational Databases', 'Object-Oriented Programming (OOP)', 'MySQL', 'Data Exchange', 'Bootstrap (Front-End Framework)', 'C++ (Programming Language)', 'JavaScript (Programming Language)', 'Development Management', 'Database Management Systems', 'Application Programming Interface (API)', 'Pandas (Python Package)', 'Microsoft Visual Studio', 'Shell Script', 'Scikit-learn (Machine Learning Library)', 'Extensible Markup Language (XML)', 'PostgreSQL', 'Linux', 'Amazon Web Services', 'Stored Procedure', 'Algorithms', 'SQLAlchemy', 'PHP (Scripting Language)', 'HTML5', 'Python (Programming Language)', 'SQL (Programming Language)', 'Model View Controller', 'Seaborn', 'Ajax (Programming Language)', 'Web Servers', 'Django (Web Framework)', 'Eclipse (Software)', 'Extensible HyperText Markup Language (XHTML)', 'Unit Testing', 'SQLite', 'Angular (Web Framework)', 'MongoDB', 

In [102]:
# Demo: run the pipeline on a sample resume image (relative path).
auto_extract_text("Input/Resume/image/resume2.png")

File type : Image

Name : resume2

Email : youremail@gmail.com 

Phone number : 895.555.5555

skills : ['MySQL', 'Application Programming Interface (API)', 'LAMP (Software Bundle)', 'Network Planning And Design', 'Linux Servers', 'Docker (Software)', 'Linux', 'PHP (Scripting Language)', 'Python (Programming Language)', 'RESTful API', 'Full Stack Software Engineering', 'Web Services', 'Kohana', 'Systems Architecture', 'C# (Programming Language)', 'SQL (Programming Language)', 'Laravel', 'Hyper-V', 'Cascading Style Sheets (CSS)', 'JQuery', 'Software Engineering', 'JavaScript (Programming Language)', 'Management', 'Back End (Software Engineering)', 'Web Applications', 'Microsoft Access', 'Leadership', 'Web Development']


In [19]:
# Demo: run the pipeline on a second sample Word resume (relative path).
auto_extract_text("Input/Resume/doc/Software-Engineer-Resume.docx")

File type : Document


Name : Software-Engineer-Resume

Email : youremail@gmail.com 

Phone number : 895-555-5555

skills : ['MySQL', 'Bash (Scripting Language)', 'Application Programming Interface (API)', 'Vue.js', 'LAMP (Software Bundle)', 'Network Planning And Design', 'Linux Servers', 'Docker (Software)', 'Linux', 'Cascading Style Sheets (CSS)', 'PHP (Scripting Language)', 'Python (Programming Language)', 'RESTful API', 'Full Stack Software Engineering', 'Windows Servers', 'React.js', 'Kohana', 'Systems Architecture', 'C# (Programming Language)', 'Elasticsearch', 'Node.js', 'SQL (Programming Language)', 'Consulting', 'Laravel', 'Hyper-V', 'HTML5', 'Amazon Web Services', 'JQuery', 'MariaDB', 'Software Engineering', 'JavaScript (Programming Language)', 'SAS (Software)', 'Management', 'Back End (Software Engineering)', 'Microsoft Access', 'Web Applications', 'R (Programming Language)', 'Leadership', 'Web Development']
