### References

https://www.bluetickconsultants.com/resume-parsing-using-nlp.html


https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2


https://blog.apilayer.com/build-your-own-resume-parser-using-python-and-nlp/ 

https://youtu.be/4ssigWmExak

## Mounting Drive

In [None]:
# Mount Google Drive at /content/drive. This import only exists inside
# Google Colab, so this cell cannot run in a plain Python environment.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on the mounted Drive; the resume folder path used by the
# driver cell is built by appending 'resume/' to it.
DRIVE_PATH = '/content/drive/MyDrive/machine-learning-playground/resume-scanner/'

## Utilities

### flatten list

In [None]:
def flatten(l):
    """Collapse one level of nesting into a single flat list.

    Args:
        l: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every inner item in its original order. Nesting
        deeper than one level is left untouched.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

## Reading Resume


### Installing pdfminer, nltk and numpy


In [None]:
!pip install pdfminer.six
!pip install nltk
!pip install numpy 

In [None]:
import nltk

# Download the NLTK data packages the extractors in this notebook rely on:
# tokenizer models, the part-of-speech tagger, the named-entity chunker and
# the word list the chunker uses.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_names(txt):
    """Return every PERSON entity NLTK recognises in ``txt``.

    The text is split into sentences, each sentence is tokenised and
    POS-tagged, and the tagged tokens are run through NLTK's named-entity
    chunker.

    Args:
        txt: Raw text (for example the text extracted from a resume).

    Returns:
        A list of names in order of appearance; multi-word names are joined
        with single spaces. Duplicates are kept.
    """
    found = []

    for sentence in nltk.sent_tokenize(txt):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged):
            # Plain (word, tag) tuples have no label(); only entity subtrees do.
            if not hasattr(node, 'label'):
                continue
            if node.label() != 'PERSON':
                continue
            words = [leaf[0] for leaf in node.leaves()]
            found.append(' '.join(words))

    return found

### Scanning the pdf

In [None]:
# example_01.py
 
from pdfminer.high_level import extract_text
 
 
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer extracts from the PDF at ``pdf_path``.

    Thin wrapper around ``pdfminer.high_level.extract_text`` so the rest of
    the notebook does not depend on pdfminer directly.
    """
    text = extract_text(pdf_path)
    return text

### Extracting phone numbers from resumes

In [None]:
import re

# Optional leading '+' or '(', a non-zero digit, at least eight digits or
# separators (space, dot, dash, parentheses), then a closing digit.
PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')

# Matches of this length or longer are treated as non-phone digit runs
# (IDs, concatenated dates, ...) and skipped.
_PHONE_LENGTH_LIMIT = 16


def extract_phone_number(resume_text):
    """Return the first plausible phone number found in ``resume_text``.

    Every regex match is considered in order of appearance, so an over-long
    digit run earlier in the text no longer hides a valid number that
    appears later.

    Args:
        resume_text: Raw resume text. Empty or ``None`` yields ``None``.

    Returns:
        The matched substring exactly as it appears in the text (separators
        included), or ``None`` when no match shorter than 16 characters
        exists.
    """
    if not resume_text:
        return None

    for candidate in PHONE_REG.findall(resume_text):
        if len(candidate) < _PHONE_LENGTH_LIMIT:
            return candidate
    return None

### Extracting email addresses from resumes

In [None]:
import re

# The character classes are written in lower case, so the pattern is compiled
# case-insensitively; otherwise addresses such as "John.Doe@Example.com" are
# missed or truncated.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)


def extract_emails(resume_text):
    """Return all email addresses found in ``resume_text``.

    Args:
        resume_text: Raw resume text.

    Returns:
        A list of addresses in order of appearance, with their original
        letter case preserved. Empty when nothing matches.
    """
    return EMAIL_REG.findall(resume_text)

### Extracting Name

In [None]:
def extract_names(txt):
    """Collect the PERSON entities NLTK finds in ``txt``.

    This cell rebinds ``extract_names`` with the same logic as the identical
    definition earlier in the notebook.

    Args:
        txt: Raw text to scan.

    Returns:
        A list of person names in order of appearance, each one being the
        entity's words joined by single spaces.
    """
    names = []

    for sent in nltk.sent_tokenize(txt):
        tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))
        # Only entity subtrees expose label(); plain tagged tokens are tuples.
        names.extend(
            ' '.join(leaf[0] for leaf in subtree.leaves())
            for subtree in tree
            if hasattr(subtree, 'label') and subtree.label() == 'PERSON'
        )

    return names

### Extracting skills from the resumes

In [None]:
import nltk

# English stop-word list used by extract_skills.
nltk.download('stopwords')

# Known skill keywords, all lower-case: extract_skills lower-cases each token
# and n-gram before looking it up in this list.
# TODO: read from csv
SKILLS_DB = [
    'php',
    'wordpress',
    'html',
    'javascript',
    'laravel',
    'css',
    'js',
    'symfony',
    'git',
    'yii',
    'aws',
    'kafka',
    'cakephp',
    'react',
    'vue',
    'node',
    'npm',
    'composer',
    'codeigniter',
    'drupal',
    'magento'
]

def extract_skills(input_text):
    """Return the skills from ``SKILLS_DB`` that are mentioned in ``input_text``.

    The text is tokenised, stop words and non-alphabetic tokens are dropped,
    and every remaining token, bigram and trigram is looked up
    case-insensitively in ``SKILLS_DB``.

    Args:
        input_text: Raw resume text.

    Returns:
        A set of matched tokens / n-grams, spelled exactly as they appear in
        the text (so both "PHP" and "php" can be present).
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Set lookup instead of scanning the list once per token.
    known_skills = set(SKILLS_DB)

    # Apply both filters to the same token stream. The stop-word test is
    # case-insensitive because NLTK's stop-word list is lower-case.
    filtered_tokens = [
        w for w in nltk.tokenize.word_tokenize(input_text)
        if w.isalpha() and w.lower() not in stop_words
    ]

    # Bigrams and trigrams cover multi-word skills
    # (such as "artificial intelligence").
    ngrams = [' '.join(gram) for gram in nltk.everygrams(filtered_tokens, 2, 3)]

    return {
        candidate
        for candidate in filtered_tokens + ngrams
        if candidate.lower() in known_skills
    }

### Extracting education and schools from resumes

In [None]:
import nltk
 
# NLTK data packages for tokenising, POS tagging and named-entity chunking
# (repeated here so this cell can be run on its own).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Lower-case fragments that mark an organization name as education-related.
# extract_education matches them as substrings, so a stem such as 'univers'
# also hits "University".
RESERVED_WORDS_EDU = [
    'school',
    'college',
    'science',
    'engineering',
    'bachelor',
    'master',
    'univers',
    'academy',
    'faculty',
    'institute',
]

def extract_education(input_text):
    """Return the education-related organizations mentioned in ``input_text``.

    NLTK's named-entity chunker supplies every ORGANIZATION entity; an entity
    is kept when its lower-cased name contains any fragment from
    ``RESERVED_WORDS_EDU``.

    Args:
        input_text: Raw resume text.

    Returns:
        A set of organization names, spelled as they appear in the text.
    """
    org_names = []

    # Step 1: collect all organization entities sentence by sentence.
    for sentence in nltk.sent_tokenize(input_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged):
            if hasattr(node, 'label') and node.label() == 'ORGANIZATION':
                org_names.append(' '.join(leaf[0] for leaf in node.leaves()))

    # Step 2: keep the ones whose name contains an education keyword.
    return {
        name
        for name in org_names
        if any(keyword in name.lower() for keyword in RESERVED_WORDS_EDU)
    }

### Extracting work experience from resumes

In [None]:
import nltk
 
# NLTK data packages for tokenising, POS tagging and named-entity chunking
# (repeated here so this cell can be run on its own).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Lower-case fragments that mark an organization name as an employer.
# extract_workex matches them as substrings, so 'indust' also hits
# "Industries".
RESERVED_WORDS_WORK = [
    'pvt',
    'consultant',
    'solutions',
    'limited',
    'technologies',
    'indust'

]

def extract_workex(input_text):
    """Return the employer-like organizations mentioned in ``input_text``.

    Every ORGANIZATION entity found by NLTK's named-entity chunker is kept
    when its lower-cased name contains any fragment from
    ``RESERVED_WORDS_WORK`` (pvt, solutions, limited, ...).

    Args:
        input_text: Raw resume text.

    Returns:
        A set of organization names, spelled as they appear in the text.
    """
    employers = set()

    for sentence in nltk.sent_tokenize(input_text):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            if not (hasattr(node, 'label') and node.label() == 'ORGANIZATION'):
                continue

            org_name = ' '.join(leaf[0] for leaf in node.leaves())
            lowered = org_name.lower()
            # One matching company keyword is enough to keep the organization.
            for keyword in RESERVED_WORDS_WORK:
                if keyword in lowered:
                    employers.add(org_name)
                    break

    return employers

### Driver Code

In [None]:
import os

# Folder on Drive that holds the resumes; the alphabetically first entry is
# the one that gets scanned.
resumeFolder = DRIVE_PATH+'resume/'
resume_files = sorted(os.listdir(resumeFolder))
if not resume_files:
    # Fail with a clear message instead of a bare IndexError.
    raise FileNotFoundError('No resume files found in ' + resumeFolder)
filePath = resumeFolder+resume_files[0]

if __name__ == '__main__':
    text = extract_text_from_pdf(filePath)
    names = extract_names(text)
    print(names)
    phone_number = extract_phone_number(text)
    print(phone_number)
    emails = extract_emails(text)
    print(emails)
    resumeSkillsList = extract_skills(text)
    skills = ', '.join(str(e) for e in resumeSkillsList)
    print(skills)
    education_information = extract_education(text)
    print(education_information)
    workex = extract_workex(text)
    print(workex)
    # Any extractor may come back empty (no name / email found, phone is
    # None). Fall back to an empty line so the intro cell always has the
    # same three-line layout instead of raising IndexError / TypeError.
    intro = "\n".join([
        names[0] if names else "",
        emails[0] if emails else "",
        phone_number or "",
    ])

## Detect Keywords

### skills driver method

In [None]:
def detectSkills(skillName, resumeSkillList):
    """Report whether ``skillName`` occurs in any entry of ``resumeSkillList``.

    The comparison is a case-insensitive substring test, so "js" also matches
    an entry such as "NodeJS".

    Args:
        skillName: Keyword to look for.
        resumeSkillList: Iterable of skill strings extracted from a resume.

    Returns:
        The string "Yes" when at least one entry contains the keyword,
        otherwise "No".
    """
    matched = any(
        skillName.lower() in candidate.lower()
        for candidate in resumeSkillList
    )
    return "Yes" if matched else "No"

### Check Skills

In [None]:
# Debug output: show the skills extracted from the resume before scoring them.
print(resumeSkillsList)

# Scoring categories mapped to the keywords that count as evidence for them.
# Keywords are compared against the skills extract_skills found, and those can
# only be entries of SKILLS_DB, so spellings here must agree with that list
# ("symfony", not "symphony").
# NOTE(review): 'express', 'sass', 'scss', 'tailwind' and 'html5' are not in
# SKILLS_DB, so they cannot match until the database is extended.
skillList = {
    "javascript": [
        "javascript",
    ],
    'javascript-framework': [
        'react',
        'vue',
        'node',
        'express',
    ],
    "php": [
        "php",
    ],
    "wordpress": [
        "wordpress",
    ],
    "php-framework": [
        "laravel",
        "yii",
        "symfony",
        "codeigniter",
    ],
    "css": [
        "sass",
        "scss",
        "tailwind",
        "css",
    ],
    "html": [
        "html5",
        "html",
    ],
}

# Per-category verdict: "Yes" when at least one of the category's keywords
# was detected among the resume skills, otherwise "No".
verdict = {}

for skillName, skillKeywords in skillList.items():
    verdict[skillName] = "No"
    for skillKeyword in skillKeywords:
        doesExists = detectSkills(skillKeyword, resumeSkillsList)
        if doesExists == "Yes":
            verdict[skillName] = "Yes"
            # Stop at the first hit: checking further keywords would let a
            # later miss overwrite this "Yes" with "No".
            break
print(verdict)

## Updating value in a csv standard format

### Installing Dependencies

In [None]:
pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

### Adding Sheets API dependencies

In [None]:
from __future__ import print_function

import os.path
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google.oauth2 import service_account

### Creds for sheets API

In [None]:
import string

# Column letters 'A'..'Z', indexed by zero-based column position.
COLUMNS = list(string.ascii_uppercase)
# Row number (as a string) appended to a column letter to form the target cell.
INITIAL_ROW = "1"
# Tab (worksheet) names inside the spreadsheet.
SCORING_TAB_NAME = "automated_scoring"
KEYWORD_TAB_NAME = "automated_keywords"
# OAuth scope requested for the service account.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
# NOTE(review): the key-file path and spreadsheet ID are hard-coded to one
# Drive layout; consider loading them from configuration.
SERVICE_ACCOUNT_FILE = '/content/drive/MyDrive/machine-learning-playground/resume-scanner/resume-scanner-361910-4803b5ebb28d.json'
SPREADSHEET_ID = '1pdt2be42qZI0YmjDlke4FbTDQZt244HFtJV_D-SgolY'

# Build service-account credentials from the JSON key file.
creds = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# Sheets API v4 client; `sheet` is the spreadsheets resource used by
# readSheet and writeDate.
service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()

### Reading values from sheet

In [None]:
def readSheet(spreadsheet_id, range_name):
    """Read a range of cell values from a Google Sheets spreadsheet.

    Args:
        spreadsheet_id: ID of the spreadsheet to read. This argument is now
            honoured; previously the global ``SPREADSHEET_ID`` was always
            used.
        range_name: Range in A1 notation, e.g. ``"automated_scoring!1:1"``.

    Returns:
        The ``values`` list from the API response, or ``None`` when the
        range is empty or the request fails with ``HttpError`` (a message
        is printed in both cases).
    """
    try:
        result = sheet.values().get(spreadsheetId=spreadsheet_id, range=range_name).execute()
    except HttpError as err:
        print(err)
        return None

    values = result.get('values', [])
    if not values:
        print('No data found.')
        return None
    return values

### Write Data


In [None]:
def writeDate(spreadsheet_id, range_name, data):
    """Write ``data`` into a spreadsheet range and return the API response.

    Values are sent with ``valueInputOption="USER_ENTERED"``.

    Args:
        spreadsheet_id: ID of the spreadsheet to update.
        range_name: Target range in A1 notation; a single cell is used as
            the top-left corner of the written block.
        data: List of rows, each row being a list of cell values.

    Returns:
        The response returned by the ``values().update`` request. Previously
        nothing was returned, so callers always received ``None``.
    """
    requestBody = {
        "values": data
    }
    request = sheet.values().update(
        spreadsheetId=spreadsheet_id,
        range=range_name,
        valueInputOption="USER_ENTERED",
        body=requestBody,
    )
    return request.execute()

### Driver Code

- check which column in row 1 is empty
- if no column found add 10 columns
- write data in that column

In [None]:
def _column_label(index):
    """Convert a zero-based column index to its A1 letters (0 -> 'A', 26 -> 'AA')."""
    label = ""
    index += 1
    while index > 0:
        index, remainder = divmod(index - 1, len(COLUMNS))
        label = COLUMNS[remainder] + label
    return label


# readSheet returns None when row 1 is empty (and when the request fails);
# treat that as "no columns used yet" instead of crashing inside flatten.
row1 = flatten(readSheet(SPREADSHEET_ID, SCORING_TAB_NAME + "!1:1") or [])
# The first free column is the one right after the last value in row 1.
# Using _column_label instead of COLUMNS[...] avoids an IndexError past 'Z'.
# NOTE(review): the sheet itself may still need more columns added before a
# write beyond its current grid succeeds.
starting_cell = _column_label(len(row1)) + INITIAL_ROW
print(starting_cell)
# One inner list per row: the intro goes into the starting cell and the
# remaining values are written downwards in the same column.
# NOTE(review): the 'Yes' entries are hard-coded placeholders; confirm
# whether the computed verdict values should be written instead.
data = [[intro], ['Yes'], ['Yes'], ['Yes'], ['Yes']]
write_range = SCORING_TAB_NAME + "!" + starting_cell
print(write_range)
values = writeDate(SPREADSHEET_ID, write_range, data)
print(values)