In [1]:
from pdfminer.high_level import extract_text
import docx2txt
import nltk
import re
import subprocess 

ModuleNotFoundError: No module named 'pdfminer'

## Retrieve resume text from PDF

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text content of the PDF file located at ``pdf_path``.

    The work is delegated to ``extract_text``, imported from
    ``pdfminer.high_level`` in the notebook's import cell.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text


# Read the sample resume; `text` is the input passed to the extractors in the
# cells that follow.
text = extract_text_from_pdf('Resume.pdf')
print(text)

About Me

Education

134A University Ave, St. John's, A1B 1Z5, Canada
(709) 986-7643  .  sikamal@mun.ca   . shawon.dev . github.com/shawonibnkamal

.

linkedin.com

Shawon Ibn Kamal

4th-year Computer Science student at Memorial University with
over 2 years of experience as a professional software developer in
Blue Communications Inc.

BSc. Computer Science (Honours, 3.81 CGPA), Memorial University of
Newfoundland, St. John's
January 2018 - April 2022 (Expected)

Employment History

Programmer Analyst at Blue Communications Inc., St. John's
July 2019 — Present

● Design, develop, document, analyze, create, test, and modify computer systems,

programs, and integrations by evaluating client needs.

● Work with Javascript, Laravel, PHP, SQL, ASP.NET to build enterprise applications.
● Manage projects through Git, CI/CD, and automate launching Ubuntu instances

using AWS SDK.

Data Programmer at Department of Psychology & Ecology, Memorial University of
Newfoundland, St. John's
September 2

## Retrieve candidate name

In [4]:
# Every chunk seen by extract_names is accumulated here (across calls) so the
# raw chunker result can be inspected afterwards.
output = []


def extract_names(txt):
    """Return the PERSON entities that NLTK's named-entity chunker finds in ``txt``.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged, and the tagged tokens are chunked. Every chunk is appended to
    the module-level ``output`` list as a side effect.

    Returns:
        A list of names in order of appearance (duplicates kept); each name
        is the entity's tokens joined by single spaces.
    """
    person_names = []

    for sentence in nltk.sent_tokenize(txt):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            output.append(node)
            # Nodes without a `label` attribute are not entities; skip them.
            if not hasattr(node, 'label'):
                continue
            if node.label() == 'PERSON':
                person_names.append(' '.join(leaf[0] for leaf in node.leaves()))

    return person_names


In [5]:
names = extract_names(text)
if names:
    print(names)
# Debugging aid: evaluate `output` here to inspect every chunk collected by extract_names.

['Me', 'John', 'Shawon Ibn Kamal', 'Honours', 'Javascript', 'Laravel', 'Data', 'Ecology', 'Develop', 'Python', 'Full', 'Stack Developer', 'Matlab', 'Share', 'Shawon Notes', 'React', 'Laravel', 'Starcraft', 'Starcraft', 'Broodwar', 'Protoss', 'Zealot Rush', 'Starcraft', 'Broodwar', 'Protoss', 'Zealot Rush', 'Scraper', 'Best Buy', 'Spark Fund', 'Share', 'Python', 'Java', 'Javascript', 'Matlab Frameworks', 'Laravel', 'Express', 'Matplotlib', 'Pandas', 'Docker', 'Linux Server', 'Apache', 'Visual Studio', 'Photoshop']


## Extract phone number

In [7]:
# Optional leading '+' or '(', a non-zero digit, then at least eight digits or
# separator characters (space . - ( )), ending with a digit.
PHONE_REG = re.compile(r'[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]')


def extract_phone_number(resume_text, max_length=16):
    """Return the first phone-number-like substring of ``resume_text``.

    Candidates are the non-overlapping matches of ``PHONE_REG`` in order of
    appearance. Every candidate is checked, not just the first one, so an
    over-long match early in the text (for example a long identifier) no
    longer hides a valid number that follows it.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields
            ``None``.
        max_length: Exclusive upper bound on the length of an accepted match.
            Defaults to 16.

    Returns:
        The matched substring exactly as it appears in the text, or ``None``
        when no candidate is short enough.
    """
    if not resume_text:
        return None

    for match in PHONE_REG.finditer(resume_text):
        candidate = match.group(0)
        if len(candidate) < max_length:
            return candidate
    return None


# Prints None when no suitable number is found in the resume text.
phone_number = extract_phone_number(text)
print(phone_number)

(709) 986-7643


## Extract email

In [34]:
# Matched case-insensitively: the character classes only list lowercase
# letters, but addresses are often written with capitals.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)


def extract_emails(resume_text):
    """Return every e-mail address found in ``resume_text``.

    Args:
        resume_text: Text to search. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of the matched addresses in order of appearance, with their
        original capitalization; empty when nothing matches.
    """
    if not resume_text:
        return []
    return EMAIL_REG.findall(resume_text)

emails = extract_emails(text)
if emails:
    # Only the first address found is shown.
    print(emails[0])  # noqa: T001

sikamal@mun.ca


## Extract school

In [8]:
# Substrings that mark an organization name as an educational institution.
# extract_education searches for them inside the lowercased organization name,
# so a stem such as 'univers' covers several spellings.
RESERVED_WORDS = [
    'school',
    'college',
    'univers',
    'academy',
    'faculty',
    'institute',
    'faculdades',
    'Schola',
    'schule',
    'lise',
    'lyceum',
    'lycee',
    'polytechnic',
    'kolej',
    'ünivers',
    'okul',
]

def extract_education(input_text):
    """Return the organization names in ``input_text`` that look like schools.

    NLTK's named-entity chunker is run sentence by sentence and every chunk
    labelled ORGANIZATION is collected. An organization is kept when its name
    contains any entry of ``RESERVED_WORDS`` as a substring, compared
    case-insensitively.

    Returns:
        A set of organization names with their original capitalization;
        empty when nothing matches.
    """
    organizations = []

    # first get all the organization names using nltk
    for sent in nltk.sent_tokenize(input_text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label') and chunk.label() == 'ORGANIZATION':
                organizations.append(' '.join(c[0] for c in chunk.leaves()))

    # Lowercase both sides of the comparison: a keyword written with a
    # capital letter could otherwise never be found in the lowercased name.
    keywords = [word.lower() for word in RESERVED_WORDS]

    # keep the organizations whose name contains a reserved word
    # (college, university etc...)
    education = set()
    for org in organizations:
        org_lower = org.lower()
        if any(keyword in org_lower for keyword in keywords):
            education.add(org)

    return education




# Show the organization names that were recognized as schools.
education_information = extract_education(text)
print(education_information)  # noqa: T001

{'Memorial University', 'University Ave'}


## Extract job titles and skills

In [11]:
# Phrases of one to three words that extract_skills looks for in the resume text.
# You may read the database from a CSV file or some other database instead.
SKILLS_DB = [
    'programmer analyst',
    'data science',
    'developer',
    'full stack developer',
    'excel',
    'English',
    'data scientist',
    'accountant',
]


def extract_text_from_docx(docx_path):
    """Return the text of the .docx file at ``docx_path`` with tabs turned into spaces.

    The text is obtained from ``docx2txt.process``. ``None`` is returned when
    that call yields nothing (an empty or falsy result).
    """
    raw_text = docx2txt.process(docx_path)
    return raw_text.replace('\t', ' ') if raw_text else None


def extract_skills(input_text):
    """Return the entries of ``SKILLS_DB`` that occur in ``input_text``.

    The text is tokenized with NLTK; English stop words and tokens that are
    not purely alphabetic are dropped. The remaining single tokens, bigrams
    and trigrams are compared case-insensitively with ``SKILLS_DB``.

    N-grams are built after filtering, so two words separated only by a
    removed token are treated as adjacent.

    Returns:
        A set of the matching tokens and n-grams, spelled as they appear in
        the text (the same skill may therefore occur in several casings).
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Lowercase the database once so entries written with capitals (such as
    # 'English') can match the lowercased candidates; a set gives O(1) lookups.
    known_skills = {skill.lower() for skill in SKILLS_DB}

    word_tokens = nltk.tokenize.word_tokenize(input_text)

    # Remove stop words and punctuation in one pass. Applying the two filters
    # one after the other to `word_tokens` would discard the first result.
    filtered_tokens = [
        w for w in word_tokens if w not in stop_words and w.isalpha()
    ]

    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = map(' '.join, nltk.everygrams(filtered_tokens, 2, 3))

    # single tokens first, then the multi-word candidates
    found_skills = {
        token for token in filtered_tokens if token.lower() in known_skills
    }
    found_skills.update(
        ngram for ngram in bigrams_trigrams if ngram.lower() in known_skills
    )

    return found_skills


# Matches keep the resume's own capitalization, so the same skill can appear
# in several casings.
skills = extract_skills(text)
print(skills)  # noqa: T001

{'developer', 'Developer', 'Programmer Analyst', 'Full Stack Developer'}
