# 1. Extract Address
- Step 1: Using regex to extract address info in the resume
- Step 2: Parsing the address components to a dictionary for look up, using usaddress package

In [2]:
import re
import usaddress

#Extract the address from resume file
# Street number, free text, a state-like token, then a 5-digit ZIP with optional +4
_ADDRESS_PATTERN = re.compile(r"[0-9]+ .*[.,-]? .*[.,-]? ([A-Z]{2}|\w+)[.,-]? [0-9]{5}(-[0-9]{4})?")


def extract_address (text):
    """Return the first address-looking substring of ``text``.

    Newlines are turned into spaces before matching so that an address
    spread over several lines is matched as one string.

    Returns the matched text, or ``None`` when nothing matches.
    """
    flattened = text.replace('\n', ' ')
    found = _ADDRESS_PATTERN.search(flattened)
    return found.group() if found else None

#Parse the components
def parse_address(result):
    """Tag the components of the address string ``result``.

    Returns whatever ``usaddress.tag`` returns for the string; in the sample
    runs of this notebook that is an ``(OrderedDict, address_type)`` pair.
    """
    return usaddress.tag(result)


# ========Testing========

In [3]:

# Sample resume snippets used below to exercise extract_address and parse_address
text = []
text.append('''Hi, Mr. Sam D. Richards lives here, 44 West 22nd Street, New York, NY 12345-4567. 
Can you contact him now? If you need any help, call me on 123 456 7891''')
text.append(''' ABEBAW AYELE
6040 14th St NW Washington DC 20011
202-629-7212 	abex72@gmail.com''')
text.append('''Amanda Yu
    	                       9700 Skyhill Way· Rockville· MD 20850·301-502-8705·yubo0107@hotmail.com''')
text.append('''Miguel Lorenzo M. Aviles
1644 New Windsor Ct
Crofton, Maryland 21114
(703) 501-1932
maviles@umd.edu
''')
text.append('''Alexander Berger
3711 Campus Drive
College Park, MD 20742
240-338-2206
alexfberger@gmail.com
Objective
I am seeking an entry-level position where I can use my design and software skills to provide better and more intuitive application design for customers and for my team.
''')

In [4]:
#Extracting Address
# One entry per sample; an entry is None when no address was found
address = [extract_address(sample) for sample in text]
for extracted in address:
    print(extracted)

44 West 22nd Street, New York, NY 12345-4567
6040 14th St NW Washington DC 20011
9700 Skyhill Way· Rockville· MD 20850
1644 New Windsor Ct Crofton, Maryland 21114
3711 Campus Drive College Park, MD 20742


In [5]:
#Parse the component
# Parse every extracted address exactly once and reuse the result for both
# the stored list and the printed line (the address was previously parsed twice)
address_components = []
for i, extracted in enumerate(address):
    parsed = parse_address(extracted)
    address_components.append(parsed)
    print("Person {0}: {1}".format(i+1, parsed))

Person 1: (OrderedDict([('AddressNumber', '44'), ('StreetNamePreDirectional', 'West'), ('StreetName', '22nd'), ('StreetNamePostType', 'Street'), ('PlaceName', 'New York'), ('StateName', 'NY'), ('ZipCode', '12345-4567')]), 'Street Address')
Person 2: (OrderedDict([('AddressNumber', '6040'), ('StreetName', '14th'), ('StreetNamePostType', 'St'), ('StreetNamePostDirectional', 'NW'), ('PlaceName', 'Washington'), ('StateName', 'DC'), ('ZipCode', '20011')]), 'Street Address')
Person 3: (OrderedDict([('AddressNumber', '9700'), ('StreetName', 'Skyhill'), ('StreetNamePostType', 'Way·'), ('PlaceName', 'Rockville·'), ('StateName', 'MD'), ('ZipCode', '20850')]), 'Street Address')
Person 4: (OrderedDict([('AddressNumber', '1644'), ('StreetName', 'New Windsor'), ('StreetNamePostType', 'Ct'), ('PlaceName', 'Crofton'), ('StateName', 'Maryland'), ('ZipCode', '21114')]), 'Street Address')
Person 5: (OrderedDict([('AddressNumber', '3711'), ('StreetName', 'Campus'), ('StreetNamePostType', 'Drive'), ('Place

In [6]:
#Print the result
for i, parsed in enumerate(address_components):
    # parsed[0] is the mapping of component label -> text
    components = parsed[0]

    # Reset for every person so that a missing component prints as blank
    # instead of silently reusing the value found for the previous person
    Placename = ''
    State = ''
    ZipCode = ''
    street_parts = []
    for label, value in components.items():
        if label == 'PlaceName':
            Placename = value
        elif label == 'StateName':
            State = value
        elif label == 'ZipCode':
            ZipCode = value
        else:
            # Everything else (number, street name, type, directionals, ...)
            # belongs to the street line
            street_parts.append(value)

    # Each street part is followed by one space, as in the printed samples
    Address = ''.join(part + ' ' for part in street_parts)

    print('\nPerson {}'.format(i+1))
    print("Address: {}".format(Address))
    print("City: {}".format(Placename))
    print("State: {}".format(State))
    print("ZipCode: {}".format(ZipCode))


Person 1
Address: 44 West 22nd Street 
City: New York
State: NY
ZipCode: 12345-4567

Person 2
Address: 6040 14th St NW 
City: Washington
State: DC
ZipCode: 20011

Person 3
Address: 9700 Skyhill Way· 
City: Rockville·
State: MD
ZipCode: 20850

Person 4
Address: 1644 New Windsor Ct 
City: Crofton
State: Maryland
ZipCode: 21114

Person 5
Address: 3711 Campus Drive 
City: College Park
State: MD
ZipCode: 20742


# 2. Extract Skills 

Combined both supervised and unsupervised learning to train the parsing algorithm. 
- Unsupervised: Word2Vec, KMeans (built in the above sections)
- Supervised: NaiveBayes 

To implement supervised learning, we need a large reference list of skills so that the words in the resume documents can be labeled for training. First, we create a list of all possible skills:

In [None]:
import nltk
import pandas as pd
import os
import codecs

# Load the reference list of skill names from the 'Skill Names' column
data = pd.read_excel("Skills.xlsx", header=0)
# Lower-case BEFORE de-duplicating so that entries differing only in case
# collapse into one; non-string cells are skipped because they have no .lower()
skill_list = list({skill.lower() for skill in data['Skill Names'] if isinstance(skill, str)})
sorted(skill_list)

Step 2: We import the resume document and preprocess the text by removing all stopwords and special characters and converting everything to lower case.

In [None]:
import docx2txt
# Text file with the training resumes, located in the 'Trained Resumes' folder
filename ='all_text1.txt'
trained_resume_path = os.path.join('Trained Resumes', filename)

In [None]:
#resume_text = docx2txt.process(test_resume_path)
# Read the whole training text; the with-block closes the file handle afterwards
with open(trained_resume_path, 'r', encoding='utf_8') as resume_file:
    resume_text = resume_file.read()

In [None]:
from nltk.corpus import stopwords

# Punctuation and bullet tokens dropped from the tokenized resume text
special_characters = ['!','#', '$', '%','&','*','-', '/', '=','?',
                      '^','.','_','`', '{', '|', '}','~', "'", ',', '(',')', ':', '•', '§' ]

In [None]:
# Processing text 

def resume_processing (resume_text):
    """Tokenize ``resume_text`` into cleaned, lower-cased sentences.

    The text is split into sentences and each sentence into word tokens.
    Tokens are lower-cased, and English stopwords as well as the tokens
    listed in ``special_characters`` are dropped.

    Returns a list of sentences, each a list of the remaining tokens.
    """
    # Build the exclusion set once instead of calling stopwords.words()
    # for every single token
    excluded = set(stopwords.words('english'))
    excluded.update(special_characters)

    processed_resume = []
    #tokenize sentences, then words
    for sent in nltk.sent_tokenize(resume_text):
        lowered = (w.lower() for w in nltk.word_tokenize(sent))
        #remove stopwords and special characters
        processed_resume.append([w for w in lowered if w not in excluded])

    return processed_resume

Step 3: We apply bigram and trigram model for all text

In [None]:
# Cleaned, tokenized sentences of the training resumes (single-word tokens)
unigram_resume = resume_processing(resume_text)
unigram_resume

In [None]:
from gensim.models import Phrases

#Create bigram model
# File the trained model is saved to; create_bigram loads it from this path
bigram_model_path = 'bigram_model'

# Train the phrase model on the unigram sentences and persist it
bigram_model = Phrases(unigram_resume)
bigram_model.save(bigram_model_path)

In [None]:
# Create bigram words
def create_bigram (unigram_resume):
    """Apply the saved bigram phrase model to every sentence.

    The model is loaded from ``bigram_model_path`` on each call.
    Returns a list with one transformed sentence per input sentence.
    """
    phraser = Phrases.load(bigram_model_path)
    return [phraser[tokens] for tokens in unigram_resume]

In [None]:
# Training sentences after the bigram model has been applied
bigram_resume = create_bigram(unigram_resume)

In [None]:
#Create trigram model 
# File the trained model is saved to; create_trigram loads it from this path
trigram_model_path = 'trigram_model'

# Train a second phrase model on the bigram output and persist it
trigram_model = Phrases(bigram_resume)
trigram_model.save(trigram_model_path)

In [None]:
# Create trigram words
def create_trigram (bigram_resume):
    """Apply the saved trigram phrase model to every sentence.

    The model is loaded from ``trigram_model_path`` on each call.
    Returns a list with one transformed sentence per input sentence.
    """
    phraser = Phrases.load(trigram_model_path)
    return [phraser[tokens] for tokens in bigram_resume]

In [None]:
# Training sentences after the trigram model has been applied
trigram_resume = create_trigram(bigram_resume)
trigram_resume

For comparing the string, we need to normalize the text by removing the "_" in bigram and trigram words

In [None]:
import re

#Normalize bigram/trigram words 
def normalize_words (trigram_resume):
    """Replace the underscores of joined phrases with spaces, in place.

    A token is rewritten only when it contains an underscore with at least
    one word character on each side; every underscore of such a token is
    then turned into a space.

    The sentences inside ``trigram_resume`` are modified in place and the
    same outer list is returned.
    """
    joined_phrase = re.compile(r'\w+\_\w+')
    for tokens in trigram_resume:
        for position, token in enumerate(tokens):
            if joined_phrase.search(token) is not None:
                tokens[position] = token.replace('_', ' ')
    return trigram_resume

In [None]:
# normalize_words edits the sentences in place, so this is the same list object as trigram_resume
normalized_resume = normalize_words(trigram_resume)

For supervised learning, we need training data in which the words are labeled. Step 4: we label every word in the resume text as 'skill' or 'not skill' by comparing it with the skill list.

In [None]:
#label skills in the resume
def labeled_word (sentence):
    """Pair every word of ``sentence`` with a 'skill' / 'not skill' label.

    A word is labeled 'skill' when it is found in the module-level
    ``skill_list``. Returns a list of ``(word, label)`` tuples in the
    original word order.
    """
    return [(word, 'skill' if word in skill_list else 'not skill') for word in sentence]

In [None]:
# One list of (word, label) pairs per normalized sentence
labeled_words=[labeled_word(sentence) for sentence in normalized_resume]
labeled_words

Step 5: Extract features to fit in the algorithm
- The first feature checks if the word is in the skill list
- The second feature is the fraction of the top 25 similar words generated by the Word2Vec model that are in the skill list
- The third feature checks if the word is in the skill cluster generated by KMeans
- The fourth and fifth features check if the previous word and the next word are in the skill list

In [None]:

def similar_prob(word, topn=25):
    """Return the share of the ``topn`` related terms of ``word`` that are skills.

    The related terms come from ``get_related_terms(word, topn)`` and each
    one is looked up in ``skill_series``; both names are defined outside
    this section of the notebook.

    ``topn`` defaults to 25, the value that was previously hard-coded in
    both the lookup and the division. The count is always divided by
    ``topn``, not by the number of terms actually returned.
    """
    terms = get_related_terms(word, topn)
    hits = sum(1 for term in terms if skill_series.isin([term]).any())
    return hits/topn

In [None]:
def in_skill_cluster(word):
    """Return True when ``word`` is contained in the module-level ``skills``, else False."""
    return word in skills

In [None]:
#extract features of skills
def extract_features (sentence, i):
    """Build the feature dict for the word at position ``i`` of ``sentence``.

    Features, in insertion order:
    - "(<word>)in_skill_list": whether the word is in ``skill_list``
      (the key contains the word itself)
    - "probality_of_similar_words_skills" and "in_skill_cluster": added only
      when the word is in the ``res2vec`` vocabulary
    - "prev_word_in_skill_list" / "next_word_in_skill_list": skill-list
      membership of the neighbouring words, with '<Start>' / '<End>' markers
      at the sentence boundaries and False for both in a one-word sentence
    """
    word = sentence[i]
    last = len(sentence) - 1

    #first feature: evaluate if that word is in skill list
    features = {"({})in_skill_list".format(word): word in skill_list}

    #similarity and cluster features need the word to be known to the model
    if word in res2vec.wv.vocab:
        features["probality_of_similar_words_skills"] = similar_prob(word)
        features["in_skill_cluster"] = in_skill_cluster(word)

    if i == 0:
        if last != 0:
            #word at the beginning of the sentence: <Start> for prev_word
            prev_flag = '<Start>'
            next_flag = sentence[i+1] in skill_list
        else:
            #the sentence has only 1 word: False for both neighbours
            prev_flag = False
            next_flag = False
    elif i == last:
        #word at the end of the sentence: <End> for next_word
        prev_flag = sentence[i-1] in skill_list
        next_flag = '<End>'
    else:
        prev_flag = sentence[i-1] in skill_list
        next_flag = sentence[i+1] in skill_list

    features["prev_word_in_skill_list"] = prev_flag
    features["next_word_in_skill_list"] = next_flag
    return features

All features are then stored in feature set and split into training data and test data

In [None]:
%%time
# Build one (features, label) pair for every labeled word
featuresets = []
for labeled_sent in labeled_words:
    # extract_features needs the plain words of the sentence, without labels
    unlabeled_sent = [token for token, _ in labeled_sent]
    featuresets.extend(
        (extract_features(unlabeled_sent, position), label)
        for position, (_, label) in enumerate(labeled_sent)
    )

In [None]:
#Save the features in a file
featuresets_file = 'features_file.txt'
# The with-block closes the file, which guarantees the text is flushed to disk
with open(featuresets_file, 'w', encoding='utf_8') as file:
    # One "<features> <label>" line per feature set
    file.write('\n'.join('%s %s' % item for item in featuresets))

In [None]:
# The first 10% of the feature sets are held out for testing, the rest is used for training
size = int(len(featuresets)*0.1)
test_set, train_set = featuresets[:size], featuresets[size:]

In [None]:
# Display the training pairs for inspection
train_set

Fit the training data to train the NaiveBayes algorithm

In [None]:
# Train NLTK's Naive Bayes classifier on the (features, label) pairs
classifier = nltk.NaiveBayesClassifier.train(train_set)

Evaluate the accuracy of the model based on labeled test data

In [None]:
# Accuracy of the trained classifier on the held-out pairs in test_set
nltk.classify.accuracy(classifier, test_set)

Show the most informative features for deciding whether a word is a skill or not. As seen from the result, a word that has more similar words in the skill list has a good chance of being a skill in the resume.

In [None]:
# Print the 15 most informative features of the trained classifier
classifier.show_most_informative_features(15)

# =========Testing=========

In [None]:
# File names of the resumes to run the skill extractor on; they are read from the 'Test Resumes' folder
test_file =['sampleMechanical Engineering Resume.txt', 'desktop support engineer resume.txt','Henrydao -Resume.txt',
            'Electrical Engineering Student Resume.txt','Technical Consultant Resume.txt','Technical Manager Resume.txt',
           'Technical Support Resume.txt', 'Technical Writer Resume.txt', 'Yiyang (Eric) Zhou Resume 2017 Spring.txt']

def extract_skills(normalized_test_res, resume_number, filename):
    """Print the distinct words of a resume that the classifier labels 'skill'.

    normalized_test_res: list of sentences (lists of words) of one resume.
    resume_number: zero-based index of the resume; printed as index + 1.
    filename: name shown next to the resume number.

    Nothing is returned; the result is printed.
    """
    # Start from an empty set so that a resume without any detected skill
    # prints "0 skills" instead of failing on an unassigned variable
    extracted_skills = set()
    for sent in normalized_test_res:
        for i, word in enumerate(sent):
            if classifier.classify(extract_features(sent, i))=='skill':
                extracted_skills.add(word)
    print('\nResume {}:{} ({} skills)\n'.format(resume_number+1,filename, len(extracted_skills)), extracted_skills)
    
# Run every test resume through the same preprocessing as the training text,
# then print the skills the classifier finds in it
for i, filename in enumerate(test_file):
    test_resume_path= os.path.join('Test Resumes', filename)

    # The with-block closes each file once it has been read
    with open(test_resume_path, 'r') as resume_file:
        test_resume = resume_file.read()
    unigram_test_res = resume_processing(test_resume)
    bigram_test_res = create_bigram(unigram_test_res)
    trigram_test_res = create_trigram(bigram_test_res)
    normalized_test_res = normalize_words(trigram_test_res)
    extract_skills(normalized_test_res, i, filename)

# 3. Extract Companies
We will mostly use the text segmentation to strip out the block of text that contains work experience information based on the section headers.

In [None]:
import codecs
import os

filename = 'BrandonThomasResume.txt'
#Open file
def open_file(filename):
    """Return the full text of the file ``filename``.

    Characters that cannot be decoded are skipped (``errors='ignore'``).
    The file is closed before the function returns.
    """
    with open(filename, 'r', errors='ignore') as resume_file:
        return resume_file.read()

In [None]:
# Load the resume text and show it for inspection
resume = open_file(filename)
print(resume)

We create a list of all possible headers for the work experience section to identify the upper bound of the text block and its index.

In [None]:
import pandas as pd

#Import the different variants of work-experience headers (column 'Example')
data = pd.read_excel("Work Experience.xlsx", header=0)
experience_list = list(data['Example'])
experience_list

In [None]:
from fuzzywuzzy.process import dedupe

#Find the experience header
def find_exp_header (resume):
    """Return the work-experience headers that occur in ``resume``.

    Every entry of the module-level ``experience_list`` that is found as a
    substring of the resume is collected; the collected headers are then
    passed through fuzzywuzzy's ``dedupe`` and returned as a list.
    """
    matched_headers = [header for header in experience_list if resume.find(header) != -1]

    #remove duplicates of experience header
    return list(dedupe(matched_headers))

In [None]:
# All experience headers detected in the resume
exp_header = find_exp_header(resume)
exp_header

In [None]:
# Keep only the first detected header, paired with its character offset in the resume.
# Indexing [0] raises IndexError when no header was found.
exp_header = (exp_header[0], resume.find(exp_header[0]))
exp_header

We create a list of all section headers to identify the next section header being the lower bound of the text block and its index

In [None]:
import re
import itertools

#List of all sections in a typical resume
# Matching against this list is exact and case-sensitive, so each header is
# listed in the spellings/casings expected in resumes.
# 'TECHNICAL SKILLS' was previously misspelled and could never match.
section_list =['EDUCATION', 'Education', 'Skills', 'SKILLS', 'VOLUNTEER EXPERIENCE', 'Volunteer Experience',
              'Technical Skills', 'TECHNICAL SKILLS', 'SUMMARY', 'summary', 'Professional Summary', 'PROFESSIONAL SUMMARY',
              'DEMONSTRATED SKILLS', 'Demonstrated Skills', 'Additional Information', 'ADDITIONAL INFORMATION', 
               'Leadership Experience', 'LEADERSHIP EXPERIENCE', 'REFERENCES', 'References', 
               'Certificates & Trainings', 'CERTIFICATE & TRAININGS', 'TRAINING', 'Training', 'Certificate', 'CERTIFICATE', 
               'RELEVANT COURSES', 'LANGUAGES', 'Relevant Courses', 'Languages', 'LEADERSHIP AND VOLUNTEER EXPERIENCE',
               'Leadership and Volunteer Experience', 'LEADERSHIP & VOLUNTEER EXPERIENCE', 'Leadership & Volunteer Experience',
               'EDUCATION AND TRAINING', 'Education and Training', 'Key Projects', 'KEY PROJECTS', 'RELEVANT ACADEMIC PROJECTS', 
               'Relevant Academic Projects', 'ACADEMIC PROJECTS', 'Academic Projects', 'EXTRACURRICULAR ACTIVITIES', 
               'Extracurricular Activities'
              ]

In [None]:
#Find next section header
def find_next_section (resume):
    """Find the section header that ends the work-experience block.

    Only the text after the experience header is searched. The header and
    its offset are read from the module-level ``exp_header`` tuple
    ``(header, offset)``; the known section names come from ``section_list``.

    A candidate counts as a header when it is listed in ``section_list`` and
    sits at a line edge: it is directly followed by a newline (or by the end
    of the resume) or directly preceded by a newline. All-uppercase
    candidates are tried first, then capitalized ones, each in document order.

    Returns ``(header, offset)`` with ``offset`` being the position of that
    occurrence in ``resume``, or an empty tuple when no header is found.
    """
    # Skip the experience header itself plus the character right after it
    start = exp_header[1] + len(exp_header[0]) + 1
    tail = resume[start:]

    patterns = (
        #Runs of up to four all-uppercase words (the first has 3+ letters)
        r'[A-Z]{3,}(?: [A-Z]+)?(?: [A-Z]+)?(?: [A-Z]+)?',
        #Runs of up to four words with the first letter capitalized
        r'[A-Z]{1}\w+(?: [A-Z]{1}\w+)?(?: [A-Z]{1}\w+)?(?: [A-Z]{1}\w+)?',
    )

    for pattern in patterns:
        for match in re.finditer(pattern, tail):
            item = match.group()
            if item not in section_list:
                continue

            # Use the position of THIS occurrence. Searching the whole resume
            # for the first occurrence could return an offset that lies
            # before the experience header.
            begin = start + match.start()
            end = start + match.end()

            # Bounds are checked explicitly so that a header at the very end
            # of the resume does not raise IndexError
            line_ends_after = end >= len(resume) or resume[end] == '\n'
            line_starts_before = begin > 0 and resume[begin - 1] == '\n'
            if line_ends_after or line_starts_before:
                return (item, begin)
    return ()

In [None]:
# (header, offset) of the section after work experience, or () when none was found
next_section = find_next_section(resume)
next_section

In [None]:
def get_workexp_section(resume):
    """Return the text between the experience header and the next section.

    The slice starts one character after the end of the header described by
    the module-level ``exp_header`` and stops at the offset stored in the
    module-level ``next_section``; when ``next_section`` is empty the slice
    runs to the end of the resume.
    """
    begin = exp_header[1] + len(exp_header[0]) + 1
    stop = next_section[1] if next_section else None
    return str(resume[begin:stop])

We strip out that block of text as the work experience info and remove all the details that start with bullet points. What is left is the company information that we need.

In [None]:
# Cut out the work-experience text and split it into lines
workexp_section = get_workexp_section(resume)
workexp_section = workexp_section.split('\n')
workexp_section

In [None]:
#Remove the detail and get the experience information
def get_exp_info(work_exp):
    """Collect the header lines of each job from the work-experience lines.

    work_exp: list of text lines of the work-experience section.

    Lines that start with a bullet marker are treated as job details and are
    dropped. The non-empty lines in front of a run of bullet lines are joined
    into one string (each line followed by a single space) that describes one
    company. Header lines at the end of the section that are not followed by
    any bullet are kept as a final entry.

    Returns the list of company strings; empty strings are never added.
    """
    # Bullet markers are spelled with explicit escapes so that every marker is
    # visible in the source. An empty string must never be listed here:
    # str.startswith('') is true for every line, which would classify all
    # lines as bullets.
    # NOTE(review): '\uf0b7' and '\uf0a7' are private-use bullet code points
    # assumed to stand for characters lost from the original tuple - confirm
    # against the resumes.
    bullets = ('•', '\uf0b7', '\uf0a7', '\uf095', '§')

    company_info = []
    header_lines = []
    for sent in work_exp:
        if sent == '':
            continue
        if sent.startswith(bullets):
            # The first bullet after some header lines closes that company
            if header_lines:
                company_info.append(''.join(line + ' ' for line in header_lines))
                header_lines = []
        else:
            #Everything before the bullet will be put into one sentence, for one company
            header_lines.append(sent)

    # Keep a trailing company that has no bullet points below it
    if header_lines:
        company_info.append(''.join(line + ' ' for line in header_lines))
    return company_info

In [None]:
# Print every company entry with its tabs removed, numbered from 1
company_info = get_exp_info(workexp_section)
for number, entry in enumerate(company_info, start=1):
    print('\nCompany {}:'.format(number), entry.replace('\t', ''))

The last step uses named-entity recognition to parse the components of the company info, including the company name, location, duration and role.

In [None]:
import spacy
from nltk.corpus import stopwords
nlp = spacy.load('en')

def extract_exp_info(company_info, filename):
    """Print company, location, time and role for every company entry.

    company_info: list of strings, one per company.
    filename: printed once as a heading.

    Each entry is parsed with the module-level spaCy pipeline ``nlp`` and its
    tokens are grouped by entity type: ORG -> company, GPE -> location,
    DATE/TIME -> time. Tokens without an entity type go to the role when
    they are purely alphabetic and not an English stopword (the stopword
    comparison is case-sensitive). Nothing is returned.
    """
    # Build the stopword set once instead of once per token
    english_stopwords = set(stopwords.words('english'))

    print(filename)
    for i, sent in enumerate(company_info):
        parsed_sent = nlp(sent.replace('\t', ''))
        print('\nCompany {}'.format(i+1))

        company = ''
        location = ''
        time = ''
        role = ''
        for token in parsed_sent:
            token_text = str(token)
            entity = token.ent_type_
            if entity == 'ORG':
                company += ' ' + token_text
            elif entity == 'GPE':
                location += ' ' + token_text
            elif entity == 'DATE' or entity == 'TIME':
                time += ' ' + token_text
            elif entity == '' and token_text.isalpha() and token_text not in english_stopwords:
                role += ' ' + token_text

        print('Company: {}'.format(company))
        print('Location: {}'.format(location))
        print('Time: {}'.format(time))
        print('Role: {}'.format(role))

In [None]:
# Print the parsed company details for the resume loaded above
extract_exp_info(company_info, filename)