In [27]:
import spacy
from spacy.vectors import Vectors
import os
import pandas as pd
import numpy as np
from numpy import empty
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')
vectors = Vectors(shape=(10000, 300))
nlp.vocab.vectors = vectors
print(nlp.vocab.vectors.shape)
import re
re_c = re.compile(r'\w+')
import six

(10000, 300)


In [28]:
# switch for debug
flag_print = True

# switch to clear existing data
flag_clear = True

#threshold value for determining section
threshold = 0.5

In [29]:
# to get extract sections from the resume -- add or remove from  'similar_to' accordingly
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    'exp' : ['job', 'internship', 'training', 'research', 'career', 'profession', 'role'
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

unicode=str

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(unicode(word))
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print(section)
        print(new_list)
    similar_to[section] = new_list

edu
['education', 'study', 'academic', 'institute', 'school', 'college']
exp
['job', 'internship', 'training', 'research', 'career', 'profession', 'roleproject', 'responsibility', 'description', 'work', 'workshop', 'conference']
skill
['skill', 'language', 'technology', 'framework', 'tool', 'database']
extra
['introduction', 'intro', 'achievement', 'hobby', 'link', 'additional', 'personal', 'award', 'objective', 'miscellaneous', 'interest']


In [30]:
# function to remove unnecessary symbols and stopwords 
# function to return the words in a uniform 
def modify(word):
    try:
        symbols = '''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
        mod_word = ''
        
        for char in word:
            if (char not in symbols):
                mod_word += char.lower()

        docx = nlp(mod_word)

        if (len(mod_word) == 0 or docx[0].is_stop):
            return None
        else:
            return docx[0].lemma_
    except:
        return None 
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print(word, '--returned-->', modify(word))

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [31]:
# utility function to skip line when no alphabet present
def is_bad(line):
    for c in line:
        if (c.isalpha()):
            return False
    return True
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    
    for word in test_words:
        print(word, '--returned-->', is_bad(word)) 

. --returned--> True
<.> --returned--> True
Speak --returned--> False
out --returned--> False
"Eric" --returned--> False
freemail... --returned--> False


In [32]:
%%time
dict_of_data_series = {}
flag_print = False

for file_name in os.listdir(os.getcwd()+'/Resumes/CVs'):
    if flag_print:
        print('\n')
        print('*'*25) 
        print(file_name) 
        print('*'*25) 
        
    main_file_handler = open('Resumes/CVs/'+file_name, 'r')  
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0):
            continue
                
        # processing next line
        list_of_words_in_line = re_c.findall(line)
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(unicode(curr_line))
        section_value = {}
            
        # initializing section values to zero
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
        # updating section values    
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[unicode(word)]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        # determining the next section based on section values and threshold
        most_likely_section = None
        for section in list_of_sections:

            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
        # updating the section
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

        # writing data to the pandas series
        try:
            docx = nlp(unicode(line))
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
                                                        #NameError: name 'is_empty' is not defined
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print(curr_data_series)
    main_file_handler.close()
    
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')
#data_frame.head()







CPU times: user 5min 55s, sys: 80.6 ms, total: 5min 55s
Wall time: 5min 55s


In [33]:
data_frame.head()

Unnamed: 0,cv4,cv234,cv52,cv162,cv171,cv226,cv48,cv214,cv170,cv35,...,cv246,cv9,cv133,cv105,cv184,cv142,cv33,cv140,cv240,cv232
edu,Degree Institute / University Specialization Y...,education \n cummin College Engineering Women ...,education \n pass B.A purvanchal univerciti ...,education \n Bsc \n Mumbai university \n,,"education \n MCA \n govt . college Lucknow , U...",education \n,education \n \n pune university \n,"education \n A.C.PATIL COLLEGE Mumbai , Mahara...",education \n govt . Mahila Engg . College Ajme...,...,education \n Computer \n Savitribai Phule Pune...,education : \n Master Computers \n,"education \n B.tech \n Jntu , anantapur \n",,education \n Bachelor Engineering Computer \n ...,education \n Camellia Institute Technology \n ...,education \n M. Tech C.S.E \n Bharati Vidyapee...,• endow passion win evince demonstrate excelle...,education \n Computer \n Savitribai phule Pune...,education \n Maharishi Arvind Institute Engine...
exp,"problem analysis , use judgment ability solve ...",develop career aSoftware Engineer valuable tea...,"Job Profile \n Knowledge Accounts , Sale...",work experience \n fresher \n description ...,work experience \n End Developer \n fashion we...,work experience \n java developer \n work e - ...,work experience \n Software Developer \n revol...,work experience \n java \n MAXGEN - PUN \n Jun...,work experience \n fresher \n • B.E. Project :...,developer 3 + year experience industry Develop...,...,work experience \n Java Developer \n Sourcepep...,"desire Job Location : new york , New Jersey \t...",work experience \n Android Application develop...,• currently work Application developer Atos In...,"work experience \n End , End Developer \n Sept...",• presently work Application Development Analy...,work experience \n Software Developer \n D. G....,• Possess flair work hard contribute achieveme...,work experience \n Java Developer \n Universe ...,Pursue position Skills Knowledge develop Exper...
skill,Time management skill . \n Organization priori...,Bachelor Engineering Information Technology cu...,language \t\t - \t Hindi \ English . \n,,End Developer - End Technologies \n New Delhi ...,• detail orient multitaske professional except...,"Information Technology \n PVPIT , bavdhan pune...","skill \n java , c , c++ ( 1 year )","skill \n C ( 1 year ) , CSS ( 1 year ) , Datab...",".Net developer truworth technology - Jaipur , ...",...,"want learn , Excel skill growth organization ....",desire industry : Information Technology \t Sp...,"Palle Technologies - Bangalore , Karnataka \n ...","• Expertise Java , groovy language related fra...",eager learn implement skill lead firm software...,"• self - motivate , Confident good problem sol...","skill \n c # ( 3 year ) , Asp . net ( 3 year )...","technology : IBM Mobilefirst Platform ios , J2...","• Strong Skills Java SE , Java EE , Hibernate ...","Wipro Technologies - Bangalore , Karnataka \n ..."
extra,"JIGNESH M DUSARA Address : "" pitru - chhaya "" ...",Gayatri Joshi \n Java Developer - AllocateSoft...,RESUME \n rahul YADAV \n VILL- Agehata \n POST...,"Shubhada Jadhav \n Navi Mumbai , Maharashtra \...",Preeti Hans \n PERSONAL PROJECTS \n css3 \n Al...,"Arun Kumar \n Ghaziabad , Uttar Pradesh \n add...","Nisha shinge \n Pune , Maharashtra \n highly m...","Nitin Mhangare \n Pune , Maharashtra \n","Siddhesh Vichare \n Mumbai , Maharashtra \n mo...","Nidhi Mishra \n .net developer \n Jaipur , Raj...",...,"Nasrin Mulani \n Java developer \n Pune , Maha...",objective : \n • Proven record transform non -...,"Devasetti Venkatesh \n Bangalore , Karnataka \n",Ashay Jain \n application Developer \n Bhopal ...,"Naman Bajaj \n End , End Developer \n Indore ,...",Avik Das \n application developer ( ODI Develo...,Samadhan Kadam \n Dot Net Developer SQL Admini...,Santhosh Didigam \n Application Developer - IB...,Rohan Rane \n Software Developer Programmer \n...,Sandeep Jhalani \n Java Developer \n Bangalore...
