In [54]:
import spacy
from spacy.vectors import Vectors
import os
import pandas as pd
import numpy as np
nlp = spacy.load('en_core_web_sm')
vectors = Vectors(shape=(10000, 300))
nlp.vocab.vectors = vectors
print(nlp.vocab.vectors.shape)
import re
re_c = re.compile(r'\w+')

(10000, 300)


In [55]:
# switch for debug
flag_print = True

# switch to clear existing data
flag_clear = True

#threshold value for determining section
threshold = 0.5

In [56]:
# to get extract sections from the resume -- add or remove from  'similar_to' accordingly
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    'exp' : ['job', 'internship', 'training', 'research', 'career', 'profession', 'role'
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(word)
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print(section, new_list)
        
    similar_to[section] = new_list

extra ['introduction', 'intro', 'achievement', 'hobby', 'link', 'additional', 'personal', 'award', 'objective', 'miscellaneous', 'interest']
exp ['job', 'internship', 'training', 'research', 'carrer', 'profession', 'roleproject', 'responsibility', 'description', 'work', 'workshop', 'conference']
skill ['skill', 'language', 'technology', 'framework', 'tool', 'database']
edu ['education', 'study', 'academic', 'institute', 'school', 'college']


In [57]:
# function to remove unnecessary symbols and stopwords 
def modify(word):
    try:
        symbols = '''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
        mod_word = ''
        
        for char in word:
            if (char not in symbols):
                mod_word += char.lower()

        docx = nlp(mod_word)

        if (len(mod_word) == 0 or docx[0].is_stop):
            return None
        else:
            return docx[0].lemma_
    except:
        return None # to handle the odd case of characters like 'x02', etc.
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print(word, '--returned-->', modify(word))

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [58]:
# utility function to skip line when no alphabet present
def is_empty(line):
    for c in line:
        if (c.isalpha()):
            return False
    return True
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    
    for word in test_words:
        print(word, '--returned-->', is_bad(word)) 

. --returned--> True
<.> --returned--> True
Speak --returned--> False
out --returned--> False
"Eric" --returned--> False
freemail... --returned--> False


In [69]:
dict_of_data_series = {}
flag_print = False

for file_name in os.listdir(os.getcwd()+'/CVs'):
    if flag_print:
        print('\n')
        print('*'*25) 
        print(file_name) 
        print('*'*25) 
        
    main_file_handler = open('CVs/'+file_name, 'r', encoding='latin-1')  
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0 or is_empty(line)):
            continue
                
        # processing next line
        list_of_words_in_line = re_c.findall(line)
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(curr_line)
        section_value = {}
            
        # initializing section values to zero
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
        # updating section values    
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[word]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        # determining the next section based on section values and threshold
        most_likely_section = None
        for section in list_of_sections:
            #print '>>', section, section_value[section]
            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
        # updating the section
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

        # writing data to the pandas series
        try:
            docx = nlp(line)
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print(curr_data_series)
    main_file_handler.close()
    
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')
#data_frame.head()

In [71]:
data_frame.head()

Unnamed: 0,.DS_Store,cv1,cv10,cv100,cv101,cv102,cv103,cv104,cv105,cv106,...,cv90,cv91,cv92,cv93,cv94,cv95,cv96,cv97,cv98,cv99
extra,���bud1������������%����������������������...,mahboob alam \n software developer - dynamix i...,"deepali chaudhari \n mumbai , maharashtra \n","ramakrishnan k \n cuddalore , tamil nadu \n mc...",jayaprakash andamuthu \n application developer...,rajeev gupta \n additional information \n * se...,alaguraj ramachandran \n android application d...,aiswarya chandrasekaran \n application develop...,ashay jain \n application developer \n bhopal ...,"heena patel \n surat , gujarat \n contribute o...",...,karthikeyan p \n â¢ link end datum tool end ....,"bishvajit bakshi \n bangalore , karnataka \n a...","darpana nandy \n executive , data science - th...","harish venkataraman \n bangalore , karnataka \...",sharad kakran \n link \n https://github.com/sh...,"ketan bhatheja \n bangalore , karnataka \n - u...","rahul vhayaskar \n dhule , maharashtra \n link...",babli bisht \n android application developer h...,reshma patil \n jr. ios application developer ...,rajith r \n web application developer \n chenn...
exp,,-PRON- want work progressive organization -PRO...,work experience \n team member(software develo...,"-PRON- hard worker , enthusiastic , responsibl...",â¢ with good knowledge develop application an...,* currently work cognizant technology solution...,work experience \n application application dev...,â¢ good communication interpersonal skill eas...,â¢ currently work application developer atos ...,work experience \n iphone application develope...,...,senior research analyst relationship science (...,work experience \n data analyst \n in college ...,"work experience \n executive , data science \n...",work experience \n data science intern \n talv...,work experience \n data science intern \n,"work experience \n data science & analytics , ...",work experience \n android application develop...,â¢ to work key player challenging creative en...,work experience \n jr. ios application develop...,"dedicated work , innovative idea dynamic chall..."
skill,,"skill \n html5 ( 2 year ) , css3 ( 2 year ) , ...",be information technology \n nagpur university...,skill \n communication skills . \n fast learne...,b.sc computer technology \n diploma informatio...,application developer - cognizant technology s...,skill \n â¢ operating system : windows 10 ubu...,"â¢ primary skill constitute cobol , jcl , db2...","â¢ expertise java , groovy language related f...",to pursue growth experience field information ...,...,â¢ validate information give island ( factset...,"skill \n microsoft excel , microsoft ppt , mic...","skill \n data analysis ( less 1 year ) , data ...","tool - nlp , elasticsearch , python , alchemyl...","cappius technologies - hyderabad , telangana \...",- require data gathering database issue report...,"ecs technologies pvt . ltd. - pune , maharasht...","interglobe technologies - gurgaon , haryana \n...",technical skills:- \n tool : xcode ( 5 + ) \n ...,contribute organization new trend technology \...
edu,,education \n pg - diploma advance software dev...,education \n,education \n be \n dhanalakshmi srinivasan col...,education \n kongu arts science college erode ...,education \n b.e. engineering \n bengal engine...,education \n b.e computer science engg \n vel ...,education \n bachelor of engineering electroni...,,,...,education \n bachelor engineering computer sci...,education \n m.sc . statistics \n university a...,"india statistical institute - kolkata , west b...",education \n b.e. information science \n dayan...,analyzed customer transaction behavior shoppin...,education \n m.sc . economics \n madras school...,education \n b.e engineering \n north maharash...,education \n mtech electronics ( instrumentati...,education \n b.e \n indira college engineering...,"kodaikanal christian college - kodaikanal , ta..."
