In [1]:
import spacy
from spacy.vectors import Vectors
import os
import pandas as pd
import numpy as np
nlp = spacy.load('en_core_web_sm')
vectors = Vectors(shape=(10000, 300))
nlp.vocab.vectors = vectors
print(nlp.vocab.vectors.shape)
import re
re_c = re.compile(r'\w+')

2022-05-05 15:10:59.188266: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-05 15:10:59.188310: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


(10000, 300)


In [2]:
# switch for debug
flag_print = True

# switch to clear existing data
flag_clear = True

#threshold value for determining section
threshold = 0.5

In [3]:
# to get extract sections from the resume -- add or remove from  'similar_to' accordingly
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    'exp' : ['job', 'internship', 'training', 'research', 'career', 'profession', 'role'
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(word)
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print(section, new_list)
        
    similar_to[section] = new_list

edu ['education', 'study', 'academic', 'institute', 'school', 'college']
exp ['job', 'internship', 'training', 'research', 'career', 'profession', 'roleproject', 'responsibility', 'description', 'work', 'workshop', 'conference']
skill ['skill', 'language', 'technology', 'framework', 'tool', 'database']
extra ['introduction', 'intro', 'achievement', 'hobby', 'link', 'additional', 'personal', 'award', 'objective', 'miscellaneous', 'interest']


In [4]:
# function to remove unnecessary symbols and stopwords 
# function to return the words in a uniform 
def modify(word):
    try:
        symbols = '''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
        mod_word = ''
        
        for char in word:
            if (char not in symbols):
                mod_word += char.lower()

        docx = nlp(mod_word)

        if (len(mod_word) == 0 or docx[0].is_stop):
            return None
        else:
            return docx[0].lemma_
    except:
        return None # to handle the odd case of characters like 'x02', etc.
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print(word, '--returned-->', modify(word))

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [6]:
# utility function to skip line when no alphabet present
def is_bad(line):
    for c in line:
        if (c.isalpha()):
            return False
    return True
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    
    for word in test_words:
        print(word, '--returned-->', is_bad(word)) 

. --returned--> True
<.> --returned--> True
Speak --returned--> False
out --returned--> False
"Eric" --returned--> False
freemail... --returned--> False


In [9]:
dict_of_data_series = {}
flag_print = False

for file_name in os.listdir(os.getcwd()+'/Data/CVs'):
    if flag_print:
        print('\n')
        print('*'*25) 
        print(file_name) 
        print('*'*25) 
        
    main_file_handler = open('Data/CVs/'+file_name, 'r', encoding='latin-1')  
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0 or is_empty(line)):
            continue
                
        # processing next line
        list_of_words_in_line = re_c.findall(line)
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(curr_line)
        section_value = {}
            
        # initializing section values to zero
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
        # updating section values    
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[word]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        # determining the next section based on section values and threshold
        most_likely_section = None
        for section in list_of_sections:
            #print '>>', section, section_value[section]
            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
        # updating the section
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

        # writing data to the pandas series
        try:
            docx = nlp(line)
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print(curr_data_series)
    main_file_handler.close()
    
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')
#data_frame.head()

  section_value[section] = max(section_value[section], float(word_token.similarity(token)))


In [10]:
data_frame.head()

Unnamed: 0,cv172,cv53,cv9,cv139,cv221,cv162,cv149,cv12,cv134,cv206,...,cv166,cv212,cv151,cv21,cv123,cv198,cv249,cv32,cv243,cv214
edu,education \n shine Star Sr . Sec . School Jala...,education \n â¢ \n Indian Institute Technolog...,education : \n Master computer \n,Cambridge Assessment operate manage University...,"government Girls high school - Bangalore , Kar...",education \n bsc \n Mumbai university \n,education \n B.Tech ECE \n Vardhaman College E...,education \n Diploma Advance Computing Advance...,education \n b - tech Computer Science Enginee...,education \n MCA Computer Science \n M G Unive...,...,â¢ BCA Vaish College Rohtak ( Haryana ) aggre...,education \n MCA Computer Science \n TMU Morad...,education \n B.E RGPV \n Board & University Bh...,education \n bachelor computer application \n ...,"education \n Mahendra Engineering College , Pe...",perform creative challenging position organiza...,education \n M.C.A Computer Science \n VINOBA ...,education \n Guru Nanak Dev University \n,education \n B.Tech Computer Science Engineeri...,education \n \n pune university \n
exp,work experience \n end developer \n Nugen Comp...,internship \n â¢ \n IBM Research Lab \n New D...,"desire Job Location : new york , New Jersey \t...",â¢ Proficient write T - SQL code retrieval / ...,see challenging position establish company off...,work experience \n fresher \n Description ...,motivate good attitude develop new skill use e...,work experience \n Software Developer \n Ktech...,work experience \n Mobile Application develope...,work experience \n Java Developer \n Java Deve...,...,work experience \n fresher \n Vision Shine Inf...,work experience \n Java Developer \n phlox glo...,"look career , provide opportunity enhancement ...",work organization ability utilize progress org...,"* excellent skill system integration , datum m...",work experience \n End Developer & Web Designe...,work experience \n Java Developer \n Global At...,work experience \n Software Developer \n Softw...,pursue challenging career Computer Science Tec...,work experience \n java \n MAXGEN - PUN \n Jun...
skill,use knowledge skill environment promote learni...,Technical Skills \n â¢ \n language : \n C / C...,desire Industry : Information Technology \t sp...,â¢ 36 + month MSBI software development exper...,TECHNICAL SKILL \n Programming Language Core J...,,"skill \n Apache ( 1 year ) , API ( 1 year ) , ...",,"Irusu Technologies - Hyderabad , Andhra Prades...",seek position utilize skill ability organizati...,...,obtain position software developer utilize pre...,"skill \n Java ( 1 year ) , Advance java ( 1 ye...",3 month experience PHP web developer ( Framewo...,"skill \n HTML5.CSS3,JS , bootstrap , PHP , MyS...","* 11 + year worldwide experience industry , so...","Richestsoft Technologies - Mohali , Punjab \n ...","skill \n Core Java , JDBC , Servlet , JSP , Hi...","skill \n C , C++,Core Java , Android",software skill \n â¢ programing skill : c - P...,"skill \n java , c , c++ ( 1 year )"
extra,"Varun Dev \n Jalandhar , Punjab \n PERSONAL Am...",ashish Kumar Yadav ( 07cs3028 ) \n mobile : +9...,objective : \n â¢ prove record transform non ...,Chirag Lakhina \n application developer \n Jin...,"Reshma Manu \n Bangalore , Karnataka \n link \...","Shubhada Jadhav \n Navi Mumbai , Maharashtra \...","N N Sriteja \n hyderabad , ANDHRA PRADESH , 50...",Amar Mane \n Software Developer - KtechBeans S...,"Naveen Kumar Aluri \n Hyderabad , Telangana \n...","Linda Paul \n Ernakulam , Kerala \n Additional...",...,"Sandeep Kumar \n Rohtak , Haryana \n achieveme...","Sanjay Jain \n Moradabad , Uttar Pradesh \n","Mani Yadav \n Noida , Uttar Pradesh \n link \n...",Arjun Reddy \n software developer \n Bangalore...,less primary secondary Muthusamy \n applicatio...,Rajesh Kumar \n frontend developer 3 Years exp...,"Tarique Faisal \n Bangalore , Karnataka \n Add...","Mahima Kad \n Batala , Punjab \n","Pendyala Prathyusha \n Kakinada , Andhra Prade...","Nitin Mhangare \n Pune , Maharashtra \n"
