In [1]:
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for page in PDFPage.get_pages(fh,caching=True,check_extractable=True):
                    
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(resource_manager,fake_file_handle,codec='utf-8',laparams=LAParams())
                    page_interpreter = PDFPageInterpreter(resource_manager,converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()
                    yield text
                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return
    else:
        # extract text from remote pdf file
        try:
            for page in PDFPage.get_pages(pdf_path,caching=True,check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(resource_manager,fake_file_handle,codec='utf-8',laparams=LAParams())
                page_interpreter = PDFPageInterpreter(resource_manager,converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return

In [2]:
def get_number_of_pages(file_name):
    try:
        if isinstance(file_name, io.BytesIO):
            # for remote pdf file
            count = 0
            for page in PDFPage.get_pages(file_name,caching=True,check_extractable=True):
                count += 1
            return count
        else:
            # for local pdf file
            if file_name.endswith('.pdf'):
                count = 0
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(fh,caching=True,check_extractable=True):
                        count += 1
                return count
            else:
                return None
    except PDFSyntaxError:
        return None


In [3]:
def extract_text_from_docx(doc_path):
    '''
    Helper function to extract plain text from .docx files

    :param doc_path: path to .docx file to be extracted
    :return: string of extracted text
    '''
    try:
        temp = docx2txt.process(doc_path)
        text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(text)
    except KeyError:
        return ' '

In [4]:
def extract_text_from_doc(doc_path):
    '''
    Helper function to extract plain text from .doc files

    :param doc_path: path to .doc file to be extracted
    :return: string of extracted text
    '''
    try:
        try:
            import textract
        except ImportError:
            return ' '
        temp = textract.process(doc_path).decode('utf-8')
        text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(text)
    except KeyError:
        return ' '


In [5]:
def extract_text(file_path, extension):
    '''
    Wrapper function to detect the file extension and call text
    extraction function accordingly

    :param file_path: path of file of which text is to be extracted
    :param extension: extension of file `file_name`
    '''
    text = ''
    if extension == '.pdf':
        for page in extract_text_from_pdf(file_path):
            text += ' ' + page
    elif extension == '.docx':
        text = extract_text_from_docx(file_path)
    elif extension == '.doc':
        text = extract_text_from_doc(file_path)
    return text

In [6]:
def extract_entity_sections_professional(text):
    '''
    Helper function to extract all the raw text from sections of resume specifically for 
    professionals

    :param text: Raw text of resume
    :return: dictionary of entities
    '''
    text_split = [i.strip() for i in text.split('\n')]
    entities = {}
    key = False
    for phrase in text_split:
        if len(phrase) == 1:
            p_key = phrase
        else:
            p_key = set(phrase.lower().split()) & set(cs.RESUME_SECTIONS_PROFESSIONAL)
        try:
            p_key = list(p_key)[0]
        except IndexError:
            pass
        if p_key in cs.RESUME_SECTIONS_PROFESSIONAL:
            entities[p_key] = []
            key = p_key
        elif key and phrase.strip():
            entities[key].append(phrase)
    return entities

In [8]:
import os
import spacy
if __name__ == "__main__":   
    #Load The Spacy Model Path
    #nlp = spacy.load(os.path.dirname(os.path.abspath(__file__)))
    nlp=spacy.load('/home/needyin/Needyin_Resume_parsing/demo_resume_parsing')
    
#    for resumes in os.listdir('/home/needyin/Needyin_Resume_parsing/demo_resume_parsing/example_resumes/'):
#        text_raw = extract_text(resumes,'*/.pdf')
#        text = ' '.join(text_raw.split())
#        print(text)
#        print(resumes)
#    load Resume File(PDF,DOCX etc.)
    resumes = ('/home/needyin/Needyin_Resume_parsing/demo_resume_parsing/example_resumes/resume6.pdf')
#    text_raw = extract_text(resumes,'.docx')
    text_raw = extract_text(resumes,'.pdf')
#    Split and Extract the Text
    text = ' '.join(text_raw.split())
    #print(text)
#    basepath = 'example_resumes/'
#    for resumes in os.listdir(basepath):
#        if os.path.isfile(os.path.join(basepath, resumes)):
#            print(resumes)
#    for resumes in os.listdir('/home/needyin/Needyin_Resume_parsing/demo_resume_parsing/example_resumes/'):
#        text_raw = extract_text(resumes,'*.pdf')
#        text = ' '.join(text_raw.split())
    #entity   = extract_entity_sections_grad(text_raw)
    entity   = extract_entity_sections_professional(text_raw)
    doc2 =nlp(text_raw)
    entities = {}
    for ent in doc2.ents:
        if ent.label_ not in entities.keys():
            entities[ent.label_] = [ent.text]
        else:                                
            entities[ent.label_].append(ent.text)
            for key in entities.keys():
                entities[key] = list(set(entities[key]))
    print(entities)
    #print(educat)
#print(doc2.ents)
#Save The output in the JSON Data
with open('/home/needyin/Needyin_Resume_parsing/demo_resume_parsing/sample123.json','w',encoding='utf-8') as f:
    data=json.dump(entities,f,sort_keys=False,indent=4)
    #json.dump(data,f,ensure_ascii=False,separators=(',', ': '),sort_keys=True,indent=4)


NameError: name 'io' is not defined