In [58]:
import boto3
import boto3
import time

def startJob(s3BucketName, objectName, region_name='ap-south-1'):
    """Start an asynchronous Textract text-detection job for one S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Key of the document inside that bucket.
        region_name: AWS region used for the Textract client. Defaults to
            'ap-south-1', the value this function previously hard-coded.

    Returns:
        The "JobId" value taken from the start_document_text_detection
        response.
    """
    client = boto3.client('textract', region_name=region_name)
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName,
            }
        }
    )
    return response["JobId"]

def isJobComplete(jobId, region_name='ap-south-1', poll_interval=5):
    """Poll a Textract text-detection job until it leaves "IN_PROGRESS".

    Args:
        jobId: Job identifier, as returned by startJob.
        region_name: AWS region used for the Textract client. Defaults to
            'ap-south-1', the value this function previously hard-coded.
        poll_interval: Seconds to sleep before each status request.
            Defaults to 5, the value this function previously hard-coded.

    Returns:
        The last "JobStatus" string received. This is a status string, not
        a bool: a value such as "FAILED" is still truthy, so callers should
        compare the result against "SUCCEEDED" instead of testing it
        directly.
    """
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    client = boto3.client('textract', region_name=region_name)
    while True:
        # Wait before every request, the first one included, so a freshly
        # started job is not queried immediately.
        time.sleep(poll_interval)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status

def getJobResults(jobId, region_name='ap-south-1'):
    """Collect every response page of a Textract text-detection job.

    The first request carries only the job id; each following request
    passes the "NextToken" of the previous response, until a response
    arrives without one (or with an empty one).

    Args:
        jobId: Job identifier, as returned by startJob.
        region_name: AWS region used for the Textract client. Defaults to
            'ap-south-1', the value this function previously hard-coded.

    Returns:
        List of the raw get_document_text_detection responses, in the
        order they were received.
    """
    client = boto3.client('textract', region_name=region_name)
    pages = []
    request = {'JobId': jobId}

    while True:
        response = client.get_document_text_detection(**request)
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))

        nextToken = response.get('NextToken')
        if not nextToken:
            return pages
        # Ask for the page that follows the one just received.
        request['NextToken'] = nextToken

# Document


s3BucketName = "textract-console-ap-south-1-f12c9455-7ed3-4350-9a19-89a52cb444a"

# Collect the key of every object in the bucket. The listing is read through
# a paginator so that all pages are visited, not just the first response.
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket=s3BucketName)
list_of_documents = []
for page in result:
    # A page may come back without a "Contents" entry; treat that as empty.
    for key in page.get("Contents", []):
        list_of_documents.append(key["Key"])

# Run Textract on each document and save its detected lines to
# resume_<n>.txt, numbering the files 1, 2, 3, ... in processing order.
filecounter = 1
for documentName in list_of_documents:
    jobId = startJob(s3BucketName, documentName)
    print("Started job with id: {}".format(jobId))

    # isJobComplete returns the final status *string*, which is truthy even
    # for a failed job, so the status has to be compared explicitly. Without
    # this check a failed job would reuse the previous document's response
    # (or hit an undefined name on the first iteration).
    status = isJobComplete(jobId)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        print("Skipping {}: job finished with status {}".format(documentName, status))
        continue
    response = getJobResults(jobId)

    # Keep only the text of LINE blocks, across all result pages.
    text_lines = [
        item["Text"]
        for resultPage in response
        for item in resultPage.get("Blocks", [])
        if item["BlockType"] == "LINE"
    ]

    f_name = 'resume' + '_' + str(filecounter) + '.txt'
    # Advance by one; the previous "filecounter += filecounter" doubled the
    # counter and produced resume_1, resume_2, resume_4, resume_8, ...
    filecounter += 1
    # Written as UTF-8 because the analysis cell reads these files back with
    # encoding="utf-8"; the platform default encoding may differ.
    with open(f_name, 'w', encoding='utf-8') as f:
        for x in text_lines:
            f.write(x + '\n')
    print("writing file completed ", f_name)

Started job with id: cc7509c971d43ee032e49ed9bce751b2a0d4f256dad1d0b6c16d9e496b51a1a7
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
writing file completed  resume_1.txt
Started job with id: edd2d488d88aedd3e013b6c574b7a3fefa161db04f63987325d642f69451bca5
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
writing file completed  resume_2.txt


In [54]:
import os
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tag import pos_tag
import string
import spacy
import en_core_web_sm


In [73]:
def load_files(path=r"E:\DataScience\Job_Assignments", prefix='resume'):
    """List the names of the regular files in a directory that share a prefix.

    Args:
        path: Directory to scan. Defaults to the directory this function
            previously hard-coded; it is now a raw string so the
            backslashes are never read as escape sequences.
        prefix: Only names starting with this string are returned.
            Defaults to 'resume'.

    Returns:
        Sorted list of bare file names (not full paths). Sub-directories
        are excluded even when their name matches the prefix. Sorting makes
        the processing order repeatable, since os.listdir returns entries
        in arbitrary order.

    Raises:
        OSError: Propagated from os.listdir when path cannot be listed.
    """
    return sorted(
        name
        for name in os.listdir(path)
        if name.startswith(prefix) and os.path.isfile(os.path.join(path, name))
    )

def preprocess(file_content):
    """Strip punctuation from a text and flatten it into one token string.

    The text is split on newlines. In each line every character of
    string.punctuation is replaced by a space, and the line is then
    tokenised with word_tokenize.

    Args:
        file_content: The full text to clean, as a single string.

    Returns:
        All tokens in their original order, each one preceded by a single
        space (so a non-empty result starts with a space). An empty string
        is returned when no tokens are produced.
    """
    punctuation = string.punctuation
    # Map each punctuation character to a space so adjacent words stay apart.
    to_spaces = str.maketrans(punctuation, ' ' * len(punctuation))

    pieces = []
    for line in file_content.split('\n'):
        tokens = word_tokenize(line.translate(to_spaces))
        pieces.extend(' ' + token.strip() for token in tokens)
    return ''.join(pieces)

# Build the spaCy pipeline and the NLTK chunk parser once, up front; neither
# depends on the file being processed, so there is no need to recreate them
# on every loop iteration.
nlp = en_core_web_sm.load()

# NOTE(review): "<NNP.>" appears to require exactly one extra character after
# NNP (e.g. NNPS), unlike the "<NNP.*>" used later in the same rule -- confirm
# whether "<NNP.*>+" was intended. The pattern is left unchanged here.
pattern = r"""
            Date : {<NNP><CD><NNP>?<CD>}
           ORG : {<NNP.>+<IN.*>?<NNP.*>?}
           """
cp = nltk.RegexpParser(pattern)

file_list = load_files()
for filename_read in file_list:
    # NOTE(review): load_files returns bare file names, so this open resolves
    # against the current working directory -- confirm the notebook is run
    # from the directory that load_files scans.
    # The with-block closes the handle; it was previously left open.
    with open(filename_read, "r", encoding="utf-8") as fp:
        file_content = fp.read()
    print(filename_read)
    print('********************************')

    p_text = preprocess(file_content)

    # Approach 1: spaCy named entities, keeping ORG and DATE entities whose
    # text is longer than three characters.
    print('*********Using Spacy************')
    doc = nlp(p_text)
    for x in doc.ents:
        if len(x.text) > 3 and x.label_ in ('ORG', 'DATE'):
            print(x.text)

    # Approach 2: POS-tag the text and chunk it with the grammar above.
    print('*********Using RegexpParser Pos tagging************')
    word_token = nltk.word_tokenize(p_text)
    tagged_words = nltk.pos_tag(word_token)
    cs = cp.parse(tagged_words)

    for a in cs:
        # Chunks are subtrees; tokens outside any chunk are plain tuples.
        if isinstance(a, nltk.tree.Tree) and a.label() in ("Date", "ORG"):
            print(" ".join([lf[0] for lf in a.leaves()]))

resume_1.txt
********************************
*********Using Spacy************
Krishna Sai Gandhi Group Of Institutions
GPA 68
Saraswathi Junior
ES6 Backend
STATE MANAGEMENT
25th
Microsoft
React CloudArmor Supporting Reputation Based Trust Management for Cloud Services Trust Management
Privacy Security and Availability
*********Using RegexpParser Pos tagging************
GPA 8 3
Projects Women
Services Trust
resume_2.txt
********************************
*********Using Spacy************
February 2019
March 2019
July 2016 2018
American Telephone Telegraph Company
National Institute of Designing Exam
May 2012 June 2016
March 2010
April 2012 11th and 12th
March 2010
April 2012 10
Sree Ayyappa Public School Bokaro Bhilai
*********Using RegexpParser Pos tagging************
February 2019 March 2019
July 2016 2018
May 2012 June 2016
Bhilai 8 6
March 2010 April 2012
Bhilai 78 2
March 2010 April 2012
Bhilai 9 8
