## Import Statements

In [None]:
import re
import json
import csv
import boto3
import time
import pandas as pd
result = {}
import os

#### Make Sure that the "saved_folder" is already created in the same S3 bucket

### INPUT: enter initial document details

In [None]:
#initial document details
#NOTE: every right-hand side below is intentionally blank; fill each one in before running this cell
s3Name = #S3 bucket name: passed to Textract as the document bucket and used as the upload target for the text files
docuName = #object key of the document that Textract should read from that bucket
saved_folder = #folder (key prefix) in the bucket where the per-page Textract text files are uploaded
box_file_name = #value written to the "Box Number" column of every CSV row
csv_file_name = #path of the CSV output file that get_csv_test opens for writing

## Textract

#### Assuming API (Boto3) is already authenticated
#### refer to https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html for Boto3 authentication
#### refer to https://docs.aws.amazon.com/textract/latest/dg/async.html for textract documentation

In [None]:
#start Textract job
def startJob(s3BucketName, objectName):
    """Submit an asynchronous Textract text-detection job for one S3 object.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        objectName: Key of the document inside that bucket.

    Returns:
        The "JobId" value from the Textract start response.
    """
    textract = boto3.client(service_name='textract', region_name='us-west-2')
    document_location = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        },
    }
    started = textract.start_document_text_detection(
        DocumentLocation=document_location,
    )
    return started["JobId"]

In [None]:
#get current status of the Textract job, checking in 5 second intervals
def isJobComplete(jobId):
    """Block until the Textract job is no longer "IN_PROGRESS".

    The job is polled every 5 seconds (with an initial 5 second wait) and
    each observed status is printed.

    Args:
        jobId: Identifier returned when the Textract job was started.

    Returns:
        The first "JobStatus" value that is not "IN_PROGRESS".
    """
    time.sleep(5)
    textract = boto3.client(service_name='textract', region_name='us-west-2')

    while True:
        status = textract.get_document_text_detection(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
        # Still running: wait before asking again.
        time.sleep(5)

In [None]:
#once job is complete get the results
def getJobResults(jobId):
    """Collect every result page of a Textract text-detection job.

    Result pages are requested one at a time, 5 seconds apart, following
    the "NextToken" value of each response until none is returned.

    Args:
        jobId: Identifier of the Textract job.

    Returns:
        A list with one raw Textract response per result page, in the
        order they were received.
    """
    time.sleep(5)
    textract = boto3.client(service_name='textract', region_name='us-west-2')

    collected = []
    request = {"JobId": jobId}
    while True:
        batch = textract.get_document_text_detection(**request)
        collected.append(batch)
        print("Resultset page recieved: {}".format(len(collected)))

        token = batch.get('NextToken')
        if not token:
            return collected

        # More pages remain: carry the token into the next request.
        request["NextToken"] = token
        time.sleep(5)

In [None]:
#start textract and check if job is complete
def start_textract(s3BucketName, documentName):
    """Run a Textract text-detection job to completion and fetch its results.

    Args:
        s3BucketName: Name of the S3 bucket that holds the document.
        documentName: Key of the document inside that bucket.

    Returns:
        The list of Textract result pages returned by getJobResults.

    Raises:
        RuntimeError: If the job finishes with a status other than
            "SUCCEEDED" or "PARTIAL_SUCCESS".
    """
    jobId = startJob(s3BucketName, documentName)
    print("Started job with id: {}".format(jobId))

    # isJobComplete returns the final status string, which is always truthy,
    # so the status has to be compared explicitly to detect a failed job.
    status = isJobComplete(jobId)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        raise RuntimeError(
            "Textract job {} ended with status {}".format(jobId, status)
        )

    response = getJobResults(jobId)
    print('uploading...')
    return response

In [None]:
#upload the finished textract files back to the s3 bucket
#this is needed for comprehend to run later
def upload_text1(response):
    """Write detected text to one file per page and upload each file to S3.

    Every "LINE" block is appended to a local file named
    "Output<page>.txt". A file is uploaded to the bucket ``s3Name`` under
    the ``saved_folder`` prefix as soon as the next page starts, and the
    final file is uploaded after the last block has been written.

    Args:
        response: Iterable of Textract result pages, each a dict with a
            "Blocks" list.
    """
    s3_client = boto3.client('s3')

    def _upload(name):
        # Store the local file under "<saved_folder>/<name>" in the bucket.
        s3_client.upload_file(name, s3Name, saved_folder + "/{}".format(name))

    current_page = None
    file_name = None
    text_file = None
    try:
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] != "LINE":
                    continue
                if item["Page"] != current_page:
                    # A new page starts: finish and ship the previous file.
                    if text_file is not None:
                        text_file.close()
                        _upload(file_name)
                    current_page = item["Page"]
                    file_name = "Output" + str(current_page) + ".txt"
                    text_file = open(file_name, "w")
                text_file.write(item["Text"] + '\n')
    finally:
        # Closing twice is harmless; this guarantees no handle leaks on error.
        if text_file is not None:
            text_file.close()

    # The last page has no following page to trigger its upload.
    if file_name is not None:
        _upload(file_name)
    print('finished')

In [None]:
#main function that calls other helper functions
def convert_card_image():
    """Run Textract on the configured document and upload the resulting text.

    Reads the module-level ``s3Name`` and ``docuName`` settings.
    """
    textract_pages = start_textract(s3Name, docuName)
    upload_text1(textract_pages)

### RUN: function below to start textract

In [None]:
#run this call to start the Textract job for the configured document, wait for it
#to finish, and upload the resulting text files to the s3 bucket
convert_card_image()

### INPUT: AWS Comprehend
#### The values below are sample inputs. The dARN, input, and output values are specific to your model and file locations.
#### For more information, refer: https://docs.aws.amazon.com/comprehend/latest/dg/API_DetectEntities.html#comprehend-DetectEntities-request-EndpointArn

In [None]:
#general comprehend
#  *_dARN      -> passed as DataAccessRoleArn when the job is started
#  *_s3_input  -> passed as InputDataConfig["S3Uri"]
#  *_s3_output -> passed as OutputDataConfig["S3Uri"]
general_dARN = "arn:aws:iam::12312132:role/test-role"
general_s3_input = "s3://input-path"
general_s3_output = "s3://output-path"


#decision comprehend
#  decision_ARN is passed as EntityRecognizerArn (the custom recognizer for this job)
decision_dARN = "arn:aws:iam::12312132:role/test-role"
decision_ARN = "arn:aws:comprehend:us-west-2:12312312:testmodel"
decision_s3_input = "s3://input-path"
decision_s3_output = "s3://output-path"

#bankloc comprehend
#  bankloc_ARN is passed as EntityRecognizerArn (the custom recognizer for this job)
bankloc_dARN = "arn:aws:iam::12312132:role/test-role"
bankloc_ARN = "arn:aws:comprehend:us-west-2:12312312:testmodel"
bankloc_s3_input = "s3://input-path"
bankloc_s3_output = "s3://output-path"

### Start Comprehend Models

#### refer to https://docs.aws.amazon.com/comprehend/latest/dg/functionality.html for general comprehend documentation
#### refer to https://docs.aws.amazon.com/comprehend/latest/dg/auto-ml.html for custom comprehend documentation

In [None]:
#starts general comprehend analysis job
def start_general_comprehend():
    """Start the general (built-in) Comprehend entity-detection job.

    Uses the module-level ``general_dARN``, ``general_s3_input`` and
    ``general_s3_output`` settings.

    Returns:
        The "JobId" value from the Comprehend start response.
    """
    client = boto3.client(service_name='comprehend', region_name='us-west-2')
    job_request = {
        "DataAccessRoleArn": general_dARN,
        "LanguageCode": "en",
        "InputDataConfig": {
            "InputFormat": "ONE_DOC_PER_LINE",
            "S3Uri": general_s3_input,
        },
        "OutputDataConfig": {
            "S3Uri": general_s3_output,
        },
    }
    return client.start_entities_detection_job(**job_request)["JobId"]

In [None]:
#starts decision comprehend analysis job (decision entity only)
def start_decision_comprehend():
    """Start the custom Comprehend entity-detection job for decisions.

    Uses the module-level ``decision_dARN``, ``decision_ARN``,
    ``decision_s3_input`` and ``decision_s3_output`` settings.

    Returns:
        The "JobId" value from the Comprehend start response.
    """
    client = boto3.client(service_name='comprehend', region_name='us-west-2')
    job_request = {
        "DataAccessRoleArn": decision_dARN,
        "EntityRecognizerArn": decision_ARN,
        "LanguageCode": "en",
        "InputDataConfig": {
            "InputFormat": "ONE_DOC_PER_LINE",
            "S3Uri": decision_s3_input,
        },
        "OutputDataConfig": {
            "S3Uri": decision_s3_output,
        },
    }
    return client.start_entities_detection_job(**job_request)["JobId"]

In [None]:
#starts bankloc comprehend analysis job (other entities) 
def start_bankloc_comprehend():
    """Start the custom Comprehend entity-detection job for banks/locations.

    Uses the module-level ``bankloc_dARN``, ``bankloc_ARN``,
    ``bankloc_s3_input`` and ``bankloc_s3_output`` settings.

    Returns:
        The "JobId" value from the Comprehend start response.
    """
    client = boto3.client(service_name='comprehend', region_name='us-west-2')
    job_request = {
        "DataAccessRoleArn": bankloc_dARN,
        "EntityRecognizerArn": bankloc_ARN,
        "LanguageCode": "en",
        "InputDataConfig": {
            "InputFormat": "ONE_DOC_PER_LINE",
            "S3Uri": bankloc_s3_input,
        },
        "OutputDataConfig": {
            "S3Uri": bankloc_s3_output,
        },
    }
    return client.start_entities_detection_job(**job_request)["JobId"]

In [None]:
#get current status of the comprehend job, checking in 20 second intervals
def isJobComplete_comprehend(jobID, type_comprehend):
    """Block until a Comprehend entity-detection job stops running.

    The job is polled every 20 seconds while its status is still one of the
    non-final values ("SUBMITTED", "IN_PROGRESS", "STOP_REQUESTED"). A job
    that was just started reports "SUBMITTED", so that status must be
    waited on as well, otherwise the function returns before any work ran.

    Args:
        jobID: Identifier returned when the Comprehend job was started.
        type_comprehend: Label for the job; accepted for the callers'
            benefit but not used by this function.

    Returns:
        The first "JobStatus" value that is not a pending status.
    """
    pending_statuses = ("SUBMITTED", "IN_PROGRESS", "STOP_REQUESTED")

    time.sleep(1)
    client = boto3.client(service_name='comprehend', region_name='us-west-2')

    def _current_status():
        # Extract just the status field from the describe response.
        description = client.describe_entities_detection_job(JobId=jobID)
        return description['EntitiesDetectionJobProperties']['JobStatus']

    status = _current_status()
    while status in pending_statuses:
        print("working...")
        # Sleep before re-checking so no time is wasted after completion.
        time.sleep(20)
        status = _current_status()

    return status

In [None]:
#function to start all comprehends together and print when each is done
def start_all_comprehend():
    """Start the three Comprehend jobs, wait for each, and print their statuses.

    All jobs are started first so they run concurrently on the service
    side; they are then waited on in the order general, decision, bankloc.
    """
    job_ids = (
        start_general_comprehend(),
        start_decision_comprehend(),
        start_bankloc_comprehend(),
    )
    labels = ("general", "decision", "comprehend")

    final_statuses = [
        isJobComplete_comprehend(job_id, label)
        for job_id, label in zip(job_ids, labels)
    ]
    for status in final_statuses:
        print(status)
    
    

### RUN: Start all comprehend functions

In [None]:
#start the general, decision and bankloc comprehend jobs and wait for each to finish
start_all_comprehend()

### INPUT: Comprehend Output File Names

In [None]:
# Local file names of the downloaded Comprehend outputs.
# NOTE(review): the parsing functions in this file open the literal names
# "General.txt", "Decision.txt" and "BankLoc.txt" instead of reading these
# variables, so changing a value here has no effect on them -- confirm intent.
bankloc_text_output = "BankLoc.txt"
general_text_output = "General.txt"
decision_text_output = "Decision.txt"

### Parse through Comprehend Outputs

In [None]:
#parse through general comprehend and identify banks or locations
def initial_comprehend(file_name):
    """Collect the general Comprehend entities recorded for one text file.

    Reads "General.txt" from the current directory, one JSON record per
    line, and keeps the records whose "File" equals ``file_name``.

    Args:
        file_name: Value to match against each record's "File" field.

    Returns:
        A dict mapping each matching record's "Line" value to a dict of
        ``{entity "Type": entity "Text"}``. When a record has several
        entities of the same type, the last one wins.
    """
    final_output = {}
    # The context manager guarantees the file is closed, even on a parse error.
    with open("General.txt") as general_file:
        for raw_line in general_file:
            if not raw_line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw_line)
            if record["File"] != file_name:
                continue
            entities = final_output[record["Line"]] = {}
            for entity in record["Entities"]:
                entities[entity["Type"]] = entity["Text"]
    return final_output

In [None]:
def if_normal(file_name):
    """Pick the first organization and location found by general Comprehend.

    Args:
        file_name: Text file whose general Comprehend results are scanned.

    Returns:
        A two-item list ``[organization, location]``; an item stays "No"
        when no entity of that type was found.
    """
    organization = "No"
    location = "No"
    for entities in initial_comprehend(file_name).values():
        # Only fill a slot while it still holds the "No" placeholder.
        if organization == "No" and "ORGANIZATION" in entities:
            organization = entities["ORGANIZATION"]
        if location == "No" and "LOCATION" in entities:
            location = entities["LOCATION"]
    return [organization, location]

In [None]:
#parse through custom comprehend and identify decision
def if_decision(file_name, line_num):
    """Look up the "Decision" entity recorded for one line of a text file.

    Reads "Decision.txt" from the current directory, one JSON record per
    line, and inspects the records whose "File" and "Line" match.

    Args:
        file_name: Value to match against each record's "File" field.
        line_num: Value to match against each record's "Line" field.

    Returns:
        The "Text" of the last matching entity of type "Decision", or an
        empty string when there is none.
    """
    decision = ""
    # The context manager guarantees the file is closed, even on a parse error.
    with open("Decision.txt") as decision_file:
        for raw_line in decision_file:
            if not raw_line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw_line)
            if record["File"] != file_name or record["Line"] != line_num:
                continue
            for entity in record["Entities"]:
                if entity.get("Type") == "Decision":
                    decision = entity["Text"]
    return decision

In [None]:
#parse through bankloc comprehend
def if_bankloc(file_name):
    """Pick the bank and location found by the custom bankloc Comprehend job.

    Reads "BankLoc.txt" from the current directory, one JSON record per
    line, and inspects the records whose "File" equals ``file_name``.

    Args:
        file_name: Value to match against each record's "File" field.

    Returns:
        A two-item list ``[bank, location]`` holding the "Text" of the last
        "BANK" and "LOCATION" entities seen; an item stays "No" when no
        entity of that type was found.
    """
    bank = "No"
    loc = "No"
    # The context manager guarantees the file is closed, even on a parse error.
    with open("BankLoc.txt") as bankloc_file:
        for raw_line in bankloc_file:
            if not raw_line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            record = json.loads(raw_line)
            if record["File"] != file_name:
                continue
            for entity in record["Entities"]:
                entity_type = entity.get("Type")
                if entity_type == "BANK":
                    bank = entity["Text"]
                elif entity_type == "LOCATION":
                    loc = entity["Text"]
    return [bank, loc]

In [None]:
#combine custom and general comprehend outputs: keep the custom result and fall back to the general result wherever the custom one is "No"
def compare(file_name):
    """Merge custom and general Comprehend bank/location results for a file.

    Args:
        file_name: Text file whose results are merged.

    Returns:
        The ``[bank, location]`` list from the custom job, with every "No"
        entry replaced by the corresponding general-job entry.
    """
    merged = if_bankloc(file_name)
    fallback = if_normal(file_name)
    for slot in (0, 1):
        if merged[slot] == "No":
            merged[slot] = fallback[slot]
    return merged

### Regex Functions

In [None]:
#returns abbrev if one exists within the line number (line_num) of the file (file_name)
def abbrev_finder(file_name, line_num):
    """Find a "<letters>-<digits>" abbreviation on one line of a text file.

    Args:
        file_name: Path of the text file to read.
        line_num: Zero-based index of the line to inspect.

    Returns:
        A two-item list ``[abbrev, line]`` where ``abbrev`` is the first
        match on the line (or None when there is no match) and ``line`` is
        the full text of the line, including its line ending.

    Raises:
        IndexError: If the file has no line at index ``line_num``.
    """
    with open(file_name, 'r') as text_file:
        lines = text_file.readlines()

    line = lines[line_num]
    # [A-Za-z] rather than [A-z]: the latter also matches [ \ ] ^ _ and `.
    match = re.search(r'[A-Za-z]+.?-.?[0-9]+', line)
    return [match.group(0) if match else None, line]

In [None]:
#returns paid if one exists within the line number (line_num) of the file (file_name)
def paid_finder(file_name, num):
    """Find the word "PAID" on one line of a text file.

    Args:
        file_name: Path of the text file to read.
        num: Zero-based index of the line to inspect.

    Returns:
        The matched text ("PAI" followed by one or more "D"), or an empty
        string when the line contains no match.

    Raises:
        IndexError: If the file has no line at index ``num``.
    """
    with open(file_name, 'r') as text_file:
        lines = text_file.readlines()

    # The trailing '+' applies to the final 'D' only, so "PAIDD" also matches.
    match = re.search(r'PAID+', lines[num])
    return match.group(0) if match else ""

In [None]:
#returns the first bank name found on any line of the file (file_name), or an empty string if none is found
def get_bank(file_name):
    """Return the first bank name that can be extracted from a text file.

    Each line is checked in order with two rules:

    1. If the line starts with "Bank" or "bank" and does not contain
       "Loan", the text before the first " - " is returned.
    2. Otherwise, if "Bank" or "bank" occurs after at least one other
       character, the text preceding it plus "Bank" is returned.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The extracted bank name, or an empty string when no line matches.
    """
    leading_patterns = (r'^Bank.*', r'^bank.*')
    embedded_patterns = (r'.+?(?=Bank)', r'.+?(?=bank)')

    with open(file_name, 'r') as text_file:
        lines = text_file.readlines()

    for line in lines:
        for pattern in leading_patterns:
            leading = re.search(pattern, line)
            if leading:
                if "Loan" not in leading.group(0):
                    return leading.group(0).split(" - ")[0]
                # A "Loan" line is not a bank name; try the embedded rule.
                break

        for pattern in embedded_patterns:
            embedded = re.search(pattern, line)
            if embedded:
                return embedded.group(0) + "Bank"
    return ""

In [None]:
#identifies whether "County" or "county" appears on any line of the file and returns the first such line
def county_in(file_name):
    """Report whether a text file mentions "County" or "county".

    Args:
        file_name: Path of the text file to read.

    Returns:
        ``[1, line]`` with the first line containing "County" or "county"
        (line ending included), or ``[0, ""]`` when no line does.
    """
    # Iterating inside the context manager stops reading at the first hit
    # and still closes the file.
    with open(file_name) as text_file:
        for line in text_file:
            if 'County' in line or 'county' in line:
                return [1, line]
    return [0, ""]

### Combine Regex + Comprehend Functions

In [None]:
#combine regex + comprehend outputs to get all identified entities 
def rest_finder(file):
    """Merge the entities from 'output18.txt' into the module-level ``result`` dict.

    Each line of 'output18.txt' is parsed as a JSON record; its entities are
    stored under the key "line: <Line>" in ``result``.

    NOTE(review): this function looks stale and cannot run as written:
      * the ``file`` parameter is never used; the input name is hard-coded;
      * ``abbrev_finder`` and ``paid_finder`` are defined in this file with
        two parameters (file name, line index) but are called here with one;
      * ``paid_finder`` returns a string in this file, so indexing it with
        [0], [1], [2] would not yield paid / line / page-number values;
      * ``decision`` is not defined anywhere in the visible source.
    Nothing in the visible source calls this function -- confirm whether it
    should be repaired or removed.
    """
    file2 = open('output18.txt','r')  # NOTE(review): never closed
    lines = file2.readlines()
    for line in lines:
        line_dict = json.loads(line)
        line_int = int(line_dict["Line"])
        line_number = "line: " + str(line_dict["Line"])
        # Start a fresh entry for this line in the shared result dict.
        result[line_number] = {}
        for entity in line_dict["Entities"]:
            result[line_number][entity["Type"]] = entity["Text"]
            # The four assignments below repeat for every entity and only run
            # when the record has at least one entity.
            result[line_number]["Abbrev"] = abbrev_finder(line_int)
            result[line_number]["Paid"] = paid_finder(line_int)[0]
            result[line_number]["Line"] = paid_finder(line_int)[1]
            result[line_number]["Page Number"] = paid_finder(line_int)[2]
    #return (result)
    decision()
    return (result)

### Compile into CSV

In [None]:
#compile outputs into one singular csv file
def get_csv_test():
    """Compile the Comprehend and regex results for every page into one CSV.

    Every file in the current directory whose name ends with "txt" is
    treated as a page of Textract output, except the Comprehend output
    files and "requirements.txt". For each line of a page on which general
    Comprehend found a DATE or QUANTITY entity, one row is written to the
    file named by the module-level ``csv_file_name``.

    The "Bank_Confidence" and "Loc_Confidence" columns are "1" when the
    custom and general Comprehend jobs agree on the value and "0"
    otherwise.
    """
    header = ["Bank Name", "Location", "Date", "Abbrev", "Decision", "Amount", "Paid", "Box Number", "Page Number", "Line", "Bank_Confidence", "Loc_Confidence", "Line", "County", "County Text"]
    # Files that are not page text. "BankLoc.txt" and "General.txt" are the
    # names the parsers in this file actually open; they never produced rows,
    # so skipping them only avoids needless parsing.
    skipped = {"Decision.txt", "BankLocCode.txt", "requirements.txt", "BankLoc.txt", "General.txt"}

    files = [name for name in os.listdir(os.getcwd()) if name.endswith("txt")]
    with open(csv_file_name, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        counter = 1
        print(files)
        for file_name in files:
            if file_name in skipped:
                continue
            print("Working on " + str(counter))
            counter += 1

            both = compare(file_name)
            bankloc = if_bankloc(file_name)
            normal = if_normal(file_name)
            bank_c = 1 if bankloc[0] == normal[0] else 0
            loc_c = 1 if bankloc[1] == normal[1] else 0

            # These values depend only on the file, so compute them once
            # rather than once per CSV row.
            page_match = re.search('[0-9]+', file_name)
            # A file name without digits yields an empty page number
            # instead of raising AttributeError.
            page_number = page_match.group(0) if page_match else ""
            box_number = box_file_name
            county = county_in(file_name)

            general = initial_comprehend(file_name)
            for line in general:
                print(line)
                entities = general[line]
                if 'DATE' not in entities and 'QUANTITY' not in entities:
                    continue
                date = entities.get('DATE', "")
                amount = entities.get('QUANTITY', "")
                print(file_name + " , " + str(line))

                line_index = int(line)
                abbrev, line_text = abbrev_finder(file_name, line_index)
                paid = paid_finder(file_name, line_index)
                decision = if_decision(file_name, line_index)
                writer.writerow([both[0], both[1], date, abbrev, decision, amount, paid, box_number, page_number, str(line_index), str(bank_c), str(loc_c), line_text, county[0], county[1]])

### RUN: function to download csv

In [None]:
#build the CSV (written to csv_file_name) from the text files in the current working directory
get_csv_test()