# In [1]:
from boto3 import s3
import boto3
import pandas as pd
import json
import re

# Names of the S3 buckets used below and the key prefix under which batches
# live. They are blank placeholders here and must be filled in before use.
# (The original unpacked two values into three names, which raises ValueError.)
data_bucket, input_bucket, output_bucket = '', '', ''
prefix = ''
s3_client = boto3.client('s3')

# Maps a workflow name to the list of batch names that belong to it.
w_dict = {}



# Matches the document number: the letter C followed by nine digits.
_RICOH_DCN_RE = re.compile(r"[C]\d{9}")
# A decimal number such as "95.5"; tried first when parsing a rule message.
_DECIMAL_RE = re.compile(r"\d+\.\d*")
# Fallback: an integer preceded by whitespace (the match keeps that whitespace).
_SPACED_INT_RE = re.compile(r"\s+\d+")


def _new_labeler_record():
    """Return a fresh dict holding the default value of every output column."""
    record = {"RICOH_DCN": ""}
    record.update({f"True Value {n + 1}": "" for n in range(7)})
    record.update({f"Change Reason {n + 1}": "" for n in range(7)})
    record.update(
        {
            "ADDRESSEE": "",
            "ADDRESS_LINE_1": "",
            "ADDRESS_LINE_2": "",
            "CITY": "",
            "STATE": "",
            "ZIP_CODE_4": "",
            "REGULATORY_APPROVAL_ID": "",
            "sub": "",
            "submissionTime": 0,
            "timeSpentInSeconds": 0,
            "ADDRESSEE_confidence": 0,
            "ADDRESS_LINE_1_confidence": 0,
            "ADDRESS_LINE_2_confidence": 0,
            "CITY_confidence": 0,
            "STATE_confidence": 0,
            "ZIP_CODE_4_confidence": 0,
            "REGULATORY_APPROVAL_ID_confidence": 0,
        }
    )
    return record


def _iter_s3_keys(bucket_name, key_prefix):
    """Yield every object key in *bucket_name* that starts with *key_prefix*."""
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=key_prefix):
        # "Contents" is absent from a page when nothing matches the prefix.
        for obj in page.get("Contents", []):
            yield obj["Key"]


def _build_labeler_record(output_key, json_content):
    """Flatten one parsed output.json document into a single row dict."""
    record = _new_labeler_record()

    # Use the document number embedded in the key; fall back to the whole key.
    dcn_match = _RICOH_DCN_RE.search(output_key)
    record["RICOH_DCN"] = dcn_match.group(0) if dcn_match else output_key

    results = json_content["inputContent"]["Results"]

    # NOTE: "ConditionSatisfied" lists every rule that was checked, whether or
    # not it was satisfied; it carries the values that were extracted.
    for rule in results["ConditionSatisfied"]:
        record[rule["field_name"]] = rule["field_value"]

    # The confidence score is only available inside the rule's message text.
    for rule in results["ConditionMissed"]:
        column = rule["field_name"] + "_confidence"
        message = rule["message"]
        matches = _DECIMAL_RE.findall(message) or _SPACED_INT_RE.findall(message)
        if matches:
            record[column] = matches[0]
        else:
            # Keep the default instead of aborting the whole run.
            print(f"No confidence score found for {column} in {output_key}")

    # Corrections by the labeler are the 'humanAnswers'; when there are
    # several, later answers overwrite earlier ones.
    for answer in json_content.get("humanAnswers") or []:
        record["submissionTime"] = answer["submissionTime"]
        record["timeSpentInSeconds"] = answer["timeSpentInSeconds"]
        record["sub"] = answer["workerMetadata"]["identityData"]["sub"]
        answer_content = answer["answerContent"]
        if answer_content:
            record.update(answer_content)

    return record


def collect_labeler_output_w(workflow):
    """Collect the labeler output of every batch in *workflow* and upload it.

    For each batch listed in ``w_dict[workflow]`` this finds the ``.TIF`` keys
    under ``{prefix}/{batch}`` in ``input_bucket``, matches each one to the
    keys under ``a2i/output/{workflow}`` in ``data_bucket`` that end with
    ``<name>/output.json``, and flattens every matching document into one row.
    The rows are written to ``labeler-output-<batch>.csv`` in the current
    directory and uploaded to ``output_bucket`` at
    ``{prefix}/{batch}/output/labeler-output.csv``.

    Args:
        workflow: Key into ``w_dict`` naming the workflow to process.

    Raises:
        KeyError: If *workflow* is not in ``w_dict``, or an output document
            lacks one of the fields read from it.
    """
    print(f"Collecting labeler output of all batches in workflow {workflow}")
    batches = w_dict[workflow]

    # The output listing depends only on the workflow, so fetch it once rather
    # than once per TIF file.
    output_keys = list(_iter_s3_keys(data_bucket, f"a2i/output/{workflow}"))
    columns = list(_new_labeler_record())

    for b in batches:
        # File names (last path component) of the .TIF objects in this batch.
        tif_files_in_batch = [
            key.rsplit("/")[-1]
            for key in _iter_s3_keys(input_bucket, f"{prefix}/{b}")
            if key.endswith(".TIF")
        ]

        rows = []
        for tif_name in tif_files_in_batch:
            # Drop the 4-character ".TIF" extension to get the output folder.
            wanted_suffix = f"{tif_name[:-4]}/output.json"
            for key in output_keys:
                if not key.endswith(wanted_suffix):
                    continue
                body = s3_client.get_object(Bucket=data_bucket, Key=key)["Body"]
                json_content = json.loads(body.read().decode("utf-8"))
                rows.append(_build_labeler_record(key, json_content))

        # Build the frame once; an empty batch still gets the header row.
        df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=columns)

        local_csv = "labeler-output-" + b + ".csv"
        df.to_csv(local_csv, index=False)
        s3_location = f"{prefix}/{b}/output/labeler-output.csv"
        s3_client.upload_file("./" + local_csv, output_bucket, s3_location)