In [3]:
import boto3
import pandas as pd
import re
import json
import pandas as pd
from functools import reduce
from numpy import nan
from io import StringIO

pd.set_option('display.max_colwidth', 100)

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# default_user_pool_id to use if it is missing in the records
default_user_pool_id = 'us-west-2_1j6F4wzAn'

# Create a Cognito client using the session
cognito_client = boto3.client('cognito-idp')



In [4]:
# the date here will be possibly of a greater range since we are filtering by the labelers submission date
# so need to look into a wider range of dates
LABELERS_OUTPUT_PREFIX = 'a2i/output/'
LABELERS_DATA_OUTPUT_FILENAME = 'labeler-output.csv'
OUTPUT_PREFIX = 'reports/'
LABELERS_DATA_OUTPUT_PREFIX = f'{OUTPUT_PREFIX}labelers_output_data/'



LABELERS_OUTPUT_START_DAY = '07102023'
LABELERS_OUTPUT_END_DAY = '07242023'
START_DAY = '07212023'
END_DAY = '07222023'
INTERVAL_HOUR = 23
START_DAY_TIME = f'{START_DAY[4:8]}-{START_DAY[:2]}-{START_DAY[2:4]}T{INTERVAL_HOUR}'
END_DAY_TIME = f'{END_DAY[4:8]}-{END_DAY[:2]}-{END_DAY[2:4]}T{INTERVAL_HOUR}'
OUTPUT_METRICS_KEY = f'{OUTPUT_PREFIX}{START_DAY}-{END_DAY}-labeler-metrics.csv'


START_DAY_ANSWERS_SUMMARY = '07012023'
END_DAY_ANSWERS_SUMMARY = '07042023'
CHANGE_REASON_KEY = f'{OUTPUT_PREFIX}{START_DAY_ANSWERS_SUMMARY}-{END_DAY_ANSWERS_SUMMARY}-change_reason.csv'
OVERALL_TRUE_VALUES_KEY= f'{OUTPUT_PREFIX}{START_DAY_ANSWERS_SUMMARY}-{END_DAY_ANSWERS_SUMMARY}-true_values.csv'


DEFAULT_SEPARATOR = ','
OUTPUT_BUCKET_NAME =  "ibp-textract-prod1-output"
OUTPUT_PREFIX = f'reports/'
EMAIL_DICT_FILE = f'{OUTPUT_PREFIX}emails_dict.json'



In [5]:
#functions to load and save jsons and csv files to bucket / local file system

def load_json(bucket_name, key):
    print(f'Loading from s3://{bucket_name}/{key}')
    try:
        s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
        data = s3_object['Body'].read().decode('utf-8')
        print(f'Finished loading.')
        return json.loads(data)
    
    except Exception as e:
        print(f'Error reading from file {e}')
        return {}
    
def save_json(bucket_name, dictionary, key):
    print(f'Saving to s3://{bucket_name}/{key}')
    try:
        # Convert Dictionary to JSON String
        data_string = json.dumps(dictionary, indent=2, default=str)
        # s3_client.put_object(
        #     Bucket=bucket_name, 
        #     Key=key,
        #     Body=data_string
        # )
        print(f'Finished saving.')
    except Exception as e:
        print(f'Error saving file {e}')

def load_labelers_output_dataframe_from_csv(bucket_name, prefix, dates, filename, separator):
    output_dataframe = pd.DataFrame()
    for date in dates:
        object_key = f'{prefix}{date}/{filename}'
        print(f'Loading labelers\' output from s3://{bucket_name}/{object_key}')
        try:
            s3_object = s3_client.get_object(Bucket=bucket_name, Key=object_key)
            data = s3_object['Body'].read().decode('utf-8')
            dataframe = pd.read_csv(StringIO(data), sep=separator, low_memory=False)
            output_dataframe = pd.concat((output_dataframe, dataframe))
        except Exception as e:
            print(f'Output for {date} not loaded {e}')
    print(f'Finished loading.')
    return output_dataframe
    
def save_dataframe_to_csv(bucket_name, dataframe, key, separator):
    print(f'Saving to s3://{bucket_name}/{key}')
    csv_buffer = StringIO()
    try:
        dataframe.to_csv(csv_buffer, index=False, sep=separator)
        csv_buffer.seek(0)
        # s3_client.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=key)
        csv_buffer.close()
        print(f'Finished saving.')
    except Exception as e:
        print(f'Error saving file {e}')

In [6]:
from datetime import datetime, timedelta

def get_dates_between(start_date, end_date):
    dates = []
    date_format = "%m%d%Y"
    start_datetime = datetime.strptime(start_date, date_format)
    end_datetime = datetime.strptime(end_date, date_format)
    current_datetime = start_datetime

    while current_datetime <= end_datetime:
        dates.append(current_datetime.strftime(date_format))
        current_datetime += timedelta(days=1)

    return tuple(dates)

labelers_output_dates = get_dates_between(LABELERS_OUTPUT_START_DAY, LABELERS_OUTPUT_END_DAY)
dates_summary = get_dates_between(START_DAY_ANSWERS_SUMMARY, END_DAY_ANSWERS_SUMMARY)

In [7]:
# gets user email from cognito userpool with user_pool_id based on user's sub.
def get_user_from_identity_issuer(sub, user_pool_id):
    # Lookup user details
    response = cognito_client.list_users(
        UserPoolId=user_pool_id,
        Filter= 'sub = "' + sub + '"'
    )

    email, given_name, family_name = None, None, None
    # Extract the user's email and name
    for user in response['Users']:
        for attr in user['Attributes']:
            if attr['Name'] == 'email':
                email = attr['Value']
            elif attr['Name'] == 'name' and attr['Value'] and attr['Value'].find('@') != -1: 
                email = attr['Value']
            elif attr['Name'] == 'given_name': 
                given_name = attr['Value']
            elif attr['Name'] == 'family_name': 
                family_name = attr['Value']
 
    return email, (f'{given_name} {family_name}' if given_name or family_name else None)

In [8]:
df = pd.read_csv('outputs/01012023-07212023-labeler-output.csv', low_memory=False)

In [9]:
restrict_1 = df[df['sub'] == '0f5910e7-bca8-4364-bcfa-9eec0253085c']
restrict_2 = df[df['sub'] == 'b149f627-83f8-4f38-ba25-9afcea08b4fb']
restrict_3 = df[df['sub'] == '5d92fd56-4e08-4b36-b30b-87810ad13261']

In [10]:
restrict_1.shape[0]

12443

In [11]:
answers_range = (1, 2, 3, 4, 5, 6, 7)
change_reason_codes = ('A', 'B', 'C', 'D', 'F')
change_reason_codes_chars_set = {'A', 'B', 'C', 'D', 'F', ' ', ',', 'a', 'b', 'c', 'd', 'f', '\n', '\t'}

CHANGE_REASON_COL_NAME = "Change Reason"
TOTAL_COL_NAME = 'Total'


from itertools import chain, combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

# function to detect if an answer uses a particular change reason code 
def has_answer_code(answer, code):
    answer_chars_set = set(str(answer))
    if len(answer_chars_set.difference(change_reason_codes_chars_set)):
        return False
    
    return answer == code or any([letter.upper() == code for letter in answer])

# based on a2i output.json-s files creates change reason statistics (A, B, C, D, F) and saves it to CHANGE_REASON_KEY s3 object (file)
def change_reason(df):
    # df = load_labelers_output_dataframe_from_csv(
    #     OUTPUT_BUCKET_NAME,
    #     LABELERS_DATA_OUTPUT_PREFIX,
    #     dates_summary,
    #     LABELERS_DATA_OUTPUT_FILENAME,
    #     DEFAULT_SEPARATOR,
    # )
    # df
    if not isinstance(df, pd.DataFrame):
        print('No data found.')
        return
    
    print('Generating report.')
    # df['date_key_part'] = df['output_file_s3_key'].apply(lambda val: val[len(LABELERS_OUTPUT_PREFIX):len(LABELERS_OUTPUT_PREFIX) + 8])
    # df = df[df['date_key_part'].isin(dates)]
 
    change_reason_col_names = tuple(f'{CHANGE_REASON_COL_NAME} {answer}' for answer in answers_range)
    change_reason_col_names = tuple(change_reason_col_name 
        for change_reason_col_name in change_reason_col_names if change_reason_col_name in set(df.columns.values)
    )
    for change_reason_code in change_reason_codes:
        df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
            lambda row: any([has_answer_code(row[change_reason_col_name], change_reason_code) for change_reason_col_name in change_reason_col_names]),
            axis=1,
        )
    df[TOTAL_COL_NAME] = True    
    special_cols = [TOTAL_COL_NAME]  + list(change_reason_codes)
    for subset in powerset(change_reason_codes):
        subset_string = ''.join(subset)
        if len(subset_string) > 1:
            special_cols.append(subset_string + '_all')
            df[subset_string + '_all'] = None
            special_cols.append(subset_string + '_any')
            df[subset_string + '_any'] = None
        else:
            continue
    
        df.loc[:, subset_string + '_all'] = df.loc[:, change_reason_codes].apply(
            lambda row: all([row[change_reason_code] for change_reason_code in subset]),
            axis=1,
        )
        df.loc[:, subset_string + '_any'] = df.loc[:, change_reason_codes].apply(
            lambda row: any([row[change_reason_code] for change_reason_code in subset]),
            axis=1,
        )

    
    df_1 = df[special_cols].sum()
    df_1 = df_1.to_frame().T
    df_2 = (df[special_cols].sum() / df_1.loc[0, TOTAL_COL_NAME]) if df_1.loc[0, TOTAL_COL_NAME] > 0 else df[special_cols].sum() 
    df_2 = df_2.apply(lambda val: f'{round(val * 100, 2)} %').to_frame().T
    df = pd.concat((df_1, df_2))
    df_0 = pd.DataFrame.from_dict({'': ['Total Code Occurences', 'Percentage']})
    df = df.reset_index()
    df = df.drop(['index'], axis=1)
    df = pd.concat((df_0, df), axis=1)
    save_dataframe_to_csv(OUTPUT_BUCKET_NAME, df, CHANGE_REASON_COL_NAME, DEFAULT_SEPARATOR)
    return df


In [12]:
pd.concat((change_reason(restrict_1), change_reason(restrict_2), change_reason(restrict_3), change_reason(df)))

Generating report.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_c

Saving to s3://ibp-textract-prod1-output/Change Reason
Finished saving.
Generating report.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_c

Saving to s3://ibp-textract-prod1-output/Change Reason
Finished saving.
Generating report.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_col_names].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, change_reason_code] = df.loc[:, change_reason_c

Saving to s3://ibp-textract-prod1-output/Change Reason
Finished saving.
Generating report.
Saving to s3://ibp-textract-prod1-output/Change Reason
Finished saving.


Unnamed: 0,Unnamed: 1,Total,A,B,C,D,F,AB_all,AB_any,AC_all,...,ABCF_all,ABCF_any,ABDF_all,ABDF_any,ACDF_all,ACDF_any,BCDF_all,BCDF_any,ABCDF_all,ABCDF_any
0,Total Code Occurences,12443,0,0,0,0,1,0,0,0,...,0,1,0,1,0,1,0,1,0,1
1,Percentage,100.0 %,0.0 %,0.0 %,0.0 %,0.0 %,0.01 %,0.0 %,0.0 %,0.0 %,...,0.0 %,0.01 %,0.0 %,0.01 %,0.0 %,0.01 %,0.0 %,0.01 %,0.0 %,0.01 %
0,Total Code Occurences,26642,134,152,209,81,301,130,156,0,...,0,605,0,407,0,592,0,613,0,614
1,Percentage,100.0 %,0.5 %,0.57 %,0.78 %,0.3 %,1.13 %,0.49 %,0.59 %,0.0 %,...,0.0 %,2.27 %,0.0 %,1.53 %,0.0 %,2.22 %,0.0 %,2.3 %,0.0 %,2.3 %
0,Total Code Occurences,9493,202,344,165,205,42,125,421,1,...,0,622,1,564,0,507,0,651,0,723
1,Percentage,100.0 %,2.13 %,3.62 %,1.74 %,2.16 %,0.44 %,1.32 %,4.43 %,0.01 %,...,0.0 %,6.55 %,0.01 %,5.94 %,0.0 %,5.34 %,0.0 %,6.86 %,0.0 %,7.62 %
0,Total Code Occurences,608405,10375,16407,12243,9753,17817,5949,20833,704,...,263,46786,203,43874,40,43715,40,49703,37,53414
1,Percentage,100.0 %,1.71 %,2.7 %,2.01 %,1.6 %,2.93 %,0.98 %,3.42 %,0.12 %,...,0.04 %,7.69 %,0.03 %,7.21 %,0.01 %,7.19 %,0.01 %,8.17 %,0.01 %,8.78 %


In [13]:
change_reason(df).head()

Generating report.
Saving to s3://ibp-textract-prod1-output/Change Reason
Finished saving.


Unnamed: 0,Unnamed: 1,Total,A,B,C,D,F,AB_all,AB_any,AC_all,...,ABCF_all,ABCF_any,ABDF_all,ABDF_any,ACDF_all,ACDF_any,BCDF_all,BCDF_any,ABCDF_all,ABCDF_any
0,Total Code Occurences,608405,10375,16407,12243,9753,17817,5949,20833,704,...,263,46786,203,43874,40,43715,40,49703,37,53414
1,Percentage,100.0 %,1.71 %,2.7 %,2.01 %,1.6 %,2.93 %,0.98 %,3.42 %,0.12 %,...,0.04 %,7.69 %,0.03 %,7.21 %,0.01 %,7.19 %,0.01 %,8.17 %,0.01 %,8.78 %


In [14]:
# counting the total number of documents processed by textract for given dates

def get_total(dates):
    target_bucket_name = 'ibp-textract-prod1-target'
    num_docs = 0
    for date in dates:
        for obj in s3.Bucket(target_bucket_name).objects.filter(Prefix=date):
            if obj.key.endswith('.TIF'):
                num_docs += 1
        print(f'Total number of documents up to {date} is {num_docs}')
    return num_docs

In [15]:
true_values_range = (1, 2, 3, 4, 5, 6, 7)

DESIRED_COL_NAMES = (
    "Sum of Number RNC ADDRESSEE",
    "Sum of Number RNC ADDRESS_LINE_1",
    "Sum of Number RNC ADDRESS_LINE_2",
    "Sum of Number RNC CITY",
    "Sum of Number RNC STATE",
    "Sum of Number RNC ZIP_CODE_4",
    "Sum of Number RNC REGID",
)

TOTAL_NO_OF_DOCUMENTS_COL_NAME = "Sum of Total Number of Documents"

TRUE_VALUE_COL_NAME = "True Value"

col_rename_dict = {
    f'{TRUE_VALUE_COL_NAME} {true_value}': DESIRED_COL_NAMES[i-1]
    for i, true_value in enumerate(true_values_range)
}
col_rename_dict[TOTAL_COL_NAME] = "Sum of Number verified by workers overall"

# def has_been_touched(true_value):
#     return not true_value.isna()

# based on a2i output.json-s files creates true value statistics (that is, the number of touches/checks/corrections performed by labelers)
# and saves it to OVERALL_TRUE_VALUES_KEY s3 object (file)
# we are just checking the entries for being notnull.
def overall_true_value_entries(df):
    print('Generating report.')
    output_dataframe = pd.DataFrame()
    for date in dates_summary:
        # df = load_labelers_output_dataframe_from_csv(
        #     OUTPUT_BUCKET_NAME,
        #     LABELERS_DATA_OUTPUT_PREFIX,
        #     (date,),
        #     LABELERS_DATA_OUTPUT_FILENAME,
        #     DEFAULT_SEPARATOR,
        # )
        if not isinstance(df, pd.DataFrame):
            print('No data found.')
            return

        # df['date_key_part'] = df['output_file_s3_key'].apply(lambda val: val[len(LABELERS_OUTPUT_PREFIX):len(LABELERS_OUTPUT_PREFIX) + 8])
        # df = df[df['date_key_part'].isin(labelers_output_dates)]

        true_value_col_names = (f'{TRUE_VALUE_COL_NAME} {true_value}' for true_value in true_values_range)
        true_value_col_names = tuple(true_value_col_name
            for true_value_col_name in true_value_col_names if true_value_col_name in set(df.columns.values)
        )
        for true_value_col_name in true_value_col_names:
            df.loc[:, true_value_col_name] = df.loc[:, true_value_col_name].notnull()
        df[TOTAL_COL_NAME] = True    
        special_cols = [TOTAL_COL_NAME] + list(true_value_col_names)

        has_no_data = df.shape[0] == 0
        df = df[special_cols].sum()
        df = df.to_frame().T
        if has_no_data:
            df[TOTAL_COL_NAME] = 0
            df[TOTAL_COL_NAME] = df[TOTAL_COL_NAME].astype(int)
        
        df[TOTAL_NO_OF_DOCUMENTS_COL_NAME] = 0 # get_total((date,))
        # df = df.astype(str)
        
        output_dataframe = pd.concat((output_dataframe, df))
        break
        
        
    output_dataframe = output_dataframe.rename(columns=col_rename_dict)
    for col_name in DESIRED_COL_NAMES[::-1]:
        if col_name in set(output_dataframe.columns.values):
            # shift column 'name' to first position
            first_column = output_dataframe.pop(col_name)
            output_dataframe.insert(0, col_name, first_column)


    first_column= output_dataframe.pop(col_rename_dict[TOTAL_COL_NAME])
    output_dataframe.insert(0, col_rename_dict[TOTAL_COL_NAME], first_column)
    first_column = output_dataframe.pop(TOTAL_NO_OF_DOCUMENTS_COL_NAME)
    output_dataframe.insert(0, TOTAL_NO_OF_DOCUMENTS_COL_NAME, first_column)

    output_dataframe = output_dataframe.fillna(0)

    for col in output_dataframe.columns.values:
        if col != 'Sum of Number verified by workers overall':
            output_dataframe[col] = output_dataframe[col] / output_dataframe['Sum of Number verified by workers overall']

    save_dataframe_to_csv(OUTPUT_BUCKET_NAME, output_dataframe, OVERALL_TRUE_VALUES_KEY, DEFAULT_SEPARATOR)
    return output_dataframe 

In [16]:
df = pd.read_csv('outputs/01012023-07212023-labeler-output.csv', low_memory=False)
restrict_1 = df[df['sub'] == '0f5910e7-bca8-4364-bcfa-9eec0253085c']
restrict_2 = df[df['sub'] == 'b149f627-83f8-4f38-ba25-9afcea08b4fb']
restrict_3 = df[df['sub'] == '5d92fd56-4e08-4b36-b30b-87810ad13261']
pd.concat((overall_true_value_entries(restrict_1), overall_true_value_entries(restrict_2), overall_true_value_entries(restrict_3), overall_true_value_entries(df)))

Generating report.
Saving to s3://ibp-textract-prod1-output/reports/07012023-07042023-true_values.csv
Finished saving.
Generating report.
Saving to s3://ibp-textract-prod1-output/reports/07012023-07042023-true_values.csv
Finished saving.
Generating report.
Saving to s3://ibp-textract-prod1-output/reports/07012023-07042023-true_values.csv
Finished saving.
Generating report.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[TOTAL_COL_NAME] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[TOTAL_COL_NAME] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[TOTAL_COL_NAME] = True


Saving to s3://ibp-textract-prod1-output/reports/07012023-07042023-true_values.csv
Finished saving.


Unnamed: 0,Sum of Total Number of Documents,Sum of Number verified by workers overall,Sum of Number RNC ADDRESSEE,Sum of Number RNC ADDRESS_LINE_1,Sum of Number RNC ADDRESS_LINE_2,Sum of Number RNC CITY,Sum of Number RNC STATE,Sum of Number RNC ZIP_CODE_4,Sum of Number RNC REGID
0,0.0,12443,0.021779,0.005385,0.042755,0.030459,0.033111,0.011251,0.038013
0,0.0,26642,0.045867,0.002515,0.064372,0.04643,0.045455,0.138428,0.065123
0,0.0,9493,0.042347,0.012114,0.06173,0.049299,0.044559,0.2288,0.072264
0,0.0,608405,0.040121,0.009254,0.059778,0.037575,0.038538,0.211972,0.075525
