In [None]:
import boto3
import pandas as pd
import re
import json
import pandas as pd
from functools import reduce
from numpy import nan
from io import StringIO

# Show up to 100 characters per column when DataFrames are displayed.
pd.set_option('display.max_colwidth', 100)

# Two S3 handles: the resource API (`s3`) and the low-level client (`s3_client`).
s3 = boto3.resource('s3')
s3_client = boto3.client('s3')

# default_user_pool_id to use if it is missing in the records
default_user_pool_id = 'us-west-2_1j6F4wzAn'

# Cognito identity-provider client (no explicit session is passed here).
cognito_client = boto3.client('cognito-idp')



In [None]:
# ---- S3 locations ----------------------------------------------------------
DEFAULT_SEPARATOR = ','
OUTPUT_BUCKET_NAME = "ibp-textract-prod1-output"
OUTPUT_PREFIX = 'reports/'
LABELERS_OUTPUT_PREFIX = 'a2i/output/'
LABELERS_DATA_OUTPUT_FILENAME = 'labeler-output.csv'
LABELERS_DATA_OUTPUT_PREFIX = OUTPUT_PREFIX + 'labelers_output_data/'
EMAIL_DICT_FILE = OUTPUT_PREFIX + 'emails_dict.json'

# ---- Labelers' output window (MMDDYYYY) ------------------------------------
# the date here will be possibly of a greater range since we are filtering by the labelers submission date
# so need to look into a wider range of dates
LABELERS_OUTPUT_START_DAY = '07102023'
LABELERS_OUTPUT_END_DAY = '07242023'

# ---- Metrics window (MMDDYYYY) and the hour used for its boundaries --------
START_DAY = '07212023'
END_DAY = '07222023'
INTERVAL_HOUR = 23
# Re-arrange MMDDYYYY into 'YYYY-MM-DDT<hour>'.
START_DAY_TIME = '{}-{}-{}T{}'.format(START_DAY[4:8], START_DAY[:2], START_DAY[2:4], INTERVAL_HOUR)
END_DAY_TIME = '{}-{}-{}T{}'.format(END_DAY[4:8], END_DAY[:2], END_DAY[2:4], INTERVAL_HOUR)
OUTPUT_METRICS_KEY = OUTPUT_PREFIX + f'{START_DAY}-{END_DAY}-labeler-metrics.csv'

# ---- Answers-summary window (MMDDYYYY) and its report keys -----------------
START_DAY_ANSWERS_SUMMARY = '07012023'
END_DAY_ANSWERS_SUMMARY = '07042023'
_ANSWERS_SUMMARY_SPAN = f'{START_DAY_ANSWERS_SUMMARY}-{END_DAY_ANSWERS_SUMMARY}'
CHANGE_REASON_KEY = f'{OUTPUT_PREFIX}{_ANSWERS_SUMMARY_SPAN}-change_reason.csv'
OVERALL_TRUE_VALUES_KEY = f'{OUTPUT_PREFIX}{_ANSWERS_SUMMARY_SPAN}-true_values.csv'



In [None]:
#functions to load and save jsons and csv files to bucket / local file system

def load_json(bucket_name, key):
    """Fetch an S3 object and parse it as JSON.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        key: Key of the object inside the bucket.

    Returns:
        The parsed JSON value, or an empty dict if anything goes wrong
        (the error is printed, not raised).
    """
    print(f'Loading from s3://{bucket_name}/{key}')
    try:
        raw_bytes = s3_client.get_object(Bucket=bucket_name, Key=key)['Body'].read()
        text = raw_bytes.decode('utf-8')
        print('Finished loading.')
        parsed = json.loads(text)
    except Exception as error:
        # Best effort: report the problem and fall back to an empty mapping.
        print(f'Error reading from file {error}')
        parsed = {}
    return parsed
    
def save_json(bucket_name, dictionary, key):
    """Serialize `dictionary` to JSON for s3://bucket_name/key.

    The actual upload is currently commented out, so this only performs the
    serialization and prints progress messages. Serialization errors are
    printed, not raised.

    Args:
        bucket_name: Destination bucket name (only used in the log message for now).
        dictionary: Object to serialize; non-JSON values are converted with `str`.
        key: Destination object key (only used in the log message for now).
    """
    print(f'Saving to s3://{bucket_name}/{key}')
    try:
        # Convert Dictionary to JSON String
        serialized = json.dumps(dictionary, default=str, indent=2)
        # s3_client.put_object(
        #     Bucket=bucket_name,
        #     Key=key,
        #     Body=serialized
        # )
    except Exception as error:
        print(f'Error saving file {error}')
    else:
        print('Finished saving.')

def load_labelers_output_dataframe_from_csv(bucket_name, prefix, dates, filename, separator):
    """Load one CSV per date from S3 and stack them into a single DataFrame.

    For every entry of `dates` the object `{prefix}{date}/{filename}` is read
    from `bucket_name`. Dates whose object cannot be read or parsed are
    reported and skipped.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix that precedes the date folder.
        dates: Iterable of date strings used as folder names.
        filename: File name inside each date folder.
        separator: Field separator passed to `pd.read_csv`.

    Returns:
        The row-wise concatenation of all loaded frames (original per-file
        indexes are kept), or an empty DataFrame when nothing was loaded.
    """
    loaded_frames = []
    for date in dates:
        object_key = f'{prefix}{date}/{filename}'
        print(f'Loading labelers\' output from s3://{bucket_name}/{object_key}')
        try:
            s3_object = s3_client.get_object(Bucket=bucket_name, Key=object_key)
            data = s3_object['Body'].read().decode('utf-8')
            loaded_frames.append(pd.read_csv(StringIO(data), sep=separator, low_memory=False))
        except Exception as e:
            # Best effort: a missing or unreadable day must not abort the whole load.
            print(f'Output for {date} not loaded {e}')
    print('Finished loading.')
    # Concatenate once instead of re-copying the accumulated frame on every iteration.
    return pd.concat(loaded_frames) if loaded_frames else pd.DataFrame()
    
def save_dataframe_to_csv(bucket_name, dataframe, key, separator):
    """Render `dataframe` as CSV (no index) for s3://bucket_name/key.

    The upload itself is currently commented out, so only the CSV rendering
    happens and progress messages are printed. Errors are printed, not raised.

    Args:
        bucket_name: Destination bucket name (only used in the log message for now).
        dataframe: DataFrame to serialize.
        key: Destination object key (only used in the log message for now).
        separator: Field separator for the CSV output.
    """
    print(f'Saving to s3://{bucket_name}/{key}')
    try:
        csv_text = dataframe.to_csv(index=False, sep=separator)
        # s3_client.put_object(Body=csv_text, Bucket=bucket_name, Key=key)
    except Exception as error:
        print(f'Error saving file {error}')
    else:
        print('Finished saving.')

In [None]:
from datetime import datetime, timedelta

def get_dates_between(start_date, end_date):
    """List every calendar day from `start_date` to `end_date`, inclusive.

    Args:
        start_date: First day, formatted as MMDDYYYY.
        end_date: Last day, formatted as MMDDYYYY.

    Returns:
        A tuple of MMDDYYYY strings in ascending order; empty when
        `start_date` is later than `end_date`.
    """
    date_format = "%m%d%Y"
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    day_count = (last_day - first_day).days + 1
    return tuple(
        (first_day + timedelta(days=offset)).strftime(date_format)
        for offset in range(day_count)
    )

# Expand the configured MMDDYYYY start/end days into tuples containing every day of each range.
labelers_output_dates = get_dates_between(LABELERS_OUTPUT_START_DAY, LABELERS_OUTPUT_END_DAY)
dates_summary = get_dates_between(START_DAY_ANSWERS_SUMMARY, END_DAY_ANSWERS_SUMMARY)

In [None]:
# gets user email from cognito userpool with user_pool_id based on user's sub.
def get_user_from_identity_issuer(sub, user_pool_id):
    """Look up a user's email and full name in a Cognito user pool by `sub`.

    Args:
        sub: The user's `sub` identifier (string).
        user_pool_id: Id of the Cognito user pool to search.

    Returns:
        A tuple `(email, full_name)`. `email` is taken from the `email`
        attribute, or from the `name` attribute when that value contains an
        '@'. `full_name` is built from whichever of `given_name` /
        `family_name` are present. Either element is None when not found.
    """
    # Lookup user details
    response = cognito_client.list_users(
        UserPoolId=user_pool_id,
        Filter='sub = "' + sub + '"'
    )

    email, given_name, family_name = None, None, None
    # Extract the user's email and name
    for user in response['Users']:
        for attr in user['Attributes']:
            attr_name, attr_value = attr['Name'], attr['Value']
            if attr_name == 'email':
                email = attr_value
            elif attr_name == 'name' and attr_value and attr_value.find('@') != -1:
                # Some records carry the email address in the `name` attribute.
                email = attr_value
            elif attr_name == 'given_name':
                given_name = attr_value
            elif attr_name == 'family_name':
                family_name = attr_value

    # Join only the parts that exist, so a missing part never shows up as the text "None".
    name_parts = [part for part in (given_name, family_name) if part]
    return email, (' '.join(name_parts) if name_parts else None)

In [None]:
# Load the labelers' output from a local CSV export.
df = pd.read_csv('outputs/01012023-07212023-labeler-output.csv', low_memory=False)



# MMDDYYYY strings for every day from 07/15/2023 to 07/21/2023, inclusive.
dates = get_dates_between("07152023", "07212023")


# Preview the first rows.
df.head()

In [None]:
# The 8 characters that follow the labelers' output prefix in the S3 key are
# taken as the folder date (compared below against the MMDDYYYY strings in `dates`).
df['prefix_date'] = df['output_file_s3_key'].str[
    len(LABELERS_OUTPUT_PREFIX):len(LABELERS_OUTPUT_PREFIX) + 8
]

# Keep only rows from the selected dates. `.copy()` gives an independent frame,
# so columns added in later cells do not trigger SettingWithCopyWarning.
df = df[df['prefix_date'].isin(dates)].copy()

In [None]:
def get_submission_time_delta(row):
    """Return the whole hours between a row's prefix date and its submission time.

    Args:
        row: Mapping-like object with
            'prefix_date' - folder date as MMDDYYYY (treated as midnight), and
            'submissionTime' - timestamp as 'YYYY-MM-DDTHH:MM:SSZ', optionally
            with fractional seconds ('YYYY-MM-DDTHH:MM:SS.ffffffZ').

    Returns:
        Number of hours (int, rounded down) from the prefix date to the
        submission time.

    Raises:
        ValueError: If either value does not match the expected formats.
    """
    # Use the year contained in the prefix itself instead of assuming 2023.
    start_date = datetime.strptime(row['prefix_date'], "%m%d%Y")
    submission_time = row['submissionTime']
    try:
        submission_date = datetime.strptime(submission_time, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        # Some timestamps carry fractional seconds.
        submission_date = datetime.strptime(submission_time, "%Y-%m-%dT%H:%M:%S.%fZ")
    submission_delta = submission_date - start_date
    return submission_delta.days * 24 + submission_delta.seconds // 3600

# Apply get_submission_time_delta to every row (only the two columns it reads are passed).
df['time_diff'] = df.loc[:, ('prefix_date', 'submissionTime')].apply(get_submission_time_delta, axis=1)

In [None]:

# Preview the first computed hour differences.
df['time_diff'].head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Histogram of the hour differences using 48 bins.
fig, ax = plt.subplots(figsize=(15,5))
df['time_diff'].hist(bins=48)
# One tick per integer from 0 to 46.
ax.set_xticks(np.arange(0, 47, 1))
# plt.xticks(rotation=45)
# Cell output: number of rows whose time_diff is greater than 1.
df[df['time_diff'] > 1].shape[0]


In [None]:
# Subsets of `df` restricted to three specific `sub` values.
restrict_1 = df[df['sub'] == '0f5910e7-bca8-4364-bcfa-9eec0253085c']
restrict_2 = df[df['sub'] == 'b149f627-83f8-4f38-ba25-9afcea08b4fb']
restrict_3 = df[df['sub'] == '5d92fd56-4e08-4b36-b30b-87810ad13261']

In [None]:
# Number of rows in the first subset.
restrict_1.shape[0]

In [66]:
# Numeric suffixes of the "Change Reason N" columns looked up by change_reason.
answers_range = (1, 2, 3, 4, 5, 6, 7)
# Change reason codes that are counted.
change_reason_codes = ('A', 'B', 'C', 'D', 'F')
# Characters an answer may contain to be treated as a list of codes:
# the codes in either case plus space, comma, newline and tab.
change_reason_codes_chars_set = {'A', 'B', 'C', 'D', 'F', ' ', ',', 'a', 'b', 'c', 'd', 'f', '\n', '\t'}

# Column-name prefix of the change reason columns.
CHANGE_REASON_COL_NAME = "Change Reason"
# Name of the helper column that is True for every row (its sum is the row count).
TOTAL_COL_NAME = 'Total'


from itertools import chain, combinations

def powerset(iterable):
    """Return an iterator over all subsets of `iterable`, smallest first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)

# function to detect if an answer uses a particular change reason code
def has_answer_code(answer, code):
    """Tell whether `answer` mentions the change reason `code`.

    An answer only counts when its text consists solely of characters from
    `change_reason_codes_chars_set`; anything else yields False.

    Args:
        answer: Cell value to inspect.
        code: Change reason code to look for (compared against upper-cased letters).

    Returns:
        True if the answer equals `code` or one of its characters,
        upper-cased, equals `code`; False otherwise.
    """
    unexpected_chars = set(str(answer)) - change_reason_codes_chars_set
    if unexpected_chars:
        return False
    if answer == code:
        return True
    return any(letter.upper() == code for letter in answer)

# based on a2i output.json-s files creates change reason statistics (A, B, C, D, F) and saves it to CHANGE_REASON_KEY s3 object (file)
def change_reason(name, df):
    """Build a one-row report with the share of rows that use each change reason code.

    A row counts for a code when any of its "Change Reason N" columns
    (N in `answers_range`, only those present in `df`) contains that code
    according to `has_answer_code`.

    Args:
        name: Label inserted into the first cell of the report row.
        df: Labelers' output DataFrame. It is not modified.

    Returns:
        A single-row DataFrame whose first (unnamed) column holds the label and
        whose remaining columns ('Total' plus one per code) hold percentages
        formatted as '<value> %'. Returns None (after printing a message) when
        `df` is not a DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        print('No data found.')
        return

    print('Generating report.')
    # Work on a copy so the caller's frame does not gain the helper columns.
    df = df.copy()

    available_columns = set(df.columns.values)
    change_reason_col_names = [
        col_name
        for col_name in (f'{CHANGE_REASON_COL_NAME} {answer}' for answer in answers_range)
        if col_name in available_columns
    ]

    # One boolean column per code: True when any change reason column of the row uses it.
    for change_reason_code in change_reason_codes:
        df[change_reason_code] = df[change_reason_col_names].apply(
            lambda row: any(has_answer_code(value, change_reason_code) for value in row),
            axis=1,
        )
    df[TOTAL_COL_NAME] = True
    special_cols = [TOTAL_COL_NAME] + list(change_reason_codes)

    counts = df[special_cols].sum()
    total = counts[TOTAL_COL_NAME]
    # With no rows there is nothing to divide by; the (zero) counts are reported as they are.
    shares = counts / total if total > 0 else counts
    percentages = shares.apply(lambda val: f'{round(val * 100, 2)} %').to_frame().T
    percentages = percentages.reset_index(drop=True)

    label = pd.DataFrame.from_dict({'': [f'Total Code Occurences for {name}']})
    report = pd.concat((label, percentages), axis=1)
    # Save under the report key (previously the column-name constant was passed as the key).
    save_dataframe_to_csv(OUTPUT_BUCKET_NAME, report, CHANGE_REASON_KEY, DEFAULT_SEPARATOR)
    return report


In [None]:
# change_reason takes (name, df): pass a label for each subset.
# Copies are passed so the reports never write into `df` or its slices.
pd.concat((
    change_reason('restrict_1', restrict_1.copy()),
    change_reason('restrict_2', restrict_2.copy()),
    change_reason('restrict_3', restrict_3.copy()),
    change_reason('Everybody', df.copy()),
))

In [None]:
# change_reason takes (name, df); a copy is passed so `df` itself is left untouched.
change_reason('Everybody', df.copy()).head()

In [None]:
# counting the total number of documents processed by textract for given dates

def get_total(dates):
    """Count the '.TIF' objects stored under each date prefix of the target bucket.

    Args:
        dates: Iterable of key prefixes (one per date) to scan.

    Returns:
        Total number of keys ending in '.TIF' across all given prefixes.
        A running total is printed after each date.
    """
    target_bucket_name = 'ibp-textract-prod1-target'
    num_docs = 0
    for date in dates:
        objects_for_date = s3.Bucket(target_bucket_name).objects.filter(Prefix=date)
        num_docs += sum(1 for obj in objects_for_date if obj.key.endswith('.TIF'))
        print(f'Total number of documents up to {date} is {num_docs}')
    return num_docs

In [None]:
# Numeric suffixes of the "True Value N" columns looked up in the labelers' output.
true_values_range = (1, 2, 3, 4, 5, 6, 7)

# Report column names, in the order they appear in the report.
DESIRED_COL_NAMES = (
    "Sum of Number RNC ADDRESSEE",
    "Sum of Number RNC ADDRESS_LINE_1",
    "Sum of Number RNC ADDRESS_LINE_2",
    "Sum of Number RNC CITY",
    "Sum of Number RNC STATE",
    "Sum of Number RNC ZIP_CODE_4",
    "Sum of Number RNC REGID",
)

# Name of the report column that holds the row label.
TOTAL_NO_OF_DOCUMENTS_COL_NAME = "Name"

# Column-name prefix of the true value columns.
TRUE_VALUE_COL_NAME = "True Value"

# Maps "True Value N" to its report column name.
# NOTE(review): enumerate starts at 0, so `i-1` is -1 for the first entry and wraps
# around: "True Value 1" maps to the LAST name (REGID), "True Value 2" to ADDRESSEE,
# and so on. Confirm this rotation is intended.
col_rename_dict = {
    f'{TRUE_VALUE_COL_NAME} {true_value}': DESIRED_COL_NAMES[i-1]
    for i, true_value in enumerate(true_values_range)
}
# Must match the 'Sum of Number verified by workers overall' label that
# overall_true_value_entries reads back from the report.
col_rename_dict[TOTAL_COL_NAME] = 'Sum of Number verified by workers overall'

# def has_been_touched(true_value):
#     return not true_value.isna()

# based on a2i output.json-s files creates true value statistics (that is, the number of touches/checks/corrections performed by labelers)
# and saves it to OVERALL_TRUE_VALUES_KEY s3 object (file)
# we are just checking the entries for being notnull.
def overall_true_value_entries(name, df):
    """Build a one-row report of how often each "True Value N" column is filled in.

    For every "True Value N" column (N in `true_values_range`, only those
    present in `df`) the share of rows with a non-null entry is computed.

    Args:
        name: Label stored in the `TOTAL_NO_OF_DOCUMENTS_COL_NAME` column.
        df: Labelers' output DataFrame. It is not modified.

    Returns:
        A single-row DataFrame with, in order: the label column, the total
        row count (under `col_rename_dict[TOTAL_COL_NAME]`), and one column per
        entry of `DESIRED_COL_NAMES` that has data, formatted as '<value> %'.
        For an empty `df` every share is reported as '0.0 %'.
        Returns None (after printing a message) when `df` is not a DataFrame.
    """
    print('Generating report.')
    if not isinstance(df, pd.DataFrame):
        print('No data found.')
        return

    # Single source of truth for the total column label.
    total_col_name = col_rename_dict[TOTAL_COL_NAME]

    available_columns = set(df.columns.values)
    true_value_col_names = [
        col_name
        for col_name in (f'{TRUE_VALUE_COL_NAME} {true_value}' for true_value in true_values_range)
        if col_name in available_columns
    ]

    # Count the non-null entries on a derived frame; the caller's columns stay untouched.
    total = int(df.shape[0])
    touched = df[true_value_col_names].notnull().sum().rename(index=col_rename_dict)

    record = {TOTAL_NO_OF_DOCUMENTS_COL_NAME: name, total_col_name: total}
    for col_name in DESIRED_COL_NAMES:
        if col_name in touched.index:
            # Avoid dividing by zero when there are no rows at all.
            share = float(touched[col_name]) / total if total else 0.0
            record[col_name] = f'{round(share * 100, 2)} %'

    output_dataframe = pd.DataFrame([record])
    save_dataframe_to_csv(OUTPUT_BUCKET_NAME, output_dataframe, OVERALL_TRUE_VALUES_KEY, DEFAULT_SEPARATOR)
    return output_dataframe

In [None]:
# Reload the full labelers' output and rebuild the three per-`sub` subsets.
df = pd.read_csv('outputs/01012023-07212023-labeler-output.csv', low_memory=False)
restrict_1 = df[df['sub'] == '0f5910e7-bca8-4364-bcfa-9eec0253085c']
restrict_2 = df[df['sub'] == 'b149f627-83f8-4f38-ba25-9afcea08b4fb']
restrict_3 = df[df['sub'] == '5d92fd56-4e08-4b36-b30b-87810ad13261']
# overall_true_value_entries takes (name, df): pass a label for each subset.
# Copies are passed so the reports never write into `df` or its slices.
pd.concat((
    overall_true_value_entries('restrict_1', restrict_1.copy()),
    overall_true_value_entries('restrict_2', restrict_2.copy()),
    overall_true_value_entries('restrict_3', restrict_3.copy()),
    overall_true_value_entries('Everybody', df.copy()),
))

In [65]:
import warnings
warnings.filterwarnings('ignore')
from numpy import nan

# Full labelers' output.
df = pd.read_csv('outputs/01012023-07212023-labeler-output.csv', low_memory=False)

# Per-labeler metrics; provides the `sub` -> `name` pairs.
sub_cvs = pd.read_csv('outputs/01012023-07212023-labeler-metrics.csv')
subs = sub_cvs['sub'].to_list()

# Map each sub to its name, skipping rows whose sub reads as 'nan' (i.e. is missing).
has_sub = sub_cvs['sub'].astype(str) != 'nan'
name = dict(zip(sub_cvs.loc[has_sub, 'sub'], sub_cvs.loc[has_sub, 'name']))




In [None]:
# print(subs)
# One report row for everybody, then one per labeler; concatenated once at the end.
per_labeler_reports = [overall_true_value_entries('Everybody', df.copy())]
for sub in name:
    if sub != 'nan':
        restriction_to_sub = df[df['sub'] == sub].copy()
        per_labeler_reports.append(overall_true_value_entries(name[sub], restriction_to_sub))
df_out = pd.concat(per_labeler_reports)

# Every report row has index 0, so the index carries no information and is not written.
# NOTE(review): the file name says 2021 while the input file says 2023 - confirm the intended name.
df_out.to_csv('06012023-07212021-corrections-per-labeler.csv', index=False)

# Last expression, so the preview is actually displayed.
df_out.head(30)

In [None]:
# Write the current `df_out` to a local CSV without the index column.
df_out.to_csv('06012023-07212021-corrections-per-labeler.csv', index=False)

In [71]:
# Rebuild `df_out` as the change reason report before saving it. Without this step a
# top-to-bottom run would write whatever an earlier cell left in `df_out` to this file.
code_reports = [change_reason('Everybody', df.copy())]
for sub in name:
    if sub != 'nan':
        restriction_to_sub = df[df['sub'] == sub].copy()
        code_reports.append(change_reason(name[sub], restriction_to_sub))
df_out = pd.concat(code_reports)

# NOTE(review): the file name says 2021 while the input file says 2023 - confirm the intended name.
df_out.to_csv('06012023-07212021-codes-for-textract-per-labeler.csv', index=False)

# Last expression, so the preview is actually displayed.
df_out.head(30)

In [68]:
# Preview the first 20 rows of the current report.
df_out.head(20)

Unnamed: 0,Unnamed: 1,Total,A,B,C,D,F
0,Total Code Occurences for Everybody,100.0 %,1.71 %,2.7 %,2.01 %,1.6 %,2.93 %
0,Total Code Occurences for Ashmini Bissoondyal,100.0 %,0.0 %,0.0 %,0.0 %,0.0 %,0.01 %
0,Total Code Occurences for Brian Graffs,100.0 %,0.5 %,0.57 %,0.78 %,0.3 %,1.13 %
0,Total Code Occurences for Jessika Stone,100.0 %,0.0 %,0.0 %,0.1 %,0.15 %,0.0 %
0,Total Code Occurences for Shamara Nichouls,100.0 %,1.76 %,1.75 %,2.06 %,2.07 %,0.8 %
0,Total Code Occurences for James Slaughter,100.0 %,0.55 %,0.65 %,1.04 %,2.21 %,0.16 %
0,Total Code Occurences for Cassandra Horton,100.0 %,0.36 %,0.12 %,0.17 %,0.38 %,0.85 %
0,Total Code Occurences for Rayne Miller,100.0 %,2.26 %,5.89 %,2.03 %,0.42 %,7.22 %
0,Total Code Occurences for Shannon Ansberry,100.0 %,0.75 %,2.23 %,0.6 %,1.19 %,0.97 %
0,Total Code Occurences for Gavin Weiss,100.0 %,0.85 %,6.69 %,0.98 %,0.25 %,0.95 %


In [72]:
# Display the mapping from raw column names to report column names.
col_rename_dict

{'True Value 1': 'Sum of Number RNC REGID',
 'True Value 2': 'Sum of Number RNC ADDRESSEE',
 'True Value 3': 'Sum of Number RNC ADDRESS_LINE_1',
 'True Value 4': 'Sum of Number RNC ADDRESS_LINE_2',
 'True Value 5': 'Sum of Number RNC CITY',
 'True Value 6': 'Sum of Number RNC STATE',
 'True Value 7': 'Sum of Number RNC ZIP_CODE_4',
 'Total': 'Sum of Number verified by workers overall'}