In [1]:
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Union  # From mypy library
import datetime

import numpy as np
import pandas as pd
import requests
import xmltodict  # type: ignore
from dotenv import find_dotenv, load_dotenv
from jinja2 import Template
from PIL import Image, ImageSequence  # type: ignore
from fuzzywuzzy import fuzz

from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype, is_integer_dtype, is_object_dtype, is_string_dtype

## Set up test environment

In [3]:
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Xtracta credentials and workflow / database ids; each is None when the variable is unset.
api_key = os.getenv('XTRACTA_API_KEY')
database_id = os.getenv('XTRACTA_DATABASE_ID')
header_workflow = os.getenv('XTRACTA_HEADER_ID')
line_workflow = os.getenv('XTRACTA_LINE_ID')
# NOTE(review): later test cells use test_ip, test_file, test_sp, test_lp, test_op and
# test_jp, none of which is defined in the cells visible here - TODO define them.


## Functions for interacting with Xtracta's API

### Upload file

Uploads a PDF or image file for extraction. The classifier field is used if you want to assign a specific classifier to the document rather than letting Xtracta make its own classification decision.

In [33]:
def upload_file(api_key, workflow_id, filename, field_data=""):
    """Upload a PDF or image file to an Xtracta workflow for extraction.

    Parameters
    ----------
    api_key:
        Xtracta API key.
    workflow_id:
        Id of the workflow that should receive the document.
    filename:
        Path of the file to upload; it is opened in binary mode.
    field_data:
        Value sent as the ``field_data`` form field (empty string by default).

    Returns
    -------
    The ``document_id`` taken from the XML response when the HTTP status is
    200. For any other status the status code is printed and the raw response
    text is returned instead.
    """
    upload_url = "https://api-app.xtracta.com/v1/documents/upload"
    data = {
        "api_key": api_key,
        "workflow_id": workflow_id,
        "field_data": field_data,
    }
    # The context manager closes the file handle even if the request raises;
    # previously the handle was opened and never closed.
    with open(filename, mode="rb") as userfile:
        r = requests.post(upload_url, data=data, files={"userfile": userfile})
    if r.status_code != 200:
        print(r.status_code)
        return r.text
    response = xmltodict.parse(r.text)
    return response["xml"]["document_id"]

In [9]:
# The upload call is left commented out; a fixed, previously uploaded document id is used instead.
# test_document_id = upload_file(api_key, header_workflow, test_ip / test_file)
test_document_id = 100705962


In [5]:
def get_document(api_key: str, document_id: str):
    """Fetch one document from Xtracta and return its XML parsed into a dict.

    If the request or the XML parsing raises, the exception's ``args`` tuple
    is returned instead of a dict, so callers should check the result type.
    """
    endpoint = "https://api-app.xtracta.com/v1/documents"
    payload = {"api_key": api_key, "document_id": document_id}
    try:
        reply = requests.post(endpoint, data=payload)
        return xmltodict.parse(reply.text)
    except Exception as error:
        return error.args

In [86]:
# Fetch the test document and display the document id from the parsed response.
test_document = get_document(api_key, test_document_id)
test_document["documents_response"]["document"]['document_id']

'100705962'

In [12]:
def get_xtracta_status(
    api_key: str,
    workflow_id: str,
    status: str,
    api_download_status: str = "active",
    detailed: int = 0,
    documents_order: str = "asc",
) -> list:
    """Return the Xtracta documents in a workflow that have a given status.

    Parameters
    ----------
    api_key, workflow_id:
        Credentials and workflow to query.
    status:
        Sent as ``document_status``.
    api_download_status, detailed, documents_order:
        Passed through unchanged to the API; up to 1000 items are requested.

    Returns
    -------
    list
        * the document dicts from the response (a single document is wrapped
          in a one-item list);
        * ``["No <status> documents in queue"]`` when the response has no
          ``document`` entry;
        * a one-item list holding the error text when the request or XML
          parsing fails, or the exception object when the response has an
          unexpected shape.
    """
    documents_url = "https://api-app.xtracta.com/v1/documents"
    data = {
        "api_key": api_key,
        "workflow_id": workflow_id,
        "document_status": status,
        "api_download_status": api_download_status,
        "items_per_page": 1000,
        "detailed": detailed,
        "documents_order": documents_order,
    }
    try:
        r = requests.post(documents_url, data=data)
        response = xmltodict.parse(r.text)
    except Exception as e:
        # Return the message text; ``e.__str__`` (without a call) returned the
        # bound method object rather than the message.
        return [str(e)]

    try:
        response_content = response["documents_response"]["document"]
    except KeyError:
        # No "document" element in the response means the queue is empty.
        return [f"No {status} documents in queue"]
    except Exception as e:
        return [e]
    if isinstance(response_content, list):
        return response_content
    return [response_content]

In [88]:
# List the header-workflow documents in 'output' status and display the first entry.
test_output_list = get_xtracta_status(api_key, header_workflow, 'output')
test_output_list[0]

OrderedDict([('@revision', '3'),
             ('document_id', '100767031'),
             ('document_status', 'output'),
             ('number_of_pages', '3'),
             ('api_download_status', 'active'),
             ('free_form', None),
             ('classification', None),
             ('classification_class', 'awaiting classification'),
             ('classification_design', 'undetected')])

## Build the output dictionary from Xtracta data

In [51]:
def create_output(document: Dict[Any, Any]) -> Dict[Any, Any]:
    """Build the output dict for a parsed Xtracta document.

    The result carries ``document_id``, ``status`` and ``version`` at the top
    level and the flattened header fields under ``header``. Documents whose
    status is ``qa``, ``reject`` or ``output`` are additionally passed through
    ``build_out_output``.
    """
    doc = document["documents_response"]["document"]
    header = transform_dict(doc["field_data"]["field"])
    output = {
        "document_id": doc["document_id"],
        "status": doc["document_status"],
        "version": doc["@revision"],
        "header": header,
    }
    if output["status"] in ("qa", "reject", "output"):
        return build_out_output(document, output)
    return output


def transform_dict(start_dict):
    """Flatten Xtracta field records into a ``{field_name: field_value}`` dict.

    Parameters
    ----------
    start_dict:
        A list of field records, each with ``field_name`` and ``field_value``
        keys, or a single such record given as a dict.

    Returns
    -------
    dict
        One entry per record; later records overwrite earlier ones that share
        a ``field_name``.
    """
    # A lone record arrives as a dict rather than a one-item list. Iterating it
    # would walk its keys and fail, so wrap it first (the same normalisation
    # get_image_urls applies to image URLs).
    if isinstance(start_dict, dict):
        start_dict = [start_dict]
    return {item["field_name"]: item["field_value"] for item in start_dict}

def build_out_output(document, output):
    """Add file naming, email date and URL details to an output dict.

    Parameters
    ----------
    document:
        Parsed Xtracta document; ``document_url`` and ``image_url`` are read
        from ``document['documents_response']['document']``.
    output:
        Dict with a ``header`` section containing at least ``filename``,
        ``supplier_id`` and ``invoice_number``. It is modified in place.

    Returns
    -------
    dict
        The same ``output`` object with ``stem``, ``new_filename``,
        ``document_url`` and ``image_urls`` set, and ``header['email_date']``
        filled with today's date (YYYY-MM-DD) when it was missing.

    Raises
    ------
    KeyError
        If a required header or document key is absent.
    """
    header = output['header']
    filename = header['filename']
    try:
        # Text before the first dot.
        output['stem'] = filename.split('.')[0]
    except AttributeError:
        # filename is not a string (for example None): keep it unchanged.
        output['stem'] = filename
    output['new_filename'] = f"{header['supplier_id']}-{header['invoice_number']}"
    if 'email_date' not in header:
        header['email_date'] = datetime.datetime.today().strftime('%Y-%m-%d')
    doc = document['documents_response']['document']
    output['document_url'] = doc['document_url']
    output['image_urls'] = get_image_urls(doc['image_url'])
    return output

def get_image_urls(image_urls):
    """Return ``image_urls`` as a list, wrapping any non-list value in a one-item list."""
    return image_urls if type(image_urls) == list else [image_urls]

In [52]:
# Re-fetch the test document, build its output dict and display it.
test_document = get_document(api_key, test_document_id)
test_output = create_output(test_document)
test_output

{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None,
  'email_date': '2019-02-01'},
 'stem': '20190124RelayMonitoring',
 'new_filename': '009098-42504',
 'document_url': 'https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/fe/92/iq205042603-6WMH3CAqF.pdf',
 'image_urls': ['https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/15/c9/ds100705962-iq205042603-6WMH3CAqF4900-1-800.jpg']}

In [20]:
def open_document_ui(api_key: str, document_id: str) -> str:
    """Request an Xtracta UI link for fixing and training one document.

    The request asks for the ``output`` and ``archive`` buttons, with
    ``no_lockout`` set to 1 and ``expire`` set to 86400. Returns the ``url``
    value from the parsed response.
    """
    endpoint = "https://api-app.xtracta.com/v1/documents/ui"
    payload = dict(
        api_key=api_key,
        document_id=int(document_id),
        buttons="output,archive",
        no_lockout=1,
        expire=86400,
    )
    reply = requests.post(endpoint, data=payload)
    return xmltodict.parse(reply.text)["documents_response"]["url"]

In [21]:
# Display the output dict, then a review-UI link when the document is in
# 'reject' or 'output' status.
test_document = get_document(api_key, test_document_id)
test_output = create_output(test_document)
display(test_output)
if test_output['status'] in ['reject', 'output']:
    display(open_document_ui(api_key, test_document_id))

{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None}}

'https://app.mybusinessautomated.net/main/an_entry/index/d/H9j9r_rBSStOZ.n45pDHCHNZ.KiLlSuA7tHYaLSoGcmMYhVYBU0aYbwXQc0vkrRVJuG1TZDbJMr7va29hDN9Xg--'

In [24]:
def open_dashboard_ui(api_key: str, workflow_id: str) -> str:
    """Request an Xtracta UI link for a whole workflow and return its URL.

    Parameters
    ----------
    api_key:
        Xtracta API key.
    workflow_id:
        Workflow to open; converted with ``int()`` before sending.

    Returns
    -------
    str
        The ``url`` value from the parsed response.

    Raises
    ------
    KeyError
        If the response contains no ``url`` entry.
    """
    # NOTE(review): every other endpoint in this notebook is on
    # api-app.xtracta.com, and the recorded run of this cell ended in
    # KeyError: 'url' - confirm that this host is correct.
    documents_url = "https://app.mybusinessautomated.net/v1/documents/ui"
    data = {
        "api_key": api_key,
        # Use the argument; the global ``line_workflow`` was read here before,
        # so the caller's workflow was silently ignored.
        "workflow_id": int(workflow_id),
        "buttons": "output,archive",
        "no_lockout": 1,
        "expire": 86400,
    }
    r = requests.post(documents_url, data=data)
    response = xmltodict.parse(r.text)
    return response["documents_response"]["url"]

In [25]:
# NOTE(review): the recorded output of this cell is KeyError: 'url'.
open_dashboard_ui(api_key, header_workflow)

KeyError: 'url'

In [27]:
def update_document(
    api_key: str, document_id: str, delete: int = 0, api_download_status: str = "active"
) -> Dict[str, str]:
    """Post an update for one document to Xtracta.

    Sends the ``delete`` flag and ``api_download_status`` for ``document_id``
    and returns the ``documents_response`` section of the parsed reply.
    """
    endpoint = "https://api-app.xtracta.com/v1/documents/update"
    payload = dict(
        api_key=api_key,
        document_id=int(document_id),
        delete=delete,
        api_download_status=api_download_status,
    )
    reply = requests.post(endpoint, data=payload)
    return xmltodict.parse(reply.text)["documents_response"]

In [28]:
# Set the test document's api_download_status to 'active' and display the API reply.
update_document(api_key, test_document_id, api_download_status='active')

OrderedDict([('status', '200'),
             ('message', 'The request has been successfully processed')])

In [58]:
def get_lines(document):
    """Return the invoice lines of a parsed Xtracta document.

    Rows are read from ``field_data['field_set']['row']``; when that path is
    missing, ``field_data['field']`` is used instead. Each row's ``field``
    records are flattened with ``transform_dict`` and the row is kept only
    when its ``line_net`` value is truthy.

    Parameters
    ----------
    document:
        Parsed document as returned by ``get_document``.

    Returns
    -------
    list of dict
        One flattened dict per kept line.

    Raises
    ------
    KeyError
        If neither path exists, or a row has no ``field`` / ``line_net``.
    """
    field_data = document['documents_response']['document']['field_data']
    try:
        lines_dict = field_data['field_set']['row']
    except (KeyError, TypeError):
        # No field_set (or it is empty/None): fall back to the plain field list.
        lines_dict = field_data['field']
    # A single row arrives as a dict, not a one-item list; iterating it would
    # walk its keys and fail on line_dict['field'].
    if isinstance(lines_dict, dict):
        lines_dict = [lines_dict]
    lines = []
    for line_dict in lines_dict:
        line = transform_dict(line_dict['field'])
        if line['line_net']:
            lines.append(line)
    return lines

In [59]:
# The upload call is left commented out; a fixed line-workflow document id is used instead.
# test_line_document_id = upload_file(api_key, line_workflow, test_ip / test_file)
test_line_document_id = 100199853
test_line_document = get_document(api_key, test_line_document_id)
# print(test_line_document['documents_response']['document']['field_data']['field'][0])
# Extract the invoice lines and display them.
test_lines = get_lines(test_line_document)
test_lines

[{'po_item': '3602406',
  'description': 'DC-DC CONVERTER PRIMARY SWITCHED 48-60V-VP DC 24V-0/P DC 1A',
  'qty': '1',
  'line_net': '211.41',
  'po_line_number': None}]

## Build output once in output status

In [35]:
# Apply build_out_output explicitly to the create_output result. For 'qa', 'reject' and
# 'output' documents create_output already calls it, so this runs it a second time.
test_document = get_document(api_key, test_document_id)
test_output = create_output(test_document)
full_test_output = build_out_output(test_document, test_output)
full_test_output

{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None,
  'emaildate': '2019-01-24'},
 'stem': '20190124RelayMonitoring',
 'new_filename': '009098-42504',
 'document_url': 'https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/fe/92/iq205042603-6WMH3CAqF.pdf',
 'image_urls': ['https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/15/c9/ds100705962-iq205042603-6WMH3CAqF4900-1-800.jpg']}

## Pull company name and location from the email subject

In [36]:
def add_company_location(output):
    """Set ``company`` and ``location`` on ``output`` from the email subject.

    The subject (``output['header']['email_subject']``) must contain a tag of
    the form ``[<<COMPANY-LOCATION>>]``. When the tag text has exactly one
    ``-`` the two halves become ``company`` and ``location``; otherwise the
    whole tag text becomes ``company`` and ``location`` is ``'NA'``.

    Parameters
    ----------
    output:
        Output dict with a ``header`` section; modified in place.

    Returns
    -------
    dict
        The same ``output`` object.

    Raises
    ------
    KeyError
        If the header has no ``email_subject``.
    TypeError
        If the subject is not a string or contains no ``[<<...>>]`` tag.
    """
    subject = output['header']['email_subject']
    tag = re.match(r'.*\[<<(.*)>>\].*', subject)
    if tag is None:
        # A missing tag already failed with TypeError ("'NoneType' object is
        # not subscriptable"); keep the type and say what is wrong.
        raise TypeError(f"email_subject has no [<<company-location>>] tag: {subject!r}")
    company_location = tag[1]
    parts = company_location.split('-')
    if len(parts) == 2:
        output['company'], output['location'] = parts
    else:
        output['company'] = company_location
        output['location'] = 'NA'
    return output

In [37]:
# Inject a sample subject containing a [<<company-location>>] tag, then derive company and location.
full_test_output['header']['email_subject'] = '234 [<<ABC-123>>] Here is a subject'
full_test_output = add_company_location(full_test_output)
full_test_output

{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None,
  'emaildate': '2019-01-24',
  'email_subject': '234 [<<ABC-123>>] Here is a subject'},
 'stem': '20190124RelayMonitoring',
 'new_filename': '009098-42504',
 'document_url': 'https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/fe/92/iq205042603-6WMH3CAqF.pdf',
 'image_urls': ['https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/15/c9/ds100705962-iq

## Prepare email text for upload into Xtracta

In [76]:
def email_to_dict(filename):
    """Load an email JSON file and normalise its body and subject text.

    Hyphens in ``BodyText`` and ``Subject`` are replaced with spaces, and a
    space directly before a newline is removed from ``BodyText``.

    Parameters
    ----------
    filename:
        Path object (it must provide ``read_text``) pointing at a JSON file
        with at least ``BodyText`` and ``Subject`` keys.

    Returns
    -------
    dict
        The parsed JSON with the two fields normalised.
    """
    email_json = json.loads(filename.read_text())
    body = email_json['BodyText'].replace('-', ' ')
    # str.replace returns a new string; the result was discarded before, so the
    # trailing-space clean-up never took effect.
    email_json['BodyText'] = body.replace(' \n', '\n')
    email_json['Subject'] = email_json['Subject'].replace('-', ' ')
    return email_json


def get_email_xml(email_dict):
    """Render email metadata as a ``<field_data>`` XML fragment.

    The ``Date``, ``From``, ``Subject`` and ``BodyText`` entries of
    ``email_dict`` become ``email_date``, ``email_from``, ``email_subject`` and
    ``email_body`` fields. The first line of the generated XML is dropped
    before the text is returned.
    """
    field_sources = (
        ('email_date', 'Date'),
        ('email_from', 'From'),
        ('email_subject', 'Subject'),
        ('email_body', 'BodyText'),
    )
    fields = [{'@name': name, '#text': email_dict[key]} for name, key in field_sources]
    document_xml = xmltodict.unparse({'field_data': {'field': fields}}, pretty=True)
    # Keep everything after the first newline, i.e. drop the first output line.
    _, _, without_first_line = document_xml.partition('\n')
    return without_first_line

In [77]:
# Sample email JSON is read from ./input relative to the current working directory.
p = Path.cwd()

filename = p / 'input' / '20190131_invoice.json'
email_dict = email_to_dict(filename)
get_email_xml(email_dict)

'<field_data>\n\t<field name="email_date">2019-01-27</field>\n\t<field name="email_from">priority.credit@blah.com</field>\n\t<field name="email_subject">20190125.290857.P29126671 Invoice .Company.</field>\n\t<field name="email_body">Please find attached Invoice P29126671 dated 25 Jan 2019..All account enquiries should be emailed to priority.credit.blah.com.</field>\n</field_data>'

## Write JSON files, create TIFs and move PDFs

In [38]:
def write_json_simple(filename, output):
    """Write ``output`` as indented JSON next to ``filename`` and return the path.

    ``filename`` must be a path object; its suffix is replaced with ``.json``
    and any existing file at that location is overwritten.
    """
    json_path = filename.with_suffix('.json')
    serialised = None
    with json_path.open("w") as handle:
        serialised = json.dumps(output, indent=4)
        handle.write(serialised)
    return json_path

In [39]:
# NOTE(review): test_ip is not defined in any cell visible here - TODO define it before this cell.
test_json_file = write_json_simple(test_ip / test_output['header']['filename'], test_output)
test_json_file

WindowsPath('C:/Users/hudge/Desktop/2019_projects/pipomatic/hudge/xtracta/input/20190124RelayMonitoring.json')

In [40]:
def move_from_input(api_key, document, ip, lp, op, jp):
    """Move a processed invoice's files out of the input folder.

    Parameters
    ----------
    api_key:
        Unused here; kept so existing callers keep working.
    document:
        Parsed Xtracta document as returned by ``get_document``.
    ip, lp, op, jp:
        Path objects for the input, lines, output and junk folders.

    Behaviour by ``document_status``
    --------------------------------
    ``output``
        The output JSON is written to ``op`` (or to ``lp``, together with a
        copy of the source PDF, when ``line_count`` is greater than 1), a TIF
        is saved to ``op``, and the source JSON and PDF in ``ip`` are deleted.
    ``qa``
        The source JSON is moved to ``jp`` and the source PDF is deleted.
    anything else
        Nothing is moved.

    Returns
    -------
    str
        ``'File moved to output / lines'``, ``'File moved to junk'`` or
        ``'File not moved'``.

    Raises
    ------
    FileNotFoundError
        In the ``qa`` case when the source JSON does not exist.
    """
    output = create_output(document)
    source_base = ip / output['header']['filename']
    json_source = source_base.with_suffix('.json')
    pdf_source = source_base.with_suffix('.pdf')
    status = document['documents_response']['document']['document_status']

    if status == 'output':
        output = build_out_output(document, output)
        json_destination = (op / output['new_filename']).with_suffix('.json')
        # A missing or empty line count is treated as a single line.
        if not output['header']['line_count']:
            output['header']['line_count'] = 1
        if float(output['header']['line_count']) > 1:
            # Multi-line invoices go to the lines folder along with the PDF.
            json_destination = (lp / output['new_filename']).with_suffix('.json')
            pdf_destination = (lp / output['new_filename']).with_suffix('.pdf')
            if pdf_source.exists():
                pdf_destination.write_bytes(pdf_source.read_bytes())
        with open(f"{json_destination}", "w") as f:
            f.write(json.dumps(output, indent=4))
        save_tif(output, op)
        # Remove the inputs only once the new JSON is confirmed on disk.
        if json_destination.exists() and json_source.exists():
            json_source.unlink()
            if pdf_source.exists():
                pdf_source.unlink()
        return 'File moved to output / lines'

    if status == 'qa':
        json_destination = (jp / output['header']['filename']).with_suffix('.json')
        json_source.replace(json_destination)
        # replace() has already removed the source JSON. The old check for
        # json_source.exists() was therefore always false, which skipped the
        # PDF clean-up and made this branch return None.
        if pdf_source.exists():
            pdf_source.unlink()
        return 'File moved to junk'

    return 'File not moved'
    
        
def create_tif_image(image_urls):
    """Download page images and return them as ``(first_page, other_pages)``.

    Parameters
    ----------
    image_urls:
        Iterable of image URLs, one per page, in page order.

    Returns
    -------
    tuple
        The image opened from the first URL and a list of the images opened
        from the remaining URLs (empty for a single-page document).

    Raises
    ------
    ValueError
        If ``image_urls`` is empty. The old code failed here with an
        UnboundLocalError because the first image was never assigned.
    """
    pages = []
    for url in image_urls:
        r = requests.get(url, stream=True)
        pages.append(Image.open(r.raw))
    if not pages:
        raise ValueError("image_urls must contain at least one URL")
    return pages[0], pages[1:]

def save_tif(output, op):
    """Save the document's page images as one TIF file in ``op``.

    The file is named after ``output['new_filename']`` with a ``.tif`` suffix;
    the first page is saved with the remaining pages appended. Returns the
    ``(first_page, other_pages)`` pair obtained from ``create_tif_image``.
    """
    tif_path = (op / output['new_filename']).with_suffix('.tif')
    first_page, other_pages = create_tif_image(output['image_urls'])
    first_page.save(f'{tif_path}', save_all=True, append_images=other_pages)
    return first_page, other_pages
    

In [41]:
# NOTE(review): move_files is not defined anywhere in the visible notebook and the recorded
# run of this cell ended in NameError; test_sp, test_file, test_ip, test_lp, test_op and
# test_jp are also undefined in the cells shown - TODO restore or remove.
move_files(test_sp, test_file, test_ip)
write_json_simple(test_ip / test_output['header']['filename'], test_output)
move_from_input(api_key, test_document, test_ip, test_lp, test_op, test_jp)

NameError: name 'move_files' is not defined

## Formatting XML for upload into Xtracta's database

Take a list of dicts and format it for uploading to Xtracta's database API

In [44]:
def update_database_data(api_key, database_id, out, refresh):
    """Post rows to an Xtracta database and return the parsed XML reply.

    ``out`` is sent as the ``data`` form value and ``refresh`` is passed
    through unchanged; ``database_id`` is converted with ``int()``.
    """
    endpoint = 'https://api-app.xtracta.com/v1/databases/data_add'
    payload = dict(api_key=api_key, database_id=int(database_id), data=out, refresh=refresh)
    reply = requests.post(endpoint, data=payload)
    return xmltodict.parse(reply.text)

In [45]:
def build_xml_data(supplier_data_dict):
    """Format supplier/PO rows as XML for Xtracta's database upload.

    ``supplier_data_dict`` is an iterable of row dicts. Each row becomes a
    ``<row>`` whose ``<column>`` elements carry a fixed column id and the
    string form of the matching row value. Returns the pretty-printed XML.
    """
    # (column id, row key) pairs, in the order the columns are emitted.
    column_sources = (
        ('55261', 'po_number'),
        ('55264', 'supplier_number'),
        ('60223', 'line_number'),
        ('58133', 'abn'),
        ('58134', 'bsb'),
        ('58135', 'bank_account'),
        ('58242', 'supplier_name'),
    )
    xml_rows = [
        {'column': [{'@id': column_id, '#text': f"{row[key]}"} for column_id, key in column_sources]}
        for row in supplier_data_dict
    ]
    return xmltodict.unparse({'xml': {'row': xml_rows}}, pretty=True)

## Put PO line numbers in lines

In [64]:
def get_best_match(output, po_lines):
    """
    Matches each line in an invoice with the closest match from a PO. The match is first attempted by stock_code.
    If that fails, it attempts a fuzzy match against description.

    Parameters
    ----------

    output: dict
        The output is a dict of the invoice. It contains a lines element that holds lines. Each line has a po_item
        that matches the stock_code column in the po_lines dataframe.

    po_lines: dataframe
        po_lines is a dataframe that lists the po_lines from the PO linked to the invoice. The dataframe has a stock_code
        column and a stock_description column that is used to match to the po_item and description elements in the lines
        element of the output dictionary.

    returns
    -------

    Returns the output dictionary with the po_line_number element filled in. Lines matched by stock code get the
    PO's line_number value. Fuzzy-matched lines get the 1-based position of the best PO line that no earlier
    fuzzy match has taken, or '' when no candidate is left.

    """
    is_po = po_lines['po_number'] == output['header']['po_number']
    filtered_pos = po_lines[is_po]
    filtered_pos = filtered_pos[['stock_code', 'stock_description', 'qty', 'line_number']]
    filtered_pos_list = filtered_pos.to_dict(orient='records')
    filtered_pos = filtered_pos.set_index('stock_code')
    # NOTE(review): display() only exists inside IPython; confirm it is wanted in the exported module.
    display(filtered_pos.head())
    filtered_pos_dict = filtered_pos.to_dict(orient='index')
    # PO line positions already handed out by the fuzzy matcher.
    matches = []
    for line in output['lines']:
        po_match = filtered_pos_dict.get(line['po_item'])
        if po_match is not None:
            # Exact stock-code hit.
            line['po_line_number'] = po_match['line_number']
            continue

        best_match_result = 0
        best_match_line_number = ''
        for j, po_line in enumerate(filtered_pos_list):
            candidate_line_number = j + 1
            match_result = fuzz.partial_ratio(line['description'], po_line['stock_description'])
            try:
                # Equal quantities add a 10 point bonus to the description score.
                if float(line['qty']) == float(po_line['qty']):
                    match_result += 10
            except (KeyError, TypeError, ValueError):
                # Missing or non-numeric quantity: score on the description alone.
                pass
            # Skip PO lines that are already taken. The old test compared the
            # *score* with the list of used line numbers, so a PO line could be
            # assigned to several invoice lines.
            if match_result > best_match_result and candidate_line_number not in matches:
                best_match_result = match_result
                best_match_line_number = candidate_line_number
        line['po_line_number'] = best_match_line_number
        matches.append(best_match_line_number)
    return output

In [65]:
# NOTE(review): hard-coded absolute local path; this cell only runs on the original author's machine.
test_od = Path('C:/Users/hudge/Desktop/2019_projects/transgrid_ap/other_data')
po_lines = pd.read_csv(test_od / 'po_line_data.csv')
# Format stock codes with two decimals, then strip trailing zeros and a trailing dot,
# so the values become plain strings.
po_lines.stock_code = po_lines.stock_code.apply(lambda x: '{0:.2f}'.format(x).rstrip('0').rstrip('.'))
po_lines.stock_code = po_lines.stock_code.astype('str')
po_lines.head()

Unnamed: 0.1,Unnamed: 0,po_number,line_number,po_date,supplier_number,supplier_name,stock_code,qty,item_name_line,stock_description
0,107787,T81768,1,2019-01-11,30056,L & H GROUP T/A AUSLEC,1055722,20.0,"WHEEL, ABRASIVE:","100 MM X 2.5 MM X 16 MM, REINFORCED CUTTING OF..."
1,107788,T81768,2,2019-01-11,30056,L & H GROUP T/A AUSLEC,360553,1.0,"BOOTS, SAFETY:","SIZE 9 1/2, HI LEG, ZIP SIDED, LACE UP BOOT, N..."
2,107789,T81768,3,2019-01-11,30056,L & H GROUP T/A AUSLEC,359955,5.0,"SHIRT, UTILITY:","SIZE-L,TWO TONE(OPEN) WITH REFLECTIVE TAPE,YEL..."
3,107790,T81768,4,2019-01-11,30056,L & H GROUP T/A AUSLEC,3006624,240.0,"TISSUE, FACIAL:","2 PLY,210 MM X 205 MM, 100 TISSUES PER BOX."
4,107791,T81768,5,2019-01-11,30056,L & H GROUP T/A AUSLEC,3584513,6.0,"SHIRT, UTILITY:",SIZE-2XL(46CM NECK)TWOTONE(CLOSED)FRONT YELLOW...


In [66]:
# Build the output dict for the test invoice and attach its extracted lines.
document = get_document(api_key, 100705962)
output = create_output(document)
output['lines'] = get_lines(document)
output

{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None,
  'email_date': '2019-02-01'},
 'stem': '20190124RelayMonitoring',
 'new_filename': '009098-42504',
 'document_url': 'https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/fe/92/iq205042603-6WMH3CAqF.pdf',
 'image_urls': ['https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/15/c9/ds100705962-iq205042603-6WMH3CAqF4900-1-800.jpg'],
 'lines': [{'po_item

In [67]:
# Fill in po_line_number on each invoice line; get_best_match also displays the filtered PO lines.
get_best_match(output, po_lines)

Unnamed: 0_level_0,stock_description,qty,line_number
stock_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3550233,"PROTECTION RELAY,2HS/L14, RMS LATCHING MULTITR...",6.0,1
3550258,"PROTECTION RELAY,RMS HIGH IMPEDANCE DIFFERENTI...",1.0,2
362386,METROSIL C/W STABILISING RESISTOR 4K OHMS 100W...,2.0,3


{'document_id': '100705962',
 'status': 'output',
 'version': '3',
 'header': {'supplier_abn': '76052484483',
  'abn_from_db_by_po': '76052484483',
  'supplier': 'RELAY MONITORING SYSTEMS PTY LTD',
  'supplier_id': '009098',
  'invoice_date': '2019-01-09',
  'invoice_number': '42504',
  'po_number': 'T81293',
  'net_total': '14820.00',
  'gst_total': '1482.00',
  'gross_total': '16302.00',
  'freight': '50.00',
  'bsb_number': '063159',
  'bank_account_number': '10006635',
  'line_count': '3',
  'filename': '20190124RelayMonitoring',
  'account_number': None,
  'period_start_date': None,
  'period_end_date': None,
  'email_date': '2019-02-01'},
 'stem': '20190124RelayMonitoring',
 'new_filename': '009098-42504',
 'document_url': 'https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/fe/92/iq205042603-6WMH3CAqF.pdf',
 'image_urls': ['https://web1-akl.xtracta.com/akl_northcote_storage2_datasource4/1/15/c9/ds100705962-iq205042603-6WMH3CAqF4900-1-800.jpg'],
 'lines': [{'po_item

## Build HTML file for handling rejections

In [None]:
def create_html_section(data, html_template):
    """Render ``html_template`` as a Jinja2 template with ``data`` bound to the name ``data``."""
    return Template(html_template).render(data=data)


# Jinja2 template for one rejected invoice. It reads data.reject_count, data.output.header
# (supplier, invoice_number, net_total, gst_total, gross_total), data.messages (each with
# field and message), data.review_link and data.invoice_image.
reject_queue_html_template = """
    <div class="column is-full">
        <p><strong>Total number of rejects in queue: {{data.reject_count}}</strong></p>
    </div>
    <div class="column is-full">  
        <h2><strong>{{data.output.header.supplier}}</strong></h2>
        <p><strong>Invoice number:</strong> {{data.output.header.invoice_number}}</p>
    </div>
    <div class="column is-two-fifths">
    <section class="section has-text-right">
        <p><strong>Net:</strong> {{"$%.2f"|format(data.output.header.net_total|float)}}</p>
        <p><strong>GST:</strong> {{"$%.2f"|format(data.output.header.gst_total|float)}}</p>
        <p><strong>Total:</strong> {{"$%.2f"|format(data.output.header.gross_total|float)}}</p>
    </section>
    <section class="section">
        <table class="table">
        <thead><tr><th>Field</th><th>Message</th></tr></thead>
        <tbody>
        {% for message in data.messages %}
        <tr>
            <th>{{message.field}}</th>
            <td>{{message.message}}</td>
        </tr>
        {% endfor %}
        </tbody>
        </table>
    </section>
    </div>
    <div class="column is-three-fifths has-text-centered">
        <p><a href="{{data.review_link}}" target="_blank">Review invoice</a></p>
        <p><img src="{{data.invoice_image}}" alt="Invoice Image" width="250"></p>
    </div>
"""


def get_reject_html(api_key, workflow_id, status, html_template):
    """Render the HTML section for the first document in a status queue.

    Parameters
    ----------
    api_key, workflow_id:
        Credentials and workflow to query.
    status:
        Queue to read, passed to ``get_xtracta_status``.
    html_template:
        Jinja2 template text, rendered with ``create_html_section``.

    Returns
    -------
    str
        The rendered HTML. The template receives ``output``, ``reject_count``,
        ``review_link``, ``invoice_image`` (URL of the first page image) and
        ``messages``.
    """
    queue = get_xtracta_status(api_key, workflow_id, status)
    reject_count = len(queue)
    # NOTE(review): for an empty queue get_xtracta_status returns a one-item
    # list holding a message string, so the lookups below raise TypeError and
    # reject_count would be 1 - decide how an empty queue should render.
    document_id = queue[0]["document_id"]
    reasons = queue[0]["rejection"]["reason"]
    messages = get_reject_info(api_key, document_id, reasons)
    document = get_document(api_key, document_id)
    image_url = document["documents_response"]["document"]["image_url"]
    # A single-page document has one URL string here instead of a list;
    # indexing that string with [0] gave only its first character.
    if isinstance(image_url, list):
        image_url = image_url[0]
    output = create_output(document)
    review_link = open_document_ui(api_key, document_id)
    data = {
        "output": output,
        "reject_count": reject_count,
        "review_link": review_link,
        "invoice_image": image_url,
        "messages": messages,
    }
    html = create_html_section(data=data, html_template=html_template)
    return html


In [None]:
def get_reject_info(api_key, document_id, reasons):
    """Translate rejection reasons into ``{"field", "message"}`` dicts.

    ``reasons`` is either one reason dict or a list of them. Each reason's
    linked field id is resolved to a field name using the document's own
    fields, which are fetched with ``get_document``.
    """
    document = get_document(api_key, document_id)
    field_ids = get_field_ids(document)
    # Treat a lone reason the same way as a list of reasons.
    reason_list = reasons if type(reasons) == list else [reasons]
    messages = []
    for reason in reason_list:
        field_id = reason["linked_field"]["field_id"]
        message = reason["message"]
        messages.append({"field": field_ids[field_id], "message": message})
    return messages


def get_field_ids(document):
    """Map each header field's ``field_id`` to its ``field_name`` for a parsed document."""
    fields = document["documents_response"]["document"]["field_data"]["field"]
    return {entry["field_id"]: entry["field_name"] for entry in fields}

## Build code

The remaining cells convert this notebook to a Python module, format it with black, and (optionally) publish it to PyPI with flit.

In [4]:
# Export the notebook to xtracta.py, dropping cells tagged 'build' or 'test' and all outputs.
!jupyter nbconvert \
    --TagRemovePreprocessor.enabled=True \
    --TagRemovePreprocessor.remove_cell_tags="['build', 'test']" \
    --TemplateExporter.exclude_output=True \
    --to python "xtracta.ipynb"

# Header text prepended to the exported script: a leading string literal and __version__.
first_line = """'Xtracta package'

__version__ = '3.3'

"""
script_file = Path.cwd() / 'xtracta.py'
script = script_file.read_text()
script_file.write_text(first_line + script)
# The names of the two enclosing directories become part of the module file name.
username = script_file.parent.parent.name
system_name = script_file.parent.name
standardised_script_name = f'pipomatic_{username}_{system_name}.py'
# Rename xtracta.py in place; the last expression displays the new file name.
script_file.replace(script_file.parent / standardised_script_name)
standardised_script_name

[NbConvertApp] Converting notebook xtracta.ipynb to python
[NbConvertApp] Writing 18289 bytes to xtracta.py


'pipomatic_hudge_xtracta.py'

In [5]:
# NOTE(review): the file name is hard-coded; the previous cell derives it from the directory names.
!black "pipomatic_hudge_xtracta.py"

reformatted pipomatic_hudge_xtracta.py
All done! \u2728 \U0001f370 \u2728
1 file reformatted.


In [None]:
# !flit publish