In [1]:
# general
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Body
from typing import List, Annotated
import asyncio
import random
import tempfile
import shutil
import os
import fitz
import io
import base64
import datetime
import hashlib
import time
import anyio
from PIL import Image

# urllib3
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Azure AI Document Intelligence
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Custom Utils
from customutils import *

In [2]:
import pandas as pd

businessLine = 'FBI'

# DEFINE PATH
pdfpath = f"versionAchive/V0.9-alpha/testFiles/{businessLine}/"
# LIST ALL PDF
lspdf = [f for f in os.listdir(pdfpath) if f.endswith('.pdf')]
# DF
dfPDF = pd.DataFrame()
dfPDF['FULL_PATH'] = [pdfpath+f for f in lspdf]
dfPDF['FILE_SIZE_KB'] = dfPDF['FULL_PATH'].apply(lambda x: round(os.path.getsize(x) / 1024, 2))
dfPDF['FILE_NAME'] = lspdf
dfPDF['RESPONSE'] = None

dfPDF = dfPDF.iloc[:5]

dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,


In [3]:
for i in range(len(dfPDF)):
    try:
        full_path = dfPDF['FULL_PATH'][i]
        file_name = dfPDF['FILE_NAME'][i]
        #url = "http://127.0.0.1:8000/v1_get_products_and_suppliers"
        url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers"
        input_secret = os.getenv('CUSTOM_SECRET1')
        files = [('inputListDocumentation', open(full_path, 'rb'))]
        data = {'inputSecret': input_secret}
        response = requests.post(url, files=files, data=data, verify=False)
        dfPDF['RESPONSE'].iat[i] = response.json()['products_and_suppliers']
        print('SUCCESS ', file_name)
    except:
        print('ERROR   ', file_name)
dfPDF.head()

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf


Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,"[{'PRODUCT_NAME': 'ACTIMALT LIQUID REGULAR', '..."
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,[{'PRODUCT_NAME': 'FAMB279 N&A BROWN SUGAR TYP...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,"[{'PRODUCT_NAME': 'Acerola juice concentrate, ..."
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,"[{'PRODUCT_NAME': 'Accelerzyme CPG BF', 'SUPPL..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,"[{'PRODUCT_NAME': 'Apple juice concentrate', '..."


In [4]:
lsdf = []
for i in range(len(dfPDF)):
    row = dfPDF.iloc[i]
    if row['RESPONSE'] is None:
        continue
    for prod in row['RESPONSE']:
        df = pd.DataFrame({'FULL_PATH':[row['FULL_PATH']],
                           'FILE_NAME':[row['FILE_NAME']],
                           'PRODUCT':[prod['PRODUCT_NAME']],
                           'SUPPLIER':[prod['SUPPLIER_NAME']]})
        lsdf.append(df)
dfPROD = pd.concat(lsdf, ignore_index=True)
dfPROD.head()

Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc."
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH


In [None]:
##############################
# OPTION1: USING FILE UPLOAD #
##############################

# dfPROD['RESPONSE'] = None
# for i in range(len(dfPROD)):
#     try:
#         row = dfPROD.iloc[i]
#         full_path = row['FULL_PATH']
#         file_name = row['FILE_NAME']
#         #url = "http://127.0.0.1:8000/v1_parse_pim_fields"
#         url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers"
#         data = {
#             "inputProductName": row['PRODUCT'],
#             "inputBusinessLine": businessLine,
#             "inputSecret": os.getenv('CUSTOM_SECRET1'),
#             # booleans sent as strings in multipart forms
#             "inputWebSearch": "false",
#             "inputParallel": "true"}
#         files = [("inputListDocumentation", ("PIM00000671-TDS.pdf", open(row["FULL_PATH"], "rb"), "application/pdf"))]
#         response = requests.post(url, data=data, files=files, timeout=300, verify=False)
#         dfPROD['RESPONSE'].iat[i] = response.json()
#         print('SUCCESS ', file_name)
#     except:
#         print('FAILURE ', file_name)
# dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf


In [None]:
######################
# OPTION2: USING B64 #
######################

# dfPROD['RESPONSE'] = None
# for i in range(len(dfPROD)):
#     try:
#         row = dfPROD.iloc[i]
#         full_path = row['FULL_PATH']
#         file_name = row['FILE_NAME']
#         pdf_base64 = pdf_to_base64(full_path)
#         url = "http://127.0.0.1:8000/v1_parse_pim_fields_b64"
#         #url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers_v64"
#         data = {
#             "inputProductName": row['PRODUCT'],
#             "inputBusinessLine": businessLine,
#             "inputSecret": os.getenv('CUSTOM_SECRET1'),
#             "inputListDocumentationB64": [pdf_base64],
#             "inputWebSearch": "false",
#             "inputParallel": "true"}
#         response = requests.post(url, data=data, timeout=300, verify=False)
#         dfPROD['RESPONSE'].iat[i] = response.json()
#         print('SUCCESS ', file_name)
#     except:
#         print('FAILURE ', file_name)
# dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",{'inputProductName': 'FAMB279 N&A BROWN SUGAR ...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,{'inputProductName': 'Acerola juice concentrat...
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,"{'inputProductName': 'Accelerzyme CPG BF', 'in..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,{'inputProductName': 'Apple juice concentrate'...


In [23]:
data

{'inputProductName': 'Apple juice concentrate',
 'inputBusinessLine': 'FBI',
 'inputSecret': 'hasZ6CI3N0R15bJmsK3UYJDX8VunScundHHUR9Agu05E0q3H1yqSBXiEcmpUTM5U',
 'inputListDocumentationB64': ['JVBERi0xLjcKjp2jtMXW5/gKMiAwIG9iagpbL0lDQ0Jhc2VkIDMgMCBSXQplbmRvYmoKMyAwIG9iago8PAovRmlsdGVyIC9GbGF0ZURlY29kZSAKL0xlbmd0aCAyNTk2IAovTiAzIAo+PgpzdHJlYW0KeJydlndUU9kWh8+9N71QkhCKlNBraFICSA29SJEuKjEJEErAkAAiNkRUcERRkaYIMijggKNDkbEiioUBUbHrBBlE1HFwFBuWSWStGd+8ee/Nm98f935rn73P3Wfvfda6AJD8gwXCTFgJgAyhWBTh58WIjYtnYAcBDPAAA2wA4HCzs0IW+EYCmQJ82IxsmRP4F726DiD5+yrTP4zBAP+flLlZIjEAUJiM5/L42VwZF8k4PVecJbdPyZi2NE3OMErOIlmCMlaTc/IsW3z2mWUPOfMyhDwZy3PO4mXw5Nwn4405Er6MkWAZF+cI+LkyviZjg3RJhkDGb+SxGXxONgAoktwu5nNTZGwtY5IoMoIt43kA4EjJX/DSL1jMzxPLD8XOzFouEiSniBkmXFOGjZMTi+HPz03ni8XMMA43jSPiMdiZGVkc4XIAZs/8WRR5bRmyIjvYODk4MG0tbb4o1H9d/JuS93aWXoR/7hlEH/jD9ld+mQ0AsKZltdn6h21pFQBd6wFQu/2HzWAvAIqyvnUOfXEeunxeUsTiLGcrq9zcXEsBn2spL+jv+p8Of0NffM9Svt3v5WF485M4knQxQ143bmZ6pkTEyM7icPkM5p+H+B8H/nUeFhH8JL6IL5RFRMumTCBMlrVbyBOIBZl