In [19]:
# GENERAL
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Body
from typing import List, Annotated
import asyncio
import random
import tempfile
import shutil
import os
import fitz
import io
import base64
import datetime
import hashlib
import time
import anyio
from PIL import Image

# URLLIB3
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# AZURE AI DOCUMENT INTELLIGENCE
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# LOAD ENV VARIABLES
from dotenv import load_dotenv
load_dotenv()

# CUSTOM UTILS
from customutils import *

In [30]:
import pandas as pd

# Business line under test; it selects the test-file folder below and is
# forwarded to the parsing endpoints in later cells.
businessLine = 'PCI'

# Folder holding the PDFs for the selected business line.
pdfpath = f"versionAchive/V0.9-alpha/testFiles/{businessLine}/"

# os.listdir() returns entries in arbitrary order; sort so that any later
# positional slice of the frame selects the same files on every run.
lspdf = sorted(f for f in os.listdir(pdfpath) if f.lower().endswith('.pdf'))
full_paths = [os.path.join(pdfpath, f) for f in lspdf]

# One row per PDF: path, size in KiB (2 decimals) and bare file name.
dfPDF = pd.DataFrame({
    'FULL_PATH': full_paths,
    'FILE_SIZE_KB': [round(os.path.getsize(p) / 1024, 2) for p in full_paths],
    'FILE_NAME': lspdf,
})
# Placeholder column, filled by the request loop in a later cell.
dfPDF['RESPONSE'] = None

dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,115.51,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,
1,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,2006.8,PIM000000425-COLOR CLAY Natural Brochure.pdf,
2,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,152.21,PIM000000671-TDS Active Juice CAVOLO NERO (KAL...,
3,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,9231.18,PIM000000677-AQUPEC_MG_N40R_TDS.pdf,
4,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,2917.63,PIM000000948-TDS_Actimulsi GA20_Assessa.pdf,


In [31]:
# Keep only the first 5 documents for a quick run. `.copy()` makes the subset an
# independent frame, so later writes into it are neither lost nor flagged with
# SettingWithCopyWarning.
dfPDF = dfPDF.iloc[:5].copy()

In [32]:
# POST every PDF to the products/suppliers endpoint and store the
# 'products_and_suppliers' payload of each reply in dfPDF['RESPONSE'].
url = "http://127.0.0.1:8000/v1_get_products_and_suppliers"
#url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for full_path, file_name in zip(dfPDF['FULL_PATH'], dfPDF['FILE_NAME']):
    try:
        # Context manager guarantees the file handle is closed even on failure.
        with open(full_path, 'rb') as pdf_file:
            response = requests.post(
                url,
                files=[('inputListDocumentation', pdf_file)],
                data={'inputSecret': input_secret},
                timeout=300,
                verify=False,
            )
        # Surface HTTP errors explicitly instead of failing later on a missing key.
        response.raise_for_status()
        responses.append(response.json()['products_and_suppliers'])
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Best-effort batch: record the failure (with its cause) and carry on.
        responses.append(None)
        print('ERROR   ', file_name, '-', repr(exc))

# Assign the whole column at once; element-wise chained assignment
# (df['col'].iat[i] = ...) is unreliable on sliced frames and under Copy-on-Write.
dfPDF = dfPDF.assign(RESPONSE=pd.Series(responses, index=dfPDF.index, dtype=object))
dfPDF.head()

SUCCESS  PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf
SUCCESS  PIM000000425-COLOR CLAY Natural Brochure.pdf
SUCCESS  PIM000000671-TDS Active Juice CAVOLO NERO (KALE).pdf
SUCCESS  PIM000000677-AQUPEC_MG_N40R_TDS.pdf
SUCCESS  PIM000000948-TDS_Actimulsi GA20_Assessa.pdf


Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,115.51,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,"[{'PRODUCT_NAME': 'Color Clay® CYDONIA', 'SUPP..."
1,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,2006.8,PIM000000425-COLOR CLAY Natural Brochure.pdf,"[{'PRODUCT_NAME': 'Color Clay®', 'SUPPLIER_NAM..."
2,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,152.21,PIM000000671-TDS Active Juice CAVOLO NERO (KAL...,[{'PRODUCT_NAME': 'ACTIVE JUICE CAVOLO NERO (K...
3,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,9231.18,PIM000000677-AQUPEC_MG_N40R_TDS.pdf,"[{'PRODUCT_NAME': 'AQUPEC MG N40R', 'SUPPLIER_..."
4,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,2917.63,PIM000000948-TDS_Actimulsi GA20_Assessa.pdf,"[{'PRODUCT_NAME': 'ACTIMULSI GA 20', 'SUPPLIER..."


In [33]:
# Flatten dfPDF into one row per (document, product) pair.
# Rows whose RESPONSE is not a list (e.g. None after a failed request) are skipped.
records = [
    {'FULL_PATH': full_path,
     'FILE_NAME': file_name,
     'PRODUCT': prod['PRODUCT_NAME'],
     'SUPPLIER': prod['SUPPLIER_NAME']}
    for full_path, file_name, products in zip(dfPDF['FULL_PATH'], dfPDF['FILE_NAME'], dfPDF['RESPONSE'])
    if isinstance(products, list)
    for prod in products
]
# Explicit columns keep the frame well-formed even when nothing succeeded
# (pd.concat on an empty list would raise instead).
dfPROD = pd.DataFrame(records, columns=['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER'])

# Encode each distinct PDF once, even when a document lists several products.
# NOTE(review): assumes pdf_to_base64 depends only on the path it is given - confirm.
b64_by_path = {path: pdf_to_base64(path) for path in dfPROD['FULL_PATH'].unique()}
dfPROD['B64'] = dfPROD['FULL_PATH'].map(b64_by_path)
dfPROD['B64_LEN'] = [len(x) for x in dfPROD['B64']]
dfPROD.head()

Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN
0,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,Color Clay® CYDONIA,"DKSH Marketing Services Spain, S.A.U.",JVBERi0xLjUNJeLjz9MNCjIxOSAwIG9iag08PC9MaW5lYX...,157712
1,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Natural Brochure.pdf,Color Clay®,Colorclay S.L.,JVBERi0xLjMNJeLjz9MNCjMyIDAgb2JqDTw8L0xpbmVhcm...,2739952
2,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000671-TDS Active Juice CAVOLO NERO (KAL...,ACTIVE JUICE CAVOLO NERO (KALE),Phenbiox s.r.l.,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,207820
3,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000677-AQUPEC_MG_N40R_TDS.pdf,AQUPEC MG N40R,"Sumitomo Seika Chemicals Co., Ltd.",JVBERi0xLjYNJeLjz9MNCjcyMzEgMCBvYmoNPDwvTGluZW...,12603636
4,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000948-TDS_Actimulsi GA20_Assessa.pdf,ACTIMULSI GA 20,"ASSESSA INDÚSTRIA, COMÉRCIO E EXPORTAÇÃO LTDA.",JVBERi0xLjYNJeLjz9MNCjkxIDAgb2JqDTw8L0xpbmVhcm...,3983540


In [None]:
# dfPROD = dfPROD.iloc[:2]
# dfPROD

Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN
0,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,Color Clay® CYDONIA,"DKSH Marketing Services Spain, S.A.U.",JVBERi0xLjUNJeLjz9MNCjIxOSAwIG9iag08PC9MaW5lYX...,157712
1,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Natural Brochure.pdf,Cedrus,Color Clay®,JVBERi0xLjMNJeLjz9MNCjMyIDAgb2JqDTw8L0xpbmVhcm...,2739952


In [34]:
##############################
# OPTION1: USING FILE UPLOAD #
##############################

# POST each (product, PDF) pair as a multipart upload and store the JSON reply
# in dfPROD['RESPONSE']; failed rows are stored as None.
url = "http://127.0.0.1:8000/v1_parse_pim_fields"
# url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for full_path, file_name, product in zip(dfPROD['FULL_PATH'], dfPROD['FILE_NAME'], dfPROD['PRODUCT']):
    data = {
        "inputProductName": product,
        "inputBusinessLine": businessLine,
        "inputSecret": input_secret,
        # booleans sent as strings in multipart forms
        "inputWebSearch": "false",
        "inputParallel": "true"}
    try:
        # Context manager guarantees the file handle is closed even on failure.
        with open(full_path, "rb") as pdf_file:
            files = [("inputListDocumentation", (file_name, pdf_file, "application/pdf"))]
            response = requests.post(url, data=data, files=files, timeout=300, verify=False)
        # Do not record an HTTP error body as if it were a parsed result.
        response.raise_for_status()
        responses.append(response.json())
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Best-effort batch: record the failure (with its cause) and carry on.
        responses.append(None)
        print('FAILURE ', file_name, '-', repr(exc))

# Assign the whole column at once instead of chained element-wise assignment.
dfPROD['RESPONSE'] = pd.Series(responses, index=dfPROD.index, dtype=object)
dfPROD

SUCCESS  PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf
SUCCESS  PIM000000425-COLOR CLAY Natural Brochure.pdf
SUCCESS  PIM000000671-TDS Active Juice CAVOLO NERO (KALE).pdf
SUCCESS  PIM000000677-AQUPEC_MG_N40R_TDS.pdf
SUCCESS  PIM000000948-TDS_Actimulsi GA20_Assessa.pdf


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,Color Clay® CYDONIA,"DKSH Marketing Services Spain, S.A.U.",JVBERi0xLjUNJeLjz9MNCjIxOSAwIG9iag08PC9MaW5lYX...,157712,"{'inputProductName': 'Color Clay® CYDONIA', 'i..."
1,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000425-COLOR CLAY Natural Brochure.pdf,Color Clay®,Colorclay S.L.,JVBERi0xLjMNJeLjz9MNCjMyIDAgb2JqDTw8L0xpbmVhcm...,2739952,"{'inputProductName': 'Color Clay®', 'inputBusi..."
2,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000671-TDS Active Juice CAVOLO NERO (KAL...,ACTIVE JUICE CAVOLO NERO (KALE),Phenbiox s.r.l.,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,207820,{'inputProductName': 'ACTIVE JUICE CAVOLO NERO...
3,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000677-AQUPEC_MG_N40R_TDS.pdf,AQUPEC MG N40R,"Sumitomo Seika Chemicals Co., Ltd.",JVBERi0xLjYNJeLjz9MNCjcyMzEgMCBvYmoNPDwvTGluZW...,12603636,"{'inputProductName': 'AQUPEC MG N40R', 'inputB..."
4,versionAchive/V0.9-alpha/testFiles/PCI/PIM0000...,PIM000000948-TDS_Actimulsi GA20_Assessa.pdf,ACTIMULSI GA 20,"ASSESSA INDÚSTRIA, COMÉRCIO E EXPORTAÇÃO LTDA.",JVBERi0xLjYNJeLjz9MNCjkxIDAgb2JqDTw8L0xpbmVhcm...,3983540,"{'inputProductName': 'ACTIMULSI GA 20', 'input..."


In [27]:
######################
# OPTION2: USING B64 #
######################

# POST each (product, base64 PDF) pair as form fields and store the JSON reply
# in dfPROD['RESPONSE']; failed rows are stored as None.
#url = "http://127.0.0.1:8000/v1_parse_pim_fields_b64"
url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields_b64"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for file_name, product, pdf_base64 in zip(dfPROD['FILE_NAME'], dfPROD['PRODUCT'], dfPROD['B64']):
    data = {
        "inputProductName": product,
        "inputBusinessLine": businessLine,
        "inputSecret": input_secret,
        "inputListDocumentationB64": [pdf_base64],
        "inputWebSearch": "false",
        "inputParallel": "true"}
    try:
        # NOTE(review): verify=False disables TLS certificate checks while the
        # secret is sent to a remote host - confirm this is intentional.
        response = requests.post(url, data=data, timeout=300, verify=False)
        # Do not record an HTTP error body as if it were a parsed result.
        response.raise_for_status()
        responses.append(response.json())
        print('SUCCESS ', file_name, len(pdf_base64))
    except Exception as exc:
        # Best-effort batch: record the failure (with its cause) and carry on.
        responses.append(None)
        print('FAILURE ', file_name, '-', repr(exc))

# Assign the whole column at once instead of chained element-wise assignment.
dfPROD['RESPONSE'] = pd.Series(responses, index=dfPROD.index, dtype=object)
dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf 134864
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf 24648
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf 129544
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf 194512
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf 98116
SUCCESS  PIM000008862-EXTER 100 TDS 2021.pdf 198396
SUCCESS  PIM000009406-Rokoagar LS ORGANIC - Food Applications Eng - E01.pdf 510316
SUCCESS  PIM000009441-SPEC APPLE FLAVOUR NPC1917M.pdf 105664
SUCCESS  PIM000009983-YU15 rev02 - organic chickpeas flour.pdf 426540
SUCCESS  PIM000021685-Specification Apple JC 35209000710000 20180717.pdf 163692


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",JVBERi0xLjUNJeLjz9MNCjE5IDAgb2JqDTw8L0xpbmVhcm...,24648,{'inputProductName': 'FAMB279 N&A BROWN SUGAR ...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,JVBERi0xLjQKJeLjz9MNCjEgMCBvYmoKPDwgCi9DcmVhdG...,129544,{'inputProductName': 'Acerola juice concentrat...
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,JVBERi0xLjQKJeLjz9MKCjEgMCBvYmoKPDwvVHlwZSAvQ2...,194512,"{'inputProductName': 'Accelerzyme CPG BF', 'in..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,JVBERi0xLjcKjp2jtMXW5/gKMiAwIG9iagpbL0lDQ0Jhc2...,98116,{'inputProductName': 'Apple juice concentrate'...
5,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000008862-EXTER 100 TDS 2021.pdf,EXTER 100 - Bouillon,Exter B.V.,JVBERi0xLjYNJeLjz9MNCjMwMCAwIG9iag08PC9MaW5lYX...,198396,"{'inputProductName': 'EXTER 100 - Bouillon', '..."
6,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000009406-Rokoagar LS ORGANIC - Food Applic...,ROKOAGAR LS ORGANIC,"INDUSTRIAS ROKO, S.A.",JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,510316,"{'inputProductName': 'ROKOAGAR LS ORGANIC', 'i..."
7,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000009441-SPEC APPLE FLAVOUR NPC1917M.pdf,APPLE FLAVOUR NPC1917M,Matrix Flavours & Fragrances Sdn Bhd,JVBERi0xLjUKJeLjz9MKMyAwIG9iago8PC9Db2xvclNwYW...,105664,"{'inputProductName': 'APPLE FLAVOUR NPC1917M',..."
8,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000009983-YU15 rev02 - organic chickpeas fl...,Pre-gelatinized organic chickpeas flour,Naturis,JVBERi0xLjMKJcTl8uXrp/Og0MTGCjMgMCBvYmoKPDwgL0...,426540,{'inputProductName': 'Pre-gelatinized organic ...
9,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000021685-Specification Apple JC 3520900071...,Apple Juice Concentrate,WILD,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,163692,{'inputProductName': 'Apple Juice Concentrate'...


# REARRANGE

In [28]:
# Build the export table: identifier columns from dfPROD followed by the
# selected answer/reason fields pulled out of each stored response dict.
id_cols = ['FILE_NAME', 'PRODUCT']
response_cols = [
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_select_applications_reason', 'gpt_cas_from_doc_answer',
       'gpt_cas_from_doc_reason', 'gpt_physical_form_answer',
       'gpt_physical_form_reason', 'gpt_gen_product_description',
       'gpt_recommended_dosage_answer', 'gpt_recommended_dosage_reason',
       'gpt_certifications_answer', 'gpt_certifications_reason',
       'gpt_claims_answer', 'gpt_claims_reason', 'gpt_health_benefits_answer',
       'gpt_health_benefits_reason']
lscol = id_cols + response_cols

# A failed request leaves a non-dict value (None) in RESPONSE. Treat it as a
# response with no fields, so one bad row cannot abort the whole export.
responses = [r if isinstance(r, dict) else {} for r in dfPROD['RESPONSE']]

# Only the exported fields are extracted; .get() yields None for a field that a
# particular response does not carry.
dfPROD2 = dfPROD[id_cols].copy()
for key in response_cols:
    dfPROD2[key] = [r.get(key) for r in responses]

dfPROD2.to_excel('tmp.xlsx', index=False)

dfPROD2

Unnamed: 0,FILE_NAME,PRODUCT,gpt_manufacturer_or_supplier_answer,gpt_manufacturer_or_supplier_reason,gpt_select_industry_cluster_answer,gpt_select_industry_cluster_reason,gpt_select_compositions_answer,gpt_select_compositions_reason,gpt_select_functions_answer,gpt_select_functions_reason,...,gpt_physical_form_reason,gpt_gen_product_description,gpt_recommended_dosage_answer,gpt_recommended_dosage_reason,gpt_certifications_answer,gpt_certifications_reason,gpt_claims_answer,gpt_claims_reason,gpt_health_benefits_answer,gpt_health_benefits_reason
0,PIM000000425-COLOR CLAY Cydonia Natural TDS.pdf,Color Clay® CYDONIA,"DKSH Marketing Services Spain, S.A.U.",The document states that the Color Clay® produ...,[Personal Care],Selected Personal Care because the document me...,[Mineral],Selected Mineral because the product is descri...,"[Cleanser, Dye/Pigment]",Selected Cleanser because the document mention...,...,Selected Powder because the document explicitl...,Color Clay® CYDONIA is a natural quince yellow...,,No recommended dosage instructions are mention...,[],No certifications related to the product Color...,[],No explicit claims related to any of the liste...,[],Only applicable for FBI business line
1,PIM000000425-COLOR CLAY Natural Brochure.pdf,Color Clay®,Colorclay S.L.,The document states that Colorclay S.L. is inv...,[Personal Care],Selected Personal Care because the document ex...,[Mineral],Selected Mineral because the document repeated...,"[Cleanser, Dye/Pigment, Exfoliant, Skin Sensor...",Selected Cleanser because the document mention...,...,Selected Powder because the document mentions ...,Color Clay® by Colorclay S.L. is a 100% natura...,,"No specific recommended dosage instructions, i...",[],No certifications related to Color Clay® from ...,"[Natural Cosmetic, Purifying, Sebum Control, S...",Selected Natural Cosmetic because the product ...,[],Only applicable for FBI business line


In [14]:
# Inspect the column labels currently present on dfPROD2.
dfPROD2.columns

Index(['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER', 'B64', 'B64_LEN',
       'RESPONSE', 'inputProductName', 'inputBusinessLine',
       'inputListDocumentation', 'inputSecret', 'inputWebSearch',
       'inputParallel', 'stg_lsTempFile', 'stg_businessLineStr',
       'stg_hashinputProductName', 'stg_hashinputBusinessLine',
       'stg_hashinputListDocumentation', 'stg_hashCombined',
       'stg_lsParsedText', 'stg_parsedText', 'stg_lsBase64',
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason', 'gpt_composition_search_answer',
       'gpt_function_search_answer', 'gpt_application_search_answer',
       'gpt_combined_web_search', 'gpt_text_of_this_product_only_answer',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_

In [11]:
# Show the field names contained in the first stored response.
list(dfPROD['RESPONSE'].iloc[0].keys())

['inputProductName',
 'inputBusinessLine',
 'inputListDocumentation',
 'inputSecret',
 'inputWebSearch',
 'inputParallel',
 'stg_lsTempFile',
 'stg_businessLineStr',
 'stg_hashinputProductName',
 'stg_hashinputBusinessLine',
 'stg_hashinputListDocumentation',
 'stg_hashCombined',
 'stg_lsParsedText',
 'stg_parsedText',
 'stg_lsBase64',
 'gpt_manufacturer_or_supplier_answer',
 'gpt_manufacturer_or_supplier_reason',
 'gpt_composition_search_answer',
 'gpt_function_search_answer',
 'gpt_application_search_answer',
 'gpt_combined_web_search',
 'gpt_text_of_this_product_only_answer',
 'gpt_select_industry_cluster_answer',
 'gpt_select_industry_cluster_reason',
 'gpt_select_compositions_answer',
 'gpt_select_compositions_reason',
 'gpt_select_functions_answer',
 'gpt_select_functions_reason',
 'gpt_select_applications_answer',
 'gpt_select_applications_reason',
 'gpt_cas_from_doc_answer',
 'gpt_cas_from_doc_reason',
 'gpt_physical_form_answer',
 'gpt_physical_form_reason',
 'gpt_gen_product_