In [1]:
# GENERAL
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Body
from typing import Dict, Any
from typing import List, Annotated
import asyncio
import random
import tempfile
import shutil
import os
import fitz
import io
import base64
import datetime
import hashlib
import time
import anyio
import requests
import json
import simple_salesforce
from PIL import Image

# URLLIB3
# The request cells in this notebook call requests.post(..., verify=False);
# silence the InsecureRequestWarning category so it does not clutter cell output.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# AZURE AI DOCUMENT INTELLIGENCE
# NOTE(review): neither name is referenced in the visible cells — confirm they are still needed.
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# LOAD ENV VARIABLES
# Later cells read CUSTOM_SECRET1 through os.getenv; presumably it is supplied
# by the .env file loaded here — TODO confirm.
from dotenv import load_dotenv
load_dotenv()

# CUSTOM UTILS
# NOTE(review): star import hides where names come from. `pdf_to_base64`, used
# further down, is not imported anywhere else in view, so it presumably comes
# from this module — consider importing it by name.
from customutils import *

In [2]:
import pandas as pd

# Business line under test; also selects the test-file sub-folder below.
businessLine = 'FBI'

# DEFINE PATH
pdfpath = f"versionAchive/V0.9-alpha/testFiles/{businessLine}/"
# LIST ALL PDF
# os.listdir returns entries in arbitrary order, so sort the names: the row
# order (and any positional subset taken from it later) must be reproducible.
lspdf = sorted(f for f in os.listdir(pdfpath) if f.endswith('.pdf'))
# DF: one row per PDF, with a RESPONSE placeholder that a later cell fills in.
dfPDF = pd.DataFrame({'FULL_PATH': [pdfpath + f for f in lspdf]})
# File size in KB, rounded to two decimals.
dfPDF['FILE_SIZE_KB'] = [round(os.path.getsize(p) / 1024, 2) for p in dfPDF['FULL_PATH']]
dfPDF['FILE_NAME'] = lspdf
dfPDF['RESPONSE'] = None

dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,


In [3]:
# Restrict the run to the first MAX_FILES PDFs to keep the test round-trip short.
MAX_FILES = 5
# .copy() detaches the subset from the full listing: later cells write into the
# RESPONSE column, and writing into a bare slice triggers SettingWithCopyWarning
# (or is silently lost under pandas Copy-on-Write).
dfPDF = dfPDF.iloc[:MAX_FILES].copy()
dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,


In [4]:
# Upload every PDF in dfPDF to the product/supplier extraction endpoint and
# store the 'products_and_suppliers' payload in dfPDF['RESPONSE'].
# A row whose request fails keeps RESPONSE = None and the error is printed.
url = "http://127.0.0.1:8000/v1_get_products_and_suppliers"
#url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for full_path, file_name in zip(dfPDF['FULL_PATH'], dfPDF['FILE_NAME']):
    result = None
    try:
        # Context manager guarantees the upload handle is closed, even on failure.
        with open(full_path, 'rb') as pdf_file:
            files = [('inputListDocumentation', pdf_file)]
            data = {'inputSecret': input_secret}
            # timeout keeps a stalled server from hanging the notebook forever.
            response = requests.post(url, files=files, data=data, timeout=300, verify=False)
        # Treat HTTP 4xx/5xx as a failure instead of parsing the error body.
        response.raise_for_status()
        result = response.json()['products_and_suppliers']
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Report the cause; KeyboardInterrupt is deliberately not swallowed.
        print('ERROR   ', file_name, repr(exc))
    responses.append(result)

# Assign the whole column at once (no chained `df[col].iat[i] = ...` writes);
# dtype=object keeps each list payload as a single cell value.
dfPDF['RESPONSE'] = pd.Series(responses, index=dfPDF.index, dtype=object)
dfPDF.head()

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf


Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,"[{'PRODUCT_NAME': 'ACTIMALT LIQUID REGULAR', '..."
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,[{'PRODUCT_NAME': 'FAMB279 N&A BROWN SUGAR TYP...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,"[{'PRODUCT_NAME': 'Acerola juice concentrate, ..."
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,"[{'PRODUCT_NAME': 'Accelerzyme CPG BF', 'SUPPL..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,"[{'PRODUCT_NAME': 'Apple juice concentrate', '..."


In [5]:
# Explode dfPDF into one row per (file, product): each RESPONSE cell holds a
# list of dicts with 'PRODUCT_NAME' and 'SUPPLIER_NAME'.
records = []
for full_path, file_name, products in dfPDF[['FULL_PATH', 'FILE_NAME', 'RESPONSE']].itertuples(index=False, name=None):
    # Skip rows without a usable payload (None / NaN left by a failed request).
    if not isinstance(products, (list, tuple)):
        continue
    for prod in products:
        records.append({'FULL_PATH': full_path,
                        'FILE_NAME': file_name,
                        'PRODUCT': prod['PRODUCT_NAME'],
                        'SUPPLIER': prod['SUPPLIER_NAME']})

# Build the frame once from plain records; passing `columns` keeps the schema
# even when there are no records (pd.concat on an empty list would raise).
dfPROD = pd.DataFrame(records, columns=['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER'])

# Encode each PDF once and reuse it for every product found in that file.
# NOTE(review): assumes pdf_to_base64 depends only on the file at `path` — confirm.
b64_by_path = {path: pdf_to_base64(path) for path in dfPROD['FULL_PATH'].unique()}
dfPROD['B64'] = dfPROD['FULL_PATH'].map(b64_by_path)
dfPROD['B64_LEN'] = dfPROD['B64'].map(len)
dfPROD.head()

Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",JVBERi0xLjUNJeLjz9MNCjE5IDAgb2JqDTw8L0xpbmVhcm...,24648
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,JVBERi0xLjQKJeLjz9MNCjEgMCBvYmoKPDwgCi9DcmVhdG...,129544
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,JVBERi0xLjQKJeLjz9MKCjEgMCBvYmoKPDwvVHlwZSAvQ2...,194512
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,JVBERi0xLjcKjp2jtMXW5/gKMiAwIG9iagpbL0lDQ0Jhc2...,98116


In [6]:
# Optional: uncomment to limit the run to the first two products (quick smoke test).
# dfPROD = dfPROD.iloc[:2]
# dfPROD

In [6]:
##############################
# OPTION1: USING FILE UPLOAD #
##############################

# For every product row, upload the source PDF as a multipart file to the
# PIM-field parser and store the decoded JSON body in dfPROD['RESPONSE'].
# A row whose request fails keeps RESPONSE = None and the error is printed.
dfPROD['RESPONSE'] = None

url = "http://127.0.0.1:8000/v1_parse_pim_fields"
# url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for full_path, file_name, product in zip(dfPROD['FULL_PATH'], dfPROD['FILE_NAME'], dfPROD['PRODUCT']):
    result = None
    try:
        data = {
            "inputProductName": product,
            "inputBusinessLine": businessLine,
            "inputSecret": input_secret,
            # booleans sent as strings in multipart forms
            "inputWebSearch": "false",
            "inputParallel": "true"}
        # Context manager guarantees the upload handle is closed, even on failure.
        with open(full_path, "rb") as pdf_file:
            files = [("inputListDocumentation", (file_name, pdf_file, "application/pdf"))]
            response = requests.post(url, data=data, files=files, timeout=300, verify=False)
        # Treat HTTP 4xx/5xx as a failure instead of storing the error body.
        response.raise_for_status()
        result = response.json()
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Report the cause; KeyboardInterrupt is deliberately not swallowed.
        print('FAILURE ', file_name, repr(exc))
    responses.append(result)

# Assign the whole column at once (no chained `df[col].iat[i] = ...` writes).
dfPROD['RESPONSE'] = pd.Series(responses, index=dfPROD.index, dtype=object)
dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",JVBERi0xLjUNJeLjz9MNCjE5IDAgb2JqDTw8L0xpbmVhcm...,24648,{'inputProductName': 'FAMB279 N&A BROWN SUGAR ...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,JVBERi0xLjQKJeLjz9MNCjEgMCBvYmoKPDwgCi9DcmVhdG...,129544,{'inputProductName': 'Acerola juice concentrat...
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,JVBERi0xLjQKJeLjz9MKCjEgMCBvYmoKPDwvVHlwZSAvQ2...,194512,"{'inputProductName': 'Accelerzyme CPG BF', 'in..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,JVBERi0xLjcKjp2jtMXW5/gKMiAwIG9iagpbL0lDQ0Jhc2...,98116,{'inputProductName': 'Apple juice concentrate'...


In [16]:
######################
# OPTION2: USING B64 #
######################

# For every product row, send the base64-encoded PDF as a form field to the
# b64 variant of the PIM-field parser and store the decoded JSON body in
# dfPROD['RESPONSE']. A row whose request fails keeps RESPONSE = None and the
# error is printed.
dfPROD['RESPONSE'] = None

url = "http://127.0.0.1:8000/v1_parse_pim_fields_b64"
# url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields_b64"
input_secret = os.getenv('CUSTOM_SECRET1')

responses = []
for file_name, product, pdf_base64 in zip(dfPROD['FILE_NAME'], dfPROD['PRODUCT'], dfPROD['B64']):
    result = None
    try:
        data = {
            "inputProductName": product,
            "inputBusinessLine": businessLine,
            "inputSecret": input_secret,
            "inputListDocumentationB64": [pdf_base64],
            "inputWebSearch": "false",
            "inputParallel": "true"}
        response = requests.post(url, data=data, timeout=300, verify=False)
        # Treat HTTP 4xx/5xx as a failure instead of storing the error body.
        response.raise_for_status()
        result = response.json()
        print('SUCCESS ', file_name, len(pdf_base64))
    except Exception as exc:
        # Report the cause; KeyboardInterrupt is deliberately not swallowed.
        print('FAILURE ', file_name, repr(exc))
    responses.append(result)

# Assign the whole column at once (no chained `df[col].iat[i] = ...` writes).
dfPROD['RESPONSE'] = pd.Series(responses, index=dfPROD.index, dtype=object)
dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf 134864
SUCCESS  PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf 24648
SUCCESS  PIM000003729-4. Acerola JC_TH.pdf 129544
SUCCESS  PIM000003890-ACCELERZYME CPG.pdf 194512
SUCCESS  PIM000007436-Specification_1101103010101_Apple juice concentrate_Austria Juice Standard - Stock quality - Medium acidity.pdf 98116


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",JVBERi0xLjUNJeLjz9MNCjE5IDAgb2JqDTw8L0xpbmVhcm...,24648,{'inputProductName': 'FAMB279 N&A BROWN SUGAR ...
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,JVBERi0xLjQKJeLjz9MNCjEgMCBvYmoKPDwgCi9DcmVhdG...,129544,{'inputProductName': 'Acerola juice concentrat...
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,JVBERi0xLjQKJeLjz9MKCjEgMCBvYmoKPDwvVHlwZSAvQ2...,194512,"{'inputProductName': 'Accelerzyme CPG BF', 'in..."
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,JVBERi0xLjcKjp2jtMXW5/gKMiAwIG9iagpbL0lDQ0Jhc2...,98116,{'inputProductName': 'Apple juice concentrate'...


# REARRANGE: flatten the RESPONSE payload into columns and export to Excel

In [7]:
# Flatten the parser responses into a review table: FILE_NAME / PRODUCT plus
# the selected gpt_* answer and reason fields, then export it to tmp.xlsx.
lscol = ['FILE_NAME', 'PRODUCT',   
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_select_applications_reason', 'gpt_cas_from_doc_answer',
       'gpt_cas_from_doc_reason', 'gpt_physical_form_answer',
       'gpt_physical_form_reason', 'gpt_gen_product_description',
       'gpt_recommended_dosage_answer', 'gpt_recommended_dosage_reason',
       'gpt_certifications_answer', 'gpt_certifications_reason',
       'gpt_claims_answer', 'gpt_claims_reason', 'gpt_health_benefits_answer',
       'gpt_health_benefits_reason']

# Columns taken straight from dfPROD versus fields pulled out of each response.
id_cols = ['FILE_NAME', 'PRODUCT']
response_cols = [col for col in lscol if col not in id_cols]

# Extract only the exported fields, so the secret and the base64 payloads that
# the response also contains are never copied into the frame. A failed row
# (RESPONSE is not a dict) or a missing key yields None instead of aborting.
flattened = pd.DataFrame(
    [{col: (resp.get(col) if isinstance(resp, dict) else None) for col in response_cols}
     for resp in dfPROD['RESPONSE']],
    index=dfPROD.index,
    columns=response_cols,
)
dfPROD2 = pd.concat([dfPROD[id_cols], flattened], axis=1)[lscol]
dfPROD2.to_excel('tmp.xlsx', index=False)

dfPROD2

Unnamed: 0,FILE_NAME,PRODUCT,gpt_manufacturer_or_supplier_answer,gpt_manufacturer_or_supplier_reason,gpt_select_industry_cluster_answer,gpt_select_industry_cluster_reason,gpt_select_compositions_answer,gpt_select_compositions_reason,gpt_select_functions_answer,gpt_select_functions_reason,...,gpt_physical_form_reason,gpt_gen_product_description,gpt_recommended_dosage_answer,gpt_recommended_dosage_reason,gpt_certifications_answer,gpt_certifications_reason,gpt_claims_answer,gpt_claims_reason,gpt_health_benefits_answer,gpt_health_benefits_reason
0,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,The document header and product specification ...,[Beverage & Dairy (BD)],Selected Beverage & Dairy (BD) because the pro...,"[Cereals/Gluten, Sugars/Carbohydrates]",Selected Cereals/Gluten because the product co...,"[Dietary Fiber, Flavouring & Flavour Modulatio...",Selected Dietary Fiber because the document li...,...,Selected Liquid because the product is describ...,ACTIMALT LIQUID REGULAR is a viscous yellow-br...,,No recommended dosage instructions are mention...,[],No certifications related to ACTIMALT LIQUID R...,[],No mention of fermentation process or bio-ferm...,[],No applicable health benefits because product ...
1,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,FAMB279 N&A BROWN SUGAR TYPE FL,"WILD Flavors, Inc.",The product specification document for FAMB279...,"[Beverage & Dairy (BD), Confectionary & Bakery...","Selected Beverage & Dairy (BD), Confectionary ...","[Gum Arabic, Maltodextrin, Nature Identical Fl...",Selected all listed ingredients because they a...,"[Carrier & Bulking Agent, Colour & Colour Rete...",Selected Carrier & Bulking Agent because ingre...,...,Selected Powder because the Appearance is desc...,FAMB279 N&A BROWN SUGAR TYPE FL is a light bro...,,No recommended dosage instructions are mention...,"[GRAS, FEMAGRAS]","Selected GRAS, FEMAGRAS because the document s...",[],"No mention of bio-fermentation, meat or dairy ...",[],No applicable health benefits because product ...
2,PIM000003729-4. Acerola JC_TH.pdf,"Acerola juice concentrate, clarified R=64-68",SVZ Tomaszow Sp. z o.o.,The product specification document for Acerola...,"[Beverage & Dairy (BD), Food Supplements & Nut...",Selected Beverage & Dairy (BD) because the pro...,"[Fruit juices and concentrates, Vitamins, Acid...",Selected 'Fruit juices and concentrates' becau...,"[Acidity Regulator, Antioxidant, Colour & Colo...",Selected Acidity Regulator because the product...,...,Selected Liquid because the product is describ...,"Acerola juice concentrate, clarified R=64-68 f...",,No recommended dosage instructions are mention...,[],No certifications related to the product Acero...,[],"No mention of bio-fermentation, meat or dairy ...",[],No applicable health benefits because product ...
3,PIM000003890-ACCELERZYME CPG.pdf,Accelerzyme CPG BF,DSM Food Specialties B.V.,The product specification sheet for Accelerzym...,[Processed Food & Food Service (PFFS)],Selected Processed Food & Food Service (PFFS) ...,"[Protease enzyme, Glycerol]",Selected Protease enzyme because the product i...,"[Food Enzyme, Humectant]",Selected Food Enzyme because the product is de...,...,Selected Liquid because the product descriptio...,Accelerzyme CPG BF is a liquid carboxypeptidas...,,No recommended dosage instructions are mention...,"[ISO, CODEX, JECFA, HALAL, JAKIM, MUI, Kosher,...",Selected ISO because the document states analy...,[Bio-Fermentation],Selected Bio-Fermentation because the product ...,[],No applicable health benefits because product ...
4,PIM000007436-Specification_1101103010101_Apple...,Apple juice concentrate,AUSTRIA JUICE GmbH,The document is a product specification for Ap...,[Beverage & Dairy (BD)],Selected Beverage & Dairy (BD) because the pro...,"[Fruit juice, Malic acid]",Selected 'Fruit juice' because the product is ...,[Acidity Regulator],Selected Acidity Regulator because the documen...,...,Selected Liquid because the document mentions ...,Apple juice concentrate from AUSTRIA JUICE Gmb...,,No recommended dosage instructions are mention...,[],No certifications related to Apple juice conce...,[],"No mention of bio-fermentation processes, meat...",[],No applicable health benefits because product ...


In [14]:
# Show the column labels currently on dfPROD2.
# NOTE(review): the saved output below lists FULL_PATH, RESPONSE, input*/stg_* columns,
# which does not match the `dfPROD2[lscol]` frame built in the cell above — presumably
# stale output from an out-of-order run; restart and run all to confirm.
dfPROD2.columns

Index(['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER', 'B64', 'B64_LEN',
       'RESPONSE', 'inputProductName', 'inputBusinessLine',
       'inputListDocumentation', 'inputSecret', 'inputWebSearch',
       'inputParallel', 'stg_lsTempFile', 'stg_businessLineStr',
       'stg_hashinputProductName', 'stg_hashinputBusinessLine',
       'stg_hashinputListDocumentation', 'stg_hashCombined',
       'stg_lsParsedText', 'stg_parsedText', 'stg_lsBase64',
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason', 'gpt_composition_search_answer',
       'gpt_function_search_answer', 'gpt_application_search_answer',
       'gpt_combined_web_search', 'gpt_text_of_this_product_only_answer',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_

In [11]:
# List every key present in the first product's parsed response.
[*dfPROD['RESPONSE'].iloc[0].keys()]

['inputProductName',
 'inputBusinessLine',
 'inputListDocumentation',
 'inputSecret',
 'inputWebSearch',
 'inputParallel',
 'stg_lsTempFile',
 'stg_businessLineStr',
 'stg_hashinputProductName',
 'stg_hashinputBusinessLine',
 'stg_hashinputListDocumentation',
 'stg_hashCombined',
 'stg_lsParsedText',
 'stg_parsedText',
 'stg_lsBase64',
 'gpt_manufacturer_or_supplier_answer',
 'gpt_manufacturer_or_supplier_reason',
 'gpt_composition_search_answer',
 'gpt_function_search_answer',
 'gpt_application_search_answer',
 'gpt_combined_web_search',
 'gpt_text_of_this_product_only_answer',
 'gpt_select_industry_cluster_answer',
 'gpt_select_industry_cluster_reason',
 'gpt_select_compositions_answer',
 'gpt_select_compositions_reason',
 'gpt_select_functions_answer',
 'gpt_select_functions_reason',
 'gpt_select_applications_answer',
 'gpt_select_applications_reason',
 'gpt_cas_from_doc_answer',
 'gpt_cas_from_doc_reason',
 'gpt_physical_form_answer',
 'gpt_physical_form_reason',
 'gpt_gen_product_