In [2]:
# GENERAL
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Body
from typing import Dict, Any
from typing import List, Annotated
import asyncio
import random
import tempfile
import shutil
import os
import fitz
import io
import base64
import datetime
import hashlib
import time
import anyio
import requests
import json
import simple_salesforce
from PIL import Image

# URLLIB3
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# AZURE AI DOCUMENT INTELLIGENCE
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# LOAD ENV VARIABLES
from dotenv import load_dotenv
load_dotenv()

# CUSTOM UTILS
from customutils import *

In [3]:
import pandas as pd

# Business line whose test documents are processed in this notebook.
businessLine = 'FBI'

# DEFINE PATH
# Directory holding the test PDFs of the selected business line.
pdfpath = f"versionAchive/V0.9-alpha/testFiles/{businessLine}/"
# LIST ALL PDF
# os.listdir() returns entries in arbitrary order; sort so the frame (and any
# positional slicing applied to it afterwards) is reproducible across runs.
lspdf = sorted(f for f in os.listdir(pdfpath) if f.endswith('.pdf'))
# DF
# One row per PDF: full path, size in KB rounded to 2 decimals, bare file name,
# and an empty RESPONSE slot that is filled in by a later cell.
full_paths = [os.path.join(pdfpath, f) for f in lspdf]
dfPDF = pd.DataFrame({
    'FULL_PATH': full_paths,
    'FILE_SIZE_KB': [round(os.path.getsize(p) / 1024, 2) for p in full_paths],
    'FILE_NAME': lspdf,
})
dfPDF['RESPONSE'] = None

dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,
1,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,18.05,PIM000003047-8. FAMB279 BROWN SUGAR TYPE FL.pdf,
2,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,94.88,PIM000003729-4. Acerola JC_TH.pdf,
3,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,142.46,PIM000003890-ACCELERZYME CPG.pdf,
4,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,71.86,PIM000007436-Specification_1101103010101_Apple...,


In [4]:
# Keep only the first PDF for this run. Take an explicit copy so that later
# writes to the RESPONSE column act on an independent frame instead of a slice
# of the original (avoids pandas' SettingWithCopyWarning).
dfPDF = dfPDF.iloc[:1].copy()
dfPDF.head()

Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,


In [7]:
# Upload every PDF listed in dfPDF to the products/suppliers endpoint and store
# the 'products_and_suppliers' entry of the JSON reply in the RESPONSE column.
# Rows whose request fails keep RESPONSE = None and are reported as ERROR.
#url = "http://127.0.0.1:8000/v1_get_products_and_suppliers"
url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_get_products_and_suppliers"
input_secret = os.getenv('CUSTOM_SECRET1')

for idx in dfPDF.index:
    full_path = dfPDF.at[idx, 'FULL_PATH']
    file_name = dfPDF.at[idx, 'FILE_NAME']
    try:
        # Context manager guarantees the PDF handle is closed after the upload.
        with open(full_path, 'rb') as pdf_file:
            files = [('inputListDocumentation', pdf_file)]
            data = {'inputSecret': input_secret}
            # NOTE(review): verify=False disables TLS certificate validation;
            # kept as in the original, confirm whether it is still required.
            response = requests.post(url, files=files, data=data, timeout=300, verify=False)
        # Treat HTTP error statuses as failures instead of parsing their body.
        response.raise_for_status()
        # Write through the frame itself (.at) rather than through a column
        # Series, which is chained assignment and may not update the frame.
        dfPDF.at[idx, 'RESPONSE'] = response.json()['products_and_suppliers']
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Keep going with the remaining files, but show why this one failed.
        print('ERROR   ', file_name, repr(exc))
dfPDF.head()

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf


Unnamed: 0,FULL_PATH,FILE_SIZE_KB,FILE_NAME,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,98.78,PIM000003031-Actimalt Liquid Regular.pdf,"[{'PRODUCT_NAME': 'ACTIMALT LIQUID REGULAR', '..."


In [8]:
# Flatten the per-file RESPONSE lists into one record per detected product.
records = []
for full_path, file_name, products in zip(dfPDF['FULL_PATH'], dfPDF['FILE_NAME'], dfPDF['RESPONSE']):
    # Files without a usable response (e.g. still None) contribute no rows.
    if not isinstance(products, (list, tuple)):
        continue
    for prod in products:
        records.append({'FULL_PATH': full_path,
                        'FILE_NAME': file_name,
                        'PRODUCT': prod['PRODUCT_NAME'],
                        'SUPPLIER': prod['SUPPLIER_NAME']})

# Passing explicit columns keeps the frame well-formed even when no product was
# found at all (pd.concat on an empty list would raise instead).
dfPROD = pd.DataFrame(records, columns=['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER'])

# Encode each distinct PDF only once, even if it produced several products.
# Assumes pdf_to_base64 (from customutils) depends only on the path - TODO confirm.
b64_by_path = {path: pdf_to_base64(path) for path in dict.fromkeys(dfPROD['FULL_PATH'])}
dfPROD['B64'] = [b64_by_path[path] for path in dfPROD['FULL_PATH']]
dfPROD['B64_LEN'] = [len(x) for x in dfPROD['B64']]
dfPROD.head()

Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864


In [6]:
# Optional: uncomment to restrict dfPROD to its first two rows.
# dfPROD = dfPROD.iloc[:2]
# dfPROD

In [10]:
##############################
# OPTION1: USING FILE UPLOAD #
##############################

# For every product row, upload its PDF as a multipart file together with the
# product name / business line, and store the parsed JSON reply in RESPONSE.
# Rows whose request fails keep RESPONSE = None and are reported as FAILURE.
# url = "http://127.0.0.1:8000/v1_parse_pim_fields"
url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields"

dfPROD['RESPONSE'] = None
for idx in dfPROD.index:
    full_path = dfPROD.at[idx, 'FULL_PATH']
    file_name = dfPROD.at[idx, 'FILE_NAME']
    try:
        data = {
            "inputProductName": dfPROD.at[idx, 'PRODUCT'],
            "inputBusinessLine": businessLine,
            "inputSecret": os.getenv('CUSTOM_SECRET1'),
            # booleans sent as strings in multipart forms
            "inputWebSearch": "false",
            "inputParallel": "true"}
        # Context manager guarantees the PDF handle is closed after the upload.
        with open(full_path, "rb") as pdf_file:
            files = [("inputListDocumentation", (file_name, pdf_file, "application/pdf"))]
            # NOTE(review): verify=False disables TLS certificate validation;
            # kept as in the original, confirm whether it is still required.
            response = requests.post(url, data=data, files=files, timeout=300, verify=False)
        # Treat HTTP error statuses as failures instead of storing their body.
        response.raise_for_status()
        # Write through the frame itself (.at) rather than through a column
        # Series, which is chained assignment and may not update the frame.
        dfPROD.at[idx, 'RESPONSE'] = response.json()
        print('SUCCESS ', file_name)
    except Exception as exc:
        # Keep going with the remaining rows, but show why this one failed.
        print('FAILURE ', file_name, repr(exc))
dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...


In [12]:
######################
# OPTION2: USING B64 #
######################

dfPROD['RESPONSE'] = None
for i in range(len(dfPROD)):
    try:
        row = dfPROD.iloc[i]
        full_path = row['FULL_PATH']
        file_name = row['FILE_NAME']
        pdf_base64 = row['B64']
        # url = "http://127.0.0.1:8000/v1_parse_pim_fields_b64"
        url = "https://web-app-basic-b3-dksh-raw-tds-parser-acgchxgncqdjb2ew.southeastasia-01.azurewebsites.net/v1_parse_pim_fields_b64"
        data = {
            "inputProductName": row['PRODUCT'],
            "inputBusinessLine": businessLine,
            "inputSecret": os.getenv('CUSTOM_SECRET1'),
            "inputListDocumentationB64": [pdf_base64],
            "inputWebSearch": "false",
            "inputParallel": "true"}
        response = requests.post(url, data=data, timeout=300, verify=False)
        dfPROD['RESPONSE'].iat[i] = response.json()
        print('SUCCESS ', file_name, len(pdf_base64))
    except:
        print('FAILURE ', file_name)
dfPROD

SUCCESS  PIM000003031-Actimalt Liquid Regular.pdf 134864


Unnamed: 0,FULL_PATH,FILE_NAME,PRODUCT,SUPPLIER,B64,B64_LEN,RESPONSE
0,versionAchive/V0.9-alpha/testFiles/FBI/PIM0000...,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,JVBERi0xLjUNCiW1tbW1DQoxIDAgb2JqDQo8PC9UeXBlL0...,134864,{'inputProductName': 'ACTIMALT LIQUID REGULAR'...


# REARRANGE

In [11]:
# Expand every response dict into one column per key, keep the report columns
# listed in lscol, and export them to tmp.xlsx.
# Rows without a dict response (e.g. a failed request left None) are treated as
# empty, so they yield None cells instead of aborting the whole cell.
responses = [resp if isinstance(resp, dict) else {} for resp in dfPROD['RESPONSE']]
# Union of keys over all responses, in first-seen order.
keys = list(dict.fromkeys(key for resp in responses for key in resp))

dfPROD2 = dfPROD.copy()
for key in keys:
    dfPROD2[key] = [resp.get(key) for resp in responses]

lscol = ['FILE_NAME', 'PRODUCT',
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_select_applications_reason', 'gpt_cas_from_doc_answer',
       'gpt_cas_from_doc_reason', 'gpt_physical_form_answer',
       'gpt_physical_form_reason', 'gpt_gen_product_description',
       'gpt_recommended_dosage_answer', 'gpt_recommended_dosage_reason',
       'gpt_certifications_answer', 'gpt_certifications_reason',
       'gpt_claims_answer', 'gpt_claims_reason', 'gpt_health_benefits_answer',
       'gpt_health_benefits_reason']

# Report (rather than crash on) expected columns that no response provided;
# reindex() creates them as empty columns so the export layout stays stable.
missing = [col for col in lscol if col not in dfPROD2.columns]
if missing:
    print('MISSING COLUMNS ', missing)
dfPROD2 = dfPROD2.reindex(columns=lscol)
dfPROD2.to_excel('tmp.xlsx', index=False)

dfPROD2

Unnamed: 0,FILE_NAME,PRODUCT,gpt_manufacturer_or_supplier_answer,gpt_manufacturer_or_supplier_reason,gpt_select_industry_cluster_answer,gpt_select_industry_cluster_reason,gpt_select_compositions_answer,gpt_select_compositions_reason,gpt_select_functions_answer,gpt_select_functions_reason,...,gpt_physical_form_reason,gpt_gen_product_description,gpt_recommended_dosage_answer,gpt_recommended_dosage_reason,gpt_certifications_answer,gpt_certifications_reason,gpt_claims_answer,gpt_claims_reason,gpt_health_benefits_answer,gpt_health_benefits_reason
0,PIM000003031-Actimalt Liquid Regular.pdf,ACTIMALT LIQUID REGULAR,Muntons,The document header and product specification ...,[Beverage & Dairy (BD)],Selected Beverage & Dairy (BD) because the pro...,"[Cereals/Gluten, Sugars/Carbohydrates]",Selected Cereals/Gluten because the product co...,"[Dietary Fiber, Flavouring & Flavour Modulatio...",Selected Dietary Fiber because the document li...,...,Selected Liquid because the product is describ...,ACTIMALT LIQUID REGULAR is a viscous yellow-br...,,No recommended dosage instructions are mention...,[],No certifications related to ACTIMALT LIQUID R...,[],No mention of fermentation process or bio-ferm...,[],No applicable health benefits because product ...


In [14]:
# Show the column labels currently present on dfPROD2.
dfPROD2.columns

Index(['FULL_PATH', 'FILE_NAME', 'PRODUCT', 'SUPPLIER', 'B64', 'B64_LEN',
       'RESPONSE', 'inputProductName', 'inputBusinessLine',
       'inputListDocumentation', 'inputSecret', 'inputWebSearch',
       'inputParallel', 'stg_lsTempFile', 'stg_businessLineStr',
       'stg_hashinputProductName', 'stg_hashinputBusinessLine',
       'stg_hashinputListDocumentation', 'stg_hashCombined',
       'stg_lsParsedText', 'stg_parsedText', 'stg_lsBase64',
       'gpt_manufacturer_or_supplier_answer',
       'gpt_manufacturer_or_supplier_reason', 'gpt_composition_search_answer',
       'gpt_function_search_answer', 'gpt_application_search_answer',
       'gpt_combined_web_search', 'gpt_text_of_this_product_only_answer',
       'gpt_select_industry_cluster_answer',
       'gpt_select_industry_cluster_reason', 'gpt_select_compositions_answer',
       'gpt_select_compositions_reason', 'gpt_select_functions_answer',
       'gpt_select_functions_reason', 'gpt_select_applications_answer',
       'gpt_

In [11]:
# Key names of the first stored response (iterating a dict yields its keys).
list(dfPROD['RESPONSE'].iat[0])

['inputProductName',
 'inputBusinessLine',
 'inputListDocumentation',
 'inputSecret',
 'inputWebSearch',
 'inputParallel',
 'stg_lsTempFile',
 'stg_businessLineStr',
 'stg_hashinputProductName',
 'stg_hashinputBusinessLine',
 'stg_hashinputListDocumentation',
 'stg_hashCombined',
 'stg_lsParsedText',
 'stg_parsedText',
 'stg_lsBase64',
 'gpt_manufacturer_or_supplier_answer',
 'gpt_manufacturer_or_supplier_reason',
 'gpt_composition_search_answer',
 'gpt_function_search_answer',
 'gpt_application_search_answer',
 'gpt_combined_web_search',
 'gpt_text_of_this_product_only_answer',
 'gpt_select_industry_cluster_answer',
 'gpt_select_industry_cluster_reason',
 'gpt_select_compositions_answer',
 'gpt_select_compositions_reason',
 'gpt_select_functions_answer',
 'gpt_select_functions_reason',
 'gpt_select_applications_answer',
 'gpt_select_applications_reason',
 'gpt_cas_from_doc_answer',
 'gpt_cas_from_doc_reason',
 'gpt_physical_form_answer',
 'gpt_physical_form_reason',
 'gpt_gen_product_

In [None]:
{
  "inputProductName": "ACTIMALT LIQUID REGULAR",
  "inputBusinessLine": "FBI",
  "inputListDocumentation": "HIDDEN",
  "inputSecret": "HIDDEN",
  "inputWebSearch": false,
  "inputParallel": true,
  "stg_lsTempFile": "HIDDEN",
  "stg_businessLineStr": "Food & Beverage",
  "stg_hashinputProductName": "d76a894df91bcc7a7ed5729d1656ed1d0ac030c4413752c5cd3b087247390179",
  "stg_hashinputBusinessLine": "c6cd4cf936fd5ad884ed4c278d147982124a6b7df27d95ddf58cd7a60660664c",
  "stg_hashinputListDocumentation": "5443bfd8a58669c54d69f7a9773fe5e987b36ea7f60a4cdd973467c1af10e28f",
  "stg_hashCombined": "aa3f690107490e9bd4c1a88e39a7c2d9ae4c5cacd7253411168336fbd84790d8",
  "stg_lsParsedText": "HIDDEN",
  "stg_parsedText": "HIDDEN",
  "stg_lsBase64": "HIDDEN",
  "gpt_manufacturer_or_supplier_answer": "Muntons",
  "gpt_manufacturer_or_supplier_reason": "The document header and product specification sheet clearly indicate 'Muntons' as the entity associated with ACTIMALT LIQUID REGULAR, suggesting Muntons is the manufacturer or supplier of this product.",
  "gpt_composition_search_answer": "",
  "gpt_function_search_answer": "",
  "gpt_application_search_answer": "",
  "gpt_combined_web_search": "",
  "gpt_text_of_this_product_only_answer": "HIDDEN",
  "gpt_select_industry_cluster_answer": [
    "Beverage & Dairy (BD)"
  ],
  "gpt_select_industry_cluster_reason": "Selected Beverage & Dairy (BD) because the product is a viscous syrup derived from barley and malted barley, commonly used as an ingredient in beverages such as malt drinks and dairy-based malted beverages, as indicated by its description and typical use. There is no mention of use in confectionery, bakery, food supplements, or processed food service in the document.",
  "gpt_select_compositions_answer": [
    "Cereals/Gluten",
    "Sugars/Carbohydrates"
  ],
  "gpt_select_compositions_reason": "Selected Cereals/Gluten because the product contains barley and malted barley and explicitly mentions gluten content (2,570 mg/kg). Selected Sugars/Carbohydrates because the nutritional information details various sugars (maltose, maltotriose, glucose, fructose, sucrose) and total carbohydrates present in the product.",
  "gpt_select_functions_answer": [
    "Dietary Fiber",
    "Flavouring & Flavour Modulation",
    "Protein",
    "Sweetener"
  ],
  "gpt_select_functions_reason": "Selected Dietary Fiber because the document lists Total Dietary Fibre as 1.5 g per 100g. Selected Flavouring & Flavour Modulation because the product has a characteristic cereal/malt flavour and odour as described. Selected Protein because the nutritional information shows 2.5-4.0 g protein per 100g. Selected Sweetener because the product is described as sweet and contains significant sugars (maltose 30-40 g, maltotriose 6-10 g, glucose, sucrose, fructose). No other functions are mentioned or implied in the document.",
  "gpt_select_applications_answer": [
    "Processed Food",
    "Ingredients Manufacturers"
  ],
  "gpt_select_applications_reason": "Selected Processed Food because the product is a barley and malt extract syrup used as an ingredient in food processing, as indicated by its description and composition. Selected Ingredients Manufacturers because the product is an ingredient (barley and malt extract syrup) intended for use in manufacturing other food products, as shown by the detailed ingredient and nutritional data.",
  "gpt_cas_from_doc_answer": "N/A",
  "gpt_cas_from_doc_reason": "The document provides detailed product specifications, nutritional data, and ingredient information for ACTIMALT LIQUID REGULAR by Muntons, but does not mention any CAS Registry Number.",
  "gpt_physical_form_answer": "Liquid",
  "gpt_physical_form_reason": "Selected Liquid because the product is described as a viscous liquid and yellow-brown syrup in the description and appearance sections of the document.",
  "gpt_gen_product_description": "ACTIMALT LIQUID REGULAR is a viscous yellow-brown syrup made by enzyme-assisted hot water extraction of barley and malted barley, followed by filtration and vacuum concentration. It has a sweet, characteristic malt flavor and cereal aroma. This syrup contains a high soluble extract of barley and malt, making it suitable for use as a natural malt extract ingredient in food and beverage applications.",
  "gpt_recommended_dosage_answer": "N/A",
  "gpt_recommended_dosage_reason": "No recommended dosage instructions are mentioned in the document for ACTIMALT LIQUID REGULAR by Muntons.",
  "gpt_certifications_answer": [],
  "gpt_certifications_reason": "No certifications related to ACTIMALT LIQUID REGULAR by Muntons are mentioned in the provided document.",
  "gpt_claims_answer": [],
  "gpt_claims_reason": "No mention of fermentation process or bio-fermentation is found; product is a barley and malt extract syrup, not a meat or dairy alternative; no claims or information about natural or ethical sourcing; no data on resource or energy optimization; no indication of sustainable food waste reduction; no mention of upcycling or use of by-products.",
  "gpt_health_benefits_answer": [],
  "gpt_health_benefits_reason": "No applicable health benefits because product functions not in the required list (Dietary Fiber, Food Culture, Fortification/Nutraceutical, Probiotic/Postbiotic, Protein)",
  "time_start": "2026-01-20 01:20:17.903599",
  "time_end": "2026-01-20 01:20:28.019786",
  "time_duration": "10.116187"
}