In [3]:
# Standard library imports
import base64
import datetime
import io
import json
import os
import tempfile
import time
import uuid
# Third-party imports
import fitz
import numpy as np
import pandas as pd
from PIL import Image
import pycountry
import requests
import streamlit as st
from streamlit_js_eval import streamlit_js_eval
# Azure AI Document Intelligence
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
# Custom Utils
from customutils import *
# Load environment variables from a local .env file so that the
# os.getenv(...) lookups for API keys in the cells below can resolve them.
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
#####################
# USING LLAMA PARSE #
#####################


# Upload one document to the LlamaParse batch proxy and record the outcome
# under file_dict[filename]['S1_PARSE'].
#
# NOTE(review): `fileinfo`, `filename` and `file_dict` are not defined in the
# visible cells of this notebook - they must already exist in the kernel
# before this cell is run.

# Case-insensitive extension check so "REPORT.PDF" is not sent with the
# spreadsheet MIME type.
mime = (
    'application/pdf'
    if filename.lower().endswith('.pdf')
    else 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
)
data = {"apikey": os.getenv('LLAMA_CLOUD_API_KEY')}

try:
    with open(fileinfo['serverPath'], 'rb') as f:
        files = [('files', (filename, f, mime))]
        response = requests.post(
            "https://ancient-almeda-personal-personal-22e19704.koyeb.app/llama_parse_batch",
            data=data,
            files=files,
            # NOTE(review): verify=False turns off TLS certificate validation
            # while an API key is being transmitted. Kept as-is because it may
            # be a workaround for a TLS-intercepting proxy; prefer pointing
            # REQUESTS_CA_BUNDLE at the proper CA bundle and removing it.
            verify=False,
            # requests waits forever without a timeout: (connect, read) seconds.
            # The read value is a generous guess for server-side parsing - TODO tune.
            timeout=(10, 600))
except requests.RequestException as exc:
    # Connection errors / timeouts are recorded like HTTP failures instead of
    # aborting the cell with the status left unset.
    file_dict[filename]['S1_PARSE']['status'] = f'\u274c Error: ({type(exc).__name__})'
else:
    # Status markers are written as escapes (\u2705 = check mark, \u274c = cross
    # mark) so the stored string does not depend on this file's encoding.
    if response.status_code == 200:
        file_dict[filename]['S1_PARSE']['status'] = '\u2705 Success'
        file_dict[filename]['S1_PARSE']['result'] = response.json()['results']
    else:
        file_dict[filename]['S1_PARSE']['status'] = f'\u274c Error: (HTTP {response.status_code})'

In [2]:
####################################
# AZURE DOCUMENT INTELLIGENCE TEST #
####################################
def azureDocumentIntelligenceParsePDF(
        file_path: str,
        key: str,
        endpoint: str = "https://document-intelligence-standard-s0-main02.cognitiveservices.azure.com/",
        model_id: str = "prebuilt-read",
        verbose: bool = True) -> str:
    """Run a PDF through Azure AI Document Intelligence and return its text as Markdown.

    The file is analysed with ``model_id`` and the recognised lines are joined
    with newlines, each page preceded by a ``## Page N`` heading (N counts
    pages in the order the service returns them, starting at 1).

    Args:
        file_path: Path of the PDF to analyse; opened in binary mode.
        key: API key wrapped in ``AzureKeyCredential``.
        endpoint: Document Intelligence resource endpoint. Defaults to the
            endpoint this notebook was written against.
        model_id: Model passed to ``begin_analyze_document``.
        verbose: When true (default), also print the Markdown to stdout.

    Returns:
        The Markdown text. A document with no pages yields an empty string.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        Whatever the Azure SDK raises for authentication/service failures.
    """
    # Use the client as a context manager so its HTTP session is closed
    # instead of lingering in the kernel after every call.
    with DocumentIntelligenceClient(
            endpoint=endpoint, credential=AzureKeyCredential(key)) as document_intelligence_client:
        with open(file_path, "rb") as f:
            poller = document_intelligence_client.begin_analyze_document(
                model_id,
                f,
                content_type="application/pdf")
            # Block until the long-running analysis completes.
            result = poller.result()

    # Build Markdown content from lines
    markdown_lines = []
    # `pages` / `lines` are guarded with `or []`: the SDK models `lines` as
    # optional, and iterating None would raise TypeError.
    for page_num, page in enumerate(result.pages or [], start=1):
        markdown_lines.append(f"\n## Page {page_num}\n")
        for line in page.lines or []:
            markdown_lines.append(line.content)
    markdown_output = "\n".join(markdown_lines)

    if verbose:
        print("Markdown Output:\n")
        print(markdown_output)
    return markdown_output

# Local sample PDF (absolute, machine-specific path) used to exercise the parser.
pdfPath = r"C:\Users\khunakorn.l\OneDrive - DKSH\9999-99-99 GitRepo\dksh-raw-tds-parser\testFiles\FBI\PIM000003031-Actimalt Liquid Regular.pdf"
# Last expression of the cell, so the returned Markdown is also shown as the cell output.
azureDocumentIntelligenceParsePDF(
    file_path=pdfPath,
    key=os.environ.get('AZURE_DOCUMENT_INTELLIGENCE_API_KEY'),
)

Markdown Output:


## Page 1

Muntons
Passionate about malt
PRODUCT SPECIFICATION and NUTRITIONAL DATA
ACTIMALT LIQUID REGULAR
Description:
A viscous liquid produced by enzyme assisted hot water extraction of barley and
malted barley, followed by filtration and concentration under vacuum evaporation.
Appearance:
Odour:
Yellow-brown syrup
Pleasant with a characteristic cereal / malt odour
Sweet with a characteristic cereal / malt flavour
Ingredients:
Barley, Malted Barley, Water
Syrup Composition:
Soluble extract of barley and malted barley
Typically 77.1 - 79.6 %
Typically 20.4 - 22.9 %
Water
Allergen Information:
Contains Gluten: 2,570 mg/kg *
*
External analysis result - single random sample - Feb 2013 (for information only)
Laboratory: Campden Technology Ltd
Test Method: R-Biopharm RIDASCREEN Gliadin Competitive Immunoassay R7021
Suggested Ingredient Declaration for
Retail Product Labelling:
Barley and Malt Extract
Analytical Specification:
Refractometric Solids %
79.5 to 82
pH (10%

'\n## Page 1\n\nMuntons\nPassionate about malt\nPRODUCT SPECIFICATION and NUTRITIONAL DATA\nACTIMALT LIQUID REGULAR\nDescription:\nA viscous liquid produced by enzyme assisted hot water extraction of barley and\nmalted barley, followed by filtration and concentration under vacuum evaporation.\nAppearance:\nOdour:\nYellow-brown syrup\nPleasant with a characteristic cereal / malt odour\nSweet with a characteristic cereal / malt flavour\nIngredients:\nBarley, Malted Barley, Water\nSyrup Composition:\nSoluble extract of barley and malted barley\nTypically 77.1 - 79.6 %\nTypically 20.4 - 22.9 %\nWater\nAllergen Information:\nContains Gluten: 2,570 mg/kg *\n*\nExternal analysis result - single random sample - Feb 2013 (for information only)\nLaboratory: Campden Technology Ltd\nTest Method: R-Biopharm RIDASCREEN Gliadin Competitive Immunoassay R7021\nSuggested Ingredient Declaration for\nRetail Product Labelling:\nBarley and Malt Extract\nAnalytical Specification:\nRefractometric Solids %\n79

In [6]:
#####################
# AZURE OPENAI TEST #
#####################
import os
import requests
import json

# Chat-completions payload sent to the Azure OpenAI deployment below.
body = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "I am going to Paris, what should I see?"}
    ],
    "max_tokens": 4096,
    "temperature": 1.0,
    "top_p": 1.0
}

url = "https://azure-ai-services-main01.cognitiveservices.azure.com/openai/deployments/azure-ai-services-main01-gpt-4o-main01/chat/completions?api-version=2024-12-01-preview"
headers = {"Content-Type": "application/json", "api-key": os.getenv("AZURE_OPENAI_KEY")}
# An explicit timeout is required: without one requests waits indefinitely.
response = requests.post(url, headers=headers, data=json.dumps(body), timeout=120)
# Surface 4xx/5xx (bad key, wrong deployment, throttling) as an HTTPError
# rather than a misleading KeyError on "choices" below.
response.raise_for_status()

data = response.json()
print(data["choices"][0]["message"]["content"])

That’s exciting! Paris is a city full of history, art, culture, and romance. Here are some must-see places and activities to consider during your trip to the City of Light:

---

### **Iconic Landmarks**  
1. **Eiffel Tower**  
   - The symbol of Paris! Visit during the day for sweeping views of the city or at night to see it sparkle with lights. You can ascend by elevator or take the stairs for a more adventurous experience.

2. **Louvre Museum**  
   - The world’s largest art museum, home to masterpieces like the Mona Lisa and Venus de Milo. Even the architecture and glass pyramid are breathtaking!

3. **Notre-Dame Cathedral** (Note: Restoration following the 2019 fire may impact visits)  
   - A masterpiece of Gothic architecture. Don’t miss its intricate façade and, if possible, climb to the towers for views of the Seine River and surrounding city.

4. **Sacré-Cœur Basilica & Montmartre**  
   - This stunning white basilica sits atop Montmartre hill. The neighborhood itsel

In [7]:
################
# WEB - SEARCH #
################

# Ask the search-enabled chat model a single question through the proxy
# endpoint and display the answer text as the cell output.
question = 'What is the CAS number for product [CEDEPAL TD-403 MFLD] from manufacturer [Stepan Company]'
search_context_size = 'high'
body = {
    "model": "gpt-4o-search-preview",
    "web_search_options": {"search_context_size": search_context_size},
    "messages": [{"role": "user", "content": question}],
    "max_tokens": 4096,
}

response = requests.post(
    "https://ancient-almeda-personal-personal-22e19704.koyeb.app/openai",
    json=body,
    # NOTE(review): the API key travels in the URL query string, where it can
    # end up in proxy/server logs - confirm whether the proxy accepts a header.
    params={"apikey": os.getenv('OPENAI_API_KEY')},
    # NOTE(review): verify=False disables TLS certificate validation while a
    # secret is being sent. Kept as-is because it may be a workaround for a
    # TLS-intercepting proxy; prefer REQUESTS_CA_BUNDLE and remove it.
    verify=False,
    # Without a timeout requests waits indefinitely.
    timeout=120)
# Raise on 4xx/5xx instead of failing later with KeyError on 'choices'.
response.raise_for_status()

response.json()['choices'][0]['message']['content']



'The CAS number for CEDEPAL® TD-403 MFLD, a product from Stepan Company, is 25446-78-0. This product is also known by its INCI name, Sodium Trideceth Sulfate. ([knowde.com](https://www.knowde.com/stores/stepan-company/products/cedepal-td-403-mfld?utm_source=openai)) '

In [None]:
######################
# DEEP RESEARCH TEST #
######################
import os
import requests
import json

# Prompt for the deep-research run.
input_text = """
Research about the APPLICATIONS of [CEDEPAL TD-403 MFLD] utilization in the [Personal Care] industries
"""

body = {
    "model": "o3-deep-research",
    "input": input_text,
    "tools": [{"type": "web_search_preview"}],
    # "stream": True
}

url = "https://api.openai.com/v1/responses"
headers = {
    # Single quotes inside the f-string: reusing the outer quote character is
    # a SyntaxError on Python versions before 3.12.
    "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
    "Content-Type": "application/json",
}
# No stream=True: API-level streaming is disabled in `body`, and a streamed
# requests response that is never read or closed keeps its connection open.
# The one-hour timeout is kept because the request blocks until the run ends.
# NOTE(review): verify=False disables TLS certificate validation while the
# bearer token is sent. Kept as-is because it may be a workaround for a
# TLS-intercepting proxy; prefer REQUESTS_CA_BUNDLE and remove it.
response = requests.post(url, headers=headers, data=json.dumps(body), timeout=3600, verify=False)
# Fail on 4xx/5xx before trying to interpret the payload.
response.raise_for_status()

# Collect the text parts of the message items in the response's `output` list
# (other item types such as tool calls are skipped).
result = response.json()
report_text = "".join(
    part.get("text", "")
    for item in result.get("output", [])
    if item.get("type") == "message"
    for part in item.get("content", [])
    if part.get("type") == "output_text"
)
print(report_text)
# response = requests.post(url, headers=headers, data=json.dumps(body), timeout=3600, verify=False, stream=True)

# # Iterate over the streaming lines
# for line in response.iter_lines(decode_unicode=True):
#     if line and line.startswith("data: "):
#         data_str = line[len("data: "):]
#         if data_str.strip() == "[DONE]":
#             break
#         chunk = json.loads(data_str)
#         # The 'delta' field contains the incremental tokens
#         delta = chunk["choices"][0]["delta"]
#         print(delta.get("content", ""), end="", flush=True)

