In [1]:
# GENERAL
from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Body
from typing import Dict, Any
from typing import List, Annotated
import asyncio
import random
import tempfile
import shutil
import os
import fitz
import io
import base64
import datetime
import hashlib
import time
import anyio
import requests
import json
import simple_salesforce
from PIL import Image

# URLLIB3
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# AZURE AI DOCUMENT INTELLIGENCE
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

# LOAD ENV VARIABLES
from dotenv import load_dotenv
load_dotenv()

# CUSTOM UTILS
from customutils import *

In [2]:
def _conSF():
    # GET TOKEN IF NO TOKEN YET
    if 'SALESFORCE_ACCESS_TOKEN' not in os.environ or 'SALESFORCE_INSTANCE_URL' not in os.environ:
        # GET TOKEN
        import requests
        url = os.getenv('SFDC_URL')
        payload = {}
        headers = {'Authorization': os.getenv('SFDC_AUTH'),
                'Cookie': os.getenv('SFDC_COOKIE')}
        response = requests.request("POST", url, headers=headers, data=payload)
        os.environ['SALESFORCE_ACCESS_TOKEN'] = response.json()['access_token']
        os.environ['SALESFORCE_INSTANCE_URL'] = response.json()['instance_url']
    # MAIN
    from simple_salesforce import Salesforce
    sf = Salesforce(instance_url=os.environ['SALESFORCE_INSTANCE_URL'], session_id=os.environ['SALESFORCE_ACCESS_TOKEN'])
    return sf

In [3]:
import pandas as pd

def queryAllSf(tableName):
    """Load every field of every record of one Salesforce object into a DataFrame.

    Args:
        tableName: API name of the Salesforce object, e.g. ``'function__c'``.

    Returns:
        pandas.DataFrame: one row per record returned by ``query_all``, with
        the per-record ``attributes`` column dropped when present. If the
        query reports no rows, an empty frame whose columns are the object's
        field names is returned instead.
    """
    connection = _conSF()

    # THE DESCRIBE CALL YIELDS THE OBJECT METADATA, INCLUDING ITS FIELD NAMES
    description = getattr(connection, tableName).describe()
    columnNames = [entry['name'] for entry in description['fields']]

    # SOQL HAS NO "SELECT *", SO EVERY FIELD IS LISTED EXPLICITLY
    soql = f"SELECT {','.join(columnNames)} FROM {tableName}"

    # QUERY_ALL FOLLOWS PAGINATION AND RETURNS THE COMPLETE RESULT SET
    result = connection.query_all(soql)

    # NOTHING FOUND: HAND BACK AN EMPTY FRAME THAT STILL CARRIES THE COLUMNS
    if not result['totalSize'] > 0:
        return pd.DataFrame(columns=columnNames)

    frame = pd.DataFrame(result['records'])

    # EACH RECORD CARRIES AN 'attributes' METADATA ENTRY THAT IS NOT DATA
    if 'attributes' not in frame.columns:
        return frame
    return frame.drop(columns=['attributes'])

In [15]:
# Distinct (function name, industry-cluster id) pairs, ignoring rows flagged as deleted.
dfFUNC = (
    queryAllSf('function__c')
    .loc[lambda rows: rows['IsDeleted'] == False, ['Name', 'Industry_Cluster__c']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Lookup from industry-cluster id to business-line name; 'Id' is renamed so it
# shares the join key used on the function side.
dfIC = (
    queryAllSf('Industry_Cluster__c')
    .loc[:, ['Id', 'Business_Line_Name_Formula__c']]
    .drop_duplicates()
    .rename(columns={'Id': 'Industry_Cluster__c'})
)

# Left join keeps functions whose cluster has no match; then reduce to the
# distinct (business line, function name) pairs.
dfFUNC = (
    dfFUNC
    .merge(dfIC, how='left', on='Industry_Cluster__c')
    .loc[:, ['Business_Line_Name_Formula__c', 'Name']]
    .drop_duplicates()
    .reset_index(drop=True)
)

dfFUNC.to_excel('Functions.xlsx', index=False)

In [None]:

# FIXME(review): this looks like an unfinished scratch cell (it has no execution count).
# The second line selects a single column whose name is the empty string; no such column
# appears in the function__c output shown in this notebook, so the selection is expected
# to raise a KeyError. Fill in the intended column list or delete the cell.
# It also rebinds dfFUNC, replacing the frame built by the Functions.xlsx export cell.
dfFUNC = queryAllSf('function__c')
dfFUNC = dfFUNC[['']]

In [4]:
# Fetch all fields of all function__c records and display the resulting frame.
# NOTE(review): resultDf is rebound to a different object's data in a later cell,
# so the value depends on which cell ran last.
resultDf = queryAllSf('function__c')
resultDf

Unnamed: 0,Id,OwnerId,IsDeleted,Name,CurrencyIsoCode,CreatedDate,CreatedById,LastModifiedDate,LastModifiedById,SystemModstamp,LastViewedDate,LastReferencedDate,Industry_Cluster__c,Industry_Cluster_Name_Formula__c
0,a505h000000qSg1AAE,0055h000003COv4AAG,False,ADDITIVES - Adhesion Promoter,NZD,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,,,a0C5h000003T5QFEA0,Agrochemicals (AG)
1,a505h000000qSg2AAE,0055h000003COv4AAG,False,Acidity Regulator,NZD,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,,,a0C900000044Q4sEAE,Beverage & Dairy (BD)
2,a505h000000qSg3AAE,0055h000003COv4AAG,False,Acidity Regulator,NZD,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,,,a0C900000044Q50EAE,Confectionary & Bakery (CB)
3,a505h000000qSg4AAE,0055h000003COv4AAG,False,Acidity Regulator,NZD,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,,,a0C900000044Q4yEAE,Food Supplements & Nutrition (FSN)
4,a505h000000qSg5AAE,0055h000003COv4AAG,False,Acidity Regulator,NZD,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,0055h000003COv4AAG,2024-04-30T10:02:39.000+0000,,,a0C900000044Q4zEAE,Processed Food & Food Service (PFFS)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,a50Mg0000054Su5IAE,0055h0000097AuqAAE,False,Others,CHF,2025-04-29T08:01:24.000+0000,0055h0000097AuqAAE,2025-04-29T08:01:24.000+0000,0055h0000097AuqAAE,2025-04-29T08:01:24.000+0000,,,a0C900000044Q4qEAE,Homecare & Institutional Cleaning
777,a50Mg0000089iA5IAI,0055h0000097AuqAAE,False,Recombinant Proteins and Cytokines,CHF,2025-07-07T07:04:32.000+0000,0055h0000097AuqAAE,2025-07-07T07:04:32.000+0000,0055h0000097AuqAAE,2025-07-07T07:04:32.000+0000,,,a0C0o00001AZJMLEA5,Biopharma
778,a50Mg000008CTfzIAG,0055h0000097AuqAAE,False,Others,CHF,2025-07-08T04:42:17.000+0000,0055h0000097AuqAAE,2025-07-08T04:42:17.000+0000,0055h0000097AuqAAE,2025-07-08T04:42:17.000+0000,,,a0C0o00001AZJMLEA5,Biopharma
779,a50Mg00000EIaD7IAL,0055h0000097AuqAAE,False,Surfactant,CHF,2025-10-22T01:45:24.000+0000,0055h0000097AuqAAE,2025-10-22T01:45:24.000+0000,0055h0000097AuqAAE,2025-10-22T01:45:24.000+0000,,,a0C900000044Q4qEAE,Homecare & Institutional Cleaning


In [6]:
# Fetch all Industry_Cluster__c records, then display only the record id and the
# business-line name column.
# NOTE(review): this rebinds resultDf, which an earlier cell used for function__c data.
resultDf = queryAllSf('Industry_Cluster__c')
resultDf[['Id','Business_Line_Name_Formula__c']]

Unnamed: 0,Id,Business_Line_Name_Formula__c
0,a0C0o00000tsVBxEAM,Pharmaceutical Industry (PHI)
1,a0C0o00001AZJMLEA5,Pharmaceutical Industry (PHI)
2,a0C0o00001AZJMVEA5,Pharmaceutical Industry (PHI)
3,a0C5h000003T5QFEA0,Specialty Chemicals Industry (SCI)
4,a0C5h00000CrOQbEAN,Food & Beverage Ingredients (FBI)
5,a0C5h00000Crd0REAR,Pharmaceutical Industry (PHI)
6,a0C900000044Q4qEAE,Personal Care Industry (PCI)
7,a0C900000044Q4sEAE,Food & Beverage Ingredients (FBI)
8,a0C900000044Q4tEAE,Pharmaceutical Industry (PHI)
9,a0C900000044Q4vEAE,Personal Care Industry (PCI)


In [None]:
# pip install --upgrade openai
import os
from openai import OpenAI

# Set OPENAI_API_KEY in your environment first.
# The key is read from the environment instead of being written into the notebook,
# so it cannot leak through a saved or shared copy. A missing variable fails here
# with a KeyError naming it, rather than later as an authentication error.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [3]:
def deep_research(query: str, *,
                  model: str = "o3-deep-research-2025-06-26",
                  summary: str = "auto"):
    """
    Run an open-web Deep Research task and return the report text + citations.
    Models: o4-mini-deep-research-2025-06-26 (faster/cheaper) or
            o3-deep-research-2025-06-26 (highest quality).

    Args:
        query: the research question, sent as the user message.
        model: model name passed to ``client.responses.create``.
        summary: value for the ``summary`` key of the ``reasoning`` option.

    Returns:
        tuple: ``(report_text, citations)`` where ``citations`` is a list of
        ``{"title", "url", "span"}`` dicts, ``span`` being the
        ``(start_index, end_index)`` of the cited text. Only URL citations
        are included.

    Raises:
        RuntimeError: the response contains no message item with text.
    """
    system_message = (
        "You are a professional researcher. Produce a structured, "
        "citation-rich report with concise sections. Prefer recent, "
        "authoritative sources. Include inline citations."
    )

    resp = client.responses.create(
        model=model,
        input=[
            {"role": "developer", "content": [{"type": "input_text", "text": system_message}]},
            {"role": "user", "content": [{"type": "input_text", "text": query}]},
        ],
        reasoning={"summary": summary},
        tools=[
            {"type": "web_search_preview"},          # allow open-web search
            # Optional: also let it run code for light analysis/plots
            {"type": "code_interpreter", "container": {"type": "auto", "file_ids": []}},
        ],
        # Tip: for very long jobs you can add background=True and poll.
    )

    # Final report text: taken from the last *message* item rather than blindly
    # from output[-1], which may be a reasoning or tool-call item when the run
    # did not finish with a report.
    final_part = _final_text_part(resp)
    if final_part is None:
        raise RuntimeError(
            "Deep Research response contains no message text "
            f"(status={getattr(resp, 'status', None)!r})"
        )
    report_text = final_part.text

    # Extract (title, url, span) for each inline URL citation. With the code
    # interpreter enabled, annotations may also be file citations, which carry
    # no title/url; those are skipped instead of raising AttributeError.
    annotations = getattr(final_part, "annotations", None) or []
    citations = [{"title": a.title, "url": a.url,
                  "span": (a.start_index, a.end_index)}
                 for a in annotations
                 if getattr(a, "type", "url_citation") == "url_citation"]

    return report_text, citations


def _final_text_part(resp):
    """Return the first text-bearing content part of the last message item, or None."""
    for item in reversed(getattr(resp, "output", None) or []):
        # Items without a ``type`` attribute are treated as messages.
        if getattr(item, "type", "message") != "message":
            continue
        for part in getattr(item, "content", None) or []:
            if getattr(part, "text", None) is not None:
                return part
    return None

# Demo run: executes when the cell is run in a notebook or the file is run as a script.
if __name__ == "__main__":
    demo_query = (
        "Compare the top 3 open-source vector databases as of 2025. "
        "Focus on performance, ecosystem, and TCO. Provide links."
    )
    text, cites = deep_research(demo_query)

    print(text)
    print("\nCitations:")
    # One bullet per citation: title, then its URL.
    for c in cites:
        print("- {} — {}".format(c['title'], c['url']))


APIConnectionError: Connection error.