In [1]:
from typing import List, Type, TypeVar, Optional
from pydantic import BaseModel, Field
from langchain_openai import AzureChatOpenAI
import tiktoken
import json
import time
import os
from dotenv import load_dotenv
load_dotenv()

# Replace 'YOUR_API_KEY' with your actual OpenAI API
T = TypeVar('T', bound=BaseModel)

def extract_multi_needle(schema: Type[T], haystack: str, example_needles: List[str]) -> List[T]:
    extracted_needles = []
    model_name = 'gpt-4o-mini'

    model = AzureChatOpenAI(
            openai_api_version=os .environ.get("AZURE_OPENAI_VERSION", "2024-07-18"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://gptmini4o.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2023-03-15-preview"),
        api_key=os.environ.get("AZURE_OPENAI_KEY", "c2105be0c2744742980b57320b87e813"),
    )

    # Initialize tokenizer
    encoding = tiktoken.encoding_for_model(model_name)

    # Token limits
    max_tokens_per_request = 128000
    max_tokens_for_response = 
    max_tokens_for_prompt = max_tokens_per_request - max_tokens_for_response

    # Tokenize the haystack
    haystack_tokens = encoding.encode(haystack)
    num_tokens = len(haystack_tokens)

    # Determine chunk size
    chunk_size = 100000

    # Split the haystack into chunks
    chunks = [
        encoding.decode(haystack_tokens[i:i + chunk_size])
        for i in range(0, num_tokens, chunk_size)
    ]

    # Prepare schema description
    schema_description = "Extract information according to the following schema:\n{\n"
    for field_name, field in schema.__fields__.items():
        field_descr = field.description or ''
        field_type = (
            field.annotation.__name__ if hasattr(field.annotation, '__name__') else str(field.annotation)
        )
        schema_description += f'  "{field_name}": "{field_descr} ({field_type})",\n'
    schema_description += "}\n"

    # Prepare examples
    examples_text = "Examples of the desired output format:\n"
    for example in example_needles:
        examples_text += f"- {example}\n"

    # System prompt
    system_prompt = "You are an AI language model that extracts structured data from text."

    # Process each chunk
    for idx, chunk in enumerate(chunks):
        user_prompt = (
            f"{schema_description}\n"
            f"{examples_text}\n"
            f"Text to analyze:\n\"\"\"\n{chunk}\n\"\"\"\n"
            "Extract any instances matching the schema from the text above. "
            "Provide the output as a JSON array of objects."
        )

        # Ensure the prompt fits within token limits
        prompt_tokens = encoding.encode(user_prompt)
        if len(prompt_tokens) > max_tokens_for_prompt:
            print(f"Prompt too long for chunk {idx}, reducing chunk size.")
            continue

        # Call OpenAI API
        try:
            response = model.invoke(
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            )
            print(response)

            # Parse the assistant's reply
            try:
                extracted_data = json.loads(response.content)
                if isinstance(extracted_data, list):
                    for item in extracted_data:
                        try:
                            extracted_item = schema(**item)
                            extracted_needles.append(extracted_item)
                        except Exception as e:
                            print(f"Error parsing item: {item}, error: {e}")
                else:
                    print(f"Expected a list, got: {type(extracted_data)}")
            except json.JSONDecodeError as e:
                print(f"JSON decode error for chunk {idx}: {e}")
                print("Assistant reply:")
                print(response.content)
        except Exception as e:
            print(f"Error with OpenAI API for chunk {idx}: {e}")
            if "rate limit" in str(e).lower():
                print("Rate limit exceeded. Sleeping for 60 seconds.")
                time.sleep(60)
                continue

    return extracted_needles


# Example usage
if __name__ == "__main__":
    class TechCompany(BaseModel):
        name: Optional[str] = Field(default=None, description="The full name of the technology company")
        location: Optional[str] = Field(default=None, description="City and country where the company is headquartered")
        employee_count: Optional[int] = Field(default=None, description="Total number of employees")
        founding_year: Optional[int] = Field(default=None, description="Year the company was established")
        is_public: Optional[bool] = Field(default=None, description="Whether the company is publicly traded (True) or privately held (False)")
        valuation: Optional[float] = Field(default=None, description="Company's valuation in billions of dollars")
        primary_focus: Optional[str] = Field(default=None, description="Main area of technology or industry the company focuses on")

    example_needles = [
        "Ryoshi, based in Neo Tokyo, Japan, is a private quantum computing firm founded in 2031, currently valued at $8.7 billion with 1,200 employees focused on quantum cryptography."
    ]

    # Sample haystack text
    with open("haystack.txt", "r") as file:
        haystack = file.read()

    extracted_data = extract_multi_needle(TechCompany, haystack, example_needles)

    for item in extracted_data:
        print(item)

content='```json\n[]\n```' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 10229, 'total_tokens': 10234, 'completion_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_878413d04d', 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}} id='run-edca0e71-69ec-4557-9da8-87e24f636040-0' usage_metadata={'input_tokens': 10229, 'output_tokens': 5, 'total_tokens': 10234, 'input_token_details': {}, 'output_token_details': {}}
JSON decode error for chunk 0: Expecting value: line 1 column 1 (char 0)
Assistant reply:
```json
[]
```
content='```json\n[]\n```' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 10229, 'total_tokens': 10234, 'completion_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_878413d04d', 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}} id='run-5264051b-6