In [1]:
!pip install sentence_transformers
!pip install langchain
!pip install faiss-gpu langchain_openai

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0
Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.134-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata 

In [2]:
import json
import os
import re
from typing import List, Optional, Type, TypeVar

import faiss
import numpy as np
from langchain.schema import HumanMessage, SystemMessage
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer

# Generic type variable bound to BaseModel. It appears in the signature of
# extract_multi_needle (schema: Type[T] -> List[T]) so the returned list is
# typed with the same schema class the caller passes in.
T = TypeVar('T', bound=BaseModel)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Substrings (matched case-insensitively) that mark a sentence as a candidate
# even when its embedding similarity stays below the threshold.
_DEFAULT_KEYWORDS = (
    "founded", "employees", "valued", "based in",
    "headquartered", "established", "private", "public",
)


def _split_sentences(haystack: str) -> List[str]:
    """Split text after '.', '!' or '?' followed by whitespace, dropping blank pieces."""
    return [piece for piece in re.split(r'(?<=[.!?])\s+', haystack or "") if piece.strip()]


def _unit_rows(matrix: np.ndarray) -> np.ndarray:
    """L2-normalise every row; all-zero rows stay zero instead of turning into NaN."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)


def _select_candidates(
    sentences: List[str],
    example_needles: List[str],
    keywords: List[str],
    similarity_threshold: float,
) -> List[str]:
    """Return sentences that resemble an example needle or contain a keyword.

    The result keeps document order and contains each distinct sentence once,
    so repeated runs send the same requests in the same order.
    """
    similar_indices = set()
    if example_needles:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        sentence_embeddings = np.asarray(
            embedding_model.encode(sentences, batch_size=256, show_progress_bar=True)
        )
        example_embeddings = np.asarray(
            embedding_model.encode(example_needles, show_progress_bar=True)
        )
        # Rows are example needles, columns are haystack sentences.
        cosine_similarities = _unit_rows(example_embeddings) @ _unit_rows(sentence_embeddings).T
        # A sentence qualifies when it is close enough to at least one example.
        close_enough = (cosine_similarities >= similarity_threshold).any(axis=0)
        similar_indices.update(np.flatnonzero(close_enough).tolist())

    lowered_keywords = [keyword.lower() for keyword in keywords]
    picked = []
    for index, sentence in enumerate(sentences):
        lowered_sentence = sentence.lower()
        if index in similar_indices or any(keyword in lowered_sentence for keyword in lowered_keywords):
            picked.append(sentence)

    # dict.fromkeys removes duplicate sentences while preserving first-seen order.
    return list(dict.fromkeys(picked))


def _build_llm():
    """Create the AzureChatOpenAI client from environment variables.

    AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY are mandatory: credentials must
    never live in the notebook source. AZURE_OPENAI_VERSION and
    AZURE_OPENAI_DEPLOYMENT keep their non-secret defaults.

    Raises:
        RuntimeError: if a mandatory environment variable is missing or empty.
    """
    required = ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_KEY")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )
    return AzureChatOpenAI(
        openai_api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-03-15-preview"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        openai_api_key=os.environ["AZURE_OPENAI_KEY"],
    )


def _describe_schema(schema: type) -> str:
    """Render one '- name (type): description' line per field of a Pydantic model."""
    # `model_fields` is the Pydantic v2 attribute; `__fields__` is its deprecated alias.
    fields = schema.model_fields if hasattr(schema, "model_fields") else schema.__fields__
    lines = []
    for field_name, field in fields.items():
        annotation = field.annotation
        if isinstance(annotation, type) and not getattr(annotation, "__args__", ()):
            field_type = annotation.__name__
        else:
            # Generic aliases such as Optional[int] would otherwise be reported as
            # just "Optional"; the string form keeps the inner type visible.
            field_type = str(annotation).replace("typing.", "")
        field_desc = getattr(field, "description", None) or ''
        lines.append(f"- {field_name} ({field_type}): {field_desc}\n")
    return "".join(lines)


def _strip_code_fence(raw: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from an LLM reply, if present."""
    fenced = re.fullmatch(r"\s*```(?:json)?\s*(.*?)\s*```\s*", raw, flags=re.DOTALL | re.IGNORECASE)
    return fenced.group(1) if fenced else raw.strip()


def extract_multi_needle(
    schema: Type[T],
    haystack: str,
    example_needles: List[str],
    similarity_threshold: float = 0.6,
    keywords: Optional[List[str]] = None,
) -> List[T]:
    """Extract every record matching ``schema`` from a large body of text.

    The haystack is split into sentences. A sentence becomes a candidate when
    its cosine similarity to any example needle (embeddings from
    'all-MiniLM-L6-v2') reaches ``similarity_threshold`` or when it contains
    one of ``keywords``. Each candidate is sent to the Azure OpenAI chat
    deployment, and every non-empty JSON reply that validates against
    ``schema`` is returned as an instance.

    Args:
        schema: Pydantic model class describing the records to extract.
        haystack: Text to search.
        example_needles: Example sentences that look like the wanted records.
            May be empty, in which case only keyword matching is used.
        similarity_threshold: Minimum cosine similarity for a sentence to be
            selected by embedding match.
        keywords: Case-insensitive substrings that select a sentence outright.
            ``None`` uses the built-in company-related keyword list.

    Returns:
        Validated ``schema`` instances, in the order their source sentences
        appear in the haystack. Replies that are not valid JSON or fail
        validation are reported on stdout and skipped.

    Raises:
        RuntimeError: if AZURE_OPENAI_ENDPOINT or AZURE_OPENAI_KEY is not set.
    """
    extracted_needles: List[T] = []

    sentences = _split_sentences(haystack)
    if not sentences:
        return extracted_needles

    # Build the client before the (slow) embedding pass so a missing
    # credential fails immediately rather than after encoding the haystack.
    model = _build_llm()

    candidate_sentences = _select_candidates(
        sentences,
        example_needles,
        _DEFAULT_KEYWORDS if keywords is None else keywords,
        similarity_threshold,
    )
    print(f"Number of candidate sentences: {len(candidate_sentences)}")

    schema_description = _describe_schema(schema)

    # Construct the system prompt with schema description
    system_prompt = f"""
You are an assistant that extracts information from text according to a given schema.

The schema is:
{schema_description}

Your task is to read the provided text and extract any information that matches the schema.

Provide the extracted data as a JSON object conforming to the schema.

If the text does not contain relevant information, output an empty JSON object.

Only provide the JSON object, and no additional text.

Consider variations in sentence structure and wording. Extract information even if the text differs from the examples.
"""
    system_message = SystemMessage(content=system_prompt)

    for text in candidate_sentences:
        # `invoke` replaces the deprecated `model(messages)` call form.
        response = model.invoke([system_message, HumanMessage(content=text)])

        try:
            data = json.loads(_strip_code_fence(response.content))
            if data:  # An empty object means "nothing relevant in this sentence"
                extracted_needles.append(schema(**data))
        except json.JSONDecodeError:
            print(f"JSONDecodeError for text: {text}")
            print(f"LLM response: {response.content}")
        except Exception as e:
            # Best effort: one malformed reply must not abort the whole run.
            print(f"Exception for text: {text}")
            print(f"Error: {e}")

    return extracted_needles


In [4]:
from typing import Optional
from pydantic import BaseModel, Field

class TechCompany(BaseModel):
    """Schema for one technology-company record; every field is optional and defaults to None."""

    name: Optional[str] = Field(
        None,
        description="The full name of the technology company",
    )
    location: Optional[str] = Field(
        None,
        description="City and country where the company is headquartered",
    )
    employee_count: Optional[int] = Field(
        None,
        description="Total number of employees",
    )
    founding_year: Optional[int] = Field(
        None,
        description="Year the company was established",
    )
    is_public: Optional[bool] = Field(
        None,
        description="Whether the company is publicly traded (True) or privately held (False)",
    )
    valuation: Optional[float] = Field(
        None,
        description="Company's valuation in billions of dollars",
    )
    primary_focus: Optional[str] = Field(
        None,
        description="Main area of technology or industry the company focuses on",
    )

In [5]:
# One sample sentence showing the kind of company description to look for.
example_needles = [
    (
        "Ryoshi, based in Neo Tokyo, Japan, is a private quantum computing firm "
        "founded in 2031, currently valued at $8.7 billion with 1,200 employees "
        "focused on quantum cryptography."
    ),
]

In [7]:
# Read the whole haystack into memory. The explicit encoding keeps decoding
# independent of the platform's default locale.
with open("haystack.txt", "r", encoding="utf-8") as file:
    haystack_text = file.read()

In [8]:
# Example usage: run the extraction over the loaded haystack.
extracted_data = extract_multi_needle(schema=TechCompany, haystack=haystack_text, example_needles=example_needles)

# Serialize the extracted data to a JSON file.
# `model_dump()` is the Pydantic v2 replacement for the deprecated `.dict()`.
with open('extracted_needles.json', 'w', encoding='utf-8') as f:
    json.dump([item.model_dump() for item in extracted_data], f, indent=2)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1070 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Number of candidate sentences: 201


  response = model(messages)
