In [1]:
# %pip installs into the running kernel's environment; version pinned to the one this notebook was run with.
%pip install -q sentence_transformers==3.2.0

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.0


In [2]:
# %pip installs into the running kernel's environment; version pinned to the one this notebook was run with.
%pip install -q langchain==0.3.3

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.134-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310

In [3]:
# %pip installs into the running kernel's environment; versions pinned to the ones this notebook was run with.
%pip install -q faiss-gpu==1.7.2 langchain_openai==0.2.2

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain_openai)
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jiter<1,>=0.4.0 (from openai<2.0.0,>=1.40.0->langchain_openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_openai-0.2.2-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import json
import re
from typing import List, Type, TypeVar
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
import faiss
from langchain_openai import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Define a generic type variable bound to BaseModel
T = TypeVar('T', bound=BaseModel)

def extract_multi_needle(schema: Type[T], haystack: str, example_needles: List[str]) -> List[T]:
    """
    Extracts and structures information from a large text corpus based on a given schema and examples.

    Pipeline:
        1. Split the haystack into sentences with a punctuation-based regex.
        2. Embed sentences and examples, and retrieve the nearest sentences to
           each example by cosine similarity (FAISS inner product on
           L2-normalised vectors).
        3. Send every retrieved sentence to the Azure OpenAI chat model and
           ask for a JSON object matching the schema.
        4. Validate each JSON object against the schema; replies that cannot
           be parsed or validated are skipped.

    Args:
        schema (Type[T]): A Pydantic model defining the structure of the needle to be extracted.
        haystack (str): The large text corpus to search through (haystack).
        example_needles (List[str]): A list of example sentences (needles).

    Returns:
        List[T]: A list of extracted needles conforming to the provided schema.
        Empty when the haystack contains no sentences or no examples are given.

    Raises:
        RuntimeError: If the AZURE_OPENAI_KEY environment variable is not set.
    """
    from typing import get_origin

    # Initialize the list to hold the extracted needles
    extracted_needles: List[T] = []

    # Split the haystack into sentences using regex; drop blank fragments so
    # they are never embedded or sent to the LLM.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', haystack) if s.strip()]
    if not sentences or not example_needles:
        return extracted_needles

    # Credentials must come from the environment; never ship a key in the notebook.
    # Checked before the (slow) embedding step so a misconfiguration fails fast.
    api_key = os.environ.get("AZURE_OPENAI_KEY")
    if not api_key:
        raise RuntimeError(
            "AZURE_OPENAI_KEY is not set; export it before calling extract_multi_needle."
        )

    # Initialize the SentenceTransformer model for embeddings
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient model for embeddings

    # Compute embeddings for the haystack sentences and for the example needles
    sentence_embeddings = embedding_model.encode(sentences, batch_size=256, show_progress_bar=True)
    example_embeddings = embedding_model.encode(example_needles, show_progress_bar=True)

    # Build a Faiss index; inner product on L2-normalised vectors equals cosine similarity
    dimension = sentence_embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(sentence_embeddings)
    index.add(sentence_embeddings)
    faiss.normalize_L2(example_embeddings)

    # Number of nearest neighbors to retrieve, clamped to the corpus size:
    # asking FAISS for more neighbours than indexed vectors pads the result with -1.
    k = min(100, len(sentences))
    _, neighbor_ids = index.search(example_embeddings, k)

    # De-duplicate while keeping retrieval order (deterministic, unlike a set),
    # and ignore any -1 padding so it cannot alias sentences[-1].
    candidate_sentences = list(dict.fromkeys(
        sentences[idx] for row in neighbor_ids for idx in row if idx >= 0
    ))

    # Initialize the Azure OpenAI LLM model
    model = AzureChatOpenAI(
        openai_api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-03-15-preview"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
        azure_endpoint=os.environ.get(
            "AZURE_OPENAI_ENDPOINT",
            "https://gptmini4o.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2023-03-15-preview"
        ),
        openai_api_key=api_key,
    )

    def generate_schema_description(schema: Type[BaseModel]) -> str:
        """
        Generates a string description of the schema.

        Args:
            schema (Type[BaseModel]): The Pydantic model.

        Returns:
            str: One "- name (type): description" line per field.
        """
        # Prefer the pydantic v2 attribute; fall back to the legacy name.
        fields = getattr(schema, 'model_fields', None) or schema.__fields__
        schema_description = ""
        for field_name, field in fields.items():
            field_desc = field.description or ''
            annotation = field.annotation
            if get_origin(annotation) is None and hasattr(annotation, '__name__'):
                field_type = annotation.__name__
            else:
                # Parameterised types (Optional[int], List[str], ...) keep their
                # arguments so the LLM sees the underlying type, not just "Optional".
                field_type = str(annotation).replace('typing.', '')
            schema_description += f"- {field_name} ({field_type}): {field_desc}\n"
        return schema_description

    def load_json_object(raw):
        """Parse the LLM reply as JSON, tolerating surrounding text or code fences; None on failure."""
        if not isinstance(raw, str):
            return None
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            embedded = re.search(r'\{.*\}', raw, re.DOTALL)
            if embedded is None:
                return None
            try:
                return json.loads(embedded.group(0))
            except json.JSONDecodeError:
                return None

    schema_description = generate_schema_description(schema)

    # Construct the system prompt with schema description
    system_prompt = f"""
You are an assistant that extracts information from text according to a given schema.

The schema is:
{schema_description}

Your task is to read the provided text and extract any information that matches the schema.

Provide the extracted data as a JSON object conforming to the schema.

If the text does not contain relevant information, output an empty JSON object.

Only provide the JSON object, and no additional text.
"""

    # Process each candidate sentence
    for text in candidate_sentences:
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=text)
        ]

        # `invoke` is the supported entry point; calling the model object directly is deprecated.
        response = model.invoke(messages)

        data = load_json_object(response.content)
        # Skip non-objects, empty objects and objects whose values are all null:
        # none of them carries an extracted needle.
        if not isinstance(data, dict) or all(value is None for value in data.values()):
            continue

        try:
            extracted_needles.append(schema(**data))
        except (ValueError, TypeError):
            # Payload does not conform to the schema (pydantic's ValidationError
            # is a ValueError); skip this sentence.
            continue

    return extracted_needles


  from tqdm.autonotebook import tqdm, trange


In [5]:
from typing import Optional
from pydantic import BaseModel, Field

# Target schema for one extracted "needle": a technology company profile.
# Every field is optional and defaults to None, so a partially filled
# record still validates.
class TechCompany(BaseModel):
    name: Optional[str] = Field(
        default=None,
        description="The full name of the technology company",
    )
    location: Optional[str] = Field(
        default=None,
        description="City and country where the company is headquartered",
    )
    employee_count: Optional[int] = Field(
        default=None,
        description="Total number of employees",
    )
    founding_year: Optional[int] = Field(
        default=None,
        description="Year the company was established",
    )
    is_public: Optional[bool] = Field(
        default=None,
        description="Whether the company is publicly traded (True) or privately held (False)",
    )
    valuation: Optional[float] = Field(
        default=None,
        description="Company's valuation in billions of dollars",
    )
    primary_focus: Optional[str] = Field(
        default=None,
        description="Main area of technology or industry the company focuses on",
    )

In [6]:
# One hand-written sample sentence of the kind to be found in the haystack;
# it is passed to extract_multi_needle as `example_needles`.
example_needles = ["Ryoshi, based in Neo Tokyo, Japan, is a private quantum computing firm founded in 2031, currently valued at $8.7 billion with 1,200 employees focused on quantum cryptography."]

In [9]:
# Read the whole corpus into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
with open("haystack.txt", "r", encoding="utf-8") as file:
    haystack_text = file.read()

In [10]:
# Example usage
extracted_data = extract_multi_needle(schema=TechCompany, haystack=haystack_text, example_needles=example_needles)

# Serialize the extracted data to a JSON file.
# `model_dump()` is the pydantic v2 spelling; `.dict()` is kept as a fallback
# for models that do not provide it.
with open('extracted_needles.json', 'w', encoding='utf-8') as f:
    json.dump(
        [item.model_dump() if hasattr(item, 'model_dump') else item.dict() for item in extracted_data],
        f,
        indent=2,
    )


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1070 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  response = model(messages)


In [11]:
# Stand-alone chat client for quick manual checks.
# The API key is read from the environment only: a KeyError here means
# AZURE_OPENAI_KEY has not been exported. Secrets must not be embedded in the notebook.
model = AzureChatOpenAI(
        openai_api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-03-15-preview"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
        azure_endpoint=os.environ.get(
            "AZURE_OPENAI_ENDPOINT",
            "https://gptmini4o.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2023-03-15-preview"
        ),
        openai_api_key=os.environ["AZURE_OPENAI_KEY"],
    )

In [12]:
model.invoke("hey").content

'Hey! How can I help you today?'

In [None]:
import json
import os
import re
from typing import List, Type, TypeVar, Optional, Dict, Any

import numpy as np
import openai
import spacy
from langchain_openai import AzureChatOpenAI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util

T = TypeVar('T', bound=BaseModel)

def extract_multi_needle(schema: Type[T], haystack: str, example_needles: List[str]) -> List[T]:
    """
    Extracts and structures information from a large text corpus based on a given schema and examples.

    Pipeline: keyword extraction from the examples (spaCy) -> sentence
    splitting of the haystack (spaCy) -> keyword pre-filter -> embedding-based
    ranking of the survivors -> one LLM call per top-ranked sentence ->
    schema validation of each reply.

    Args:
        schema (Type[T]): A Pydantic model defining the structure of the needle to be extracted.
        haystack (str): The large text corpus to search through (haystack).
        example_needles (List[str]): A list of example sentences (needles).

    Returns:
        List[T]: A list of extracted needles conforming to the provided schema.
        Empty when no haystack sentence contains any example keyword.
    """
    extracted_needles: List[T] = []

    # Initialize spaCy model; raise its length limit so the whole haystack
    # can be parsed in one call.
    nlp = spacy.load('en_core_web_sm')
    nlp.max_length = len(haystack) + 1000  # Adjust based on the input size

    # Step 1: Extract keywords from example_needles
    keywords = extract_keywords(example_needles, nlp)

    # Step 2: Preprocess haystack and split into sentences
    haystack_sentences = split_into_sentences(haystack, nlp)

    # Step 3: Find candidate sentences using keyword matching
    candidate_sentences = find_candidate_sentences(haystack_sentences, keywords)

    if not candidate_sentences:
        return extracted_needles  # No candidates found

    # Step 4: Use embeddings to compute similarity between examples and candidates
    num_candidates = 100  # Adjust based on desired processing time
    candidate_sentences = rank_candidates_by_similarity(example_needles, candidate_sentences, num_candidates)

    # pydantic v2 spells validation `model_validate`; `parse_obj` is the legacy name.
    validate = getattr(schema, 'model_validate', None) or schema.parse_obj

    # Step 5: Extract data from candidate sentences using LLM
    for sentence in candidate_sentences:
        prompt = construct_prompt(schema, sentence)
        response = call_llm_api(prompt)
        data = parse_llm_response(response)

        # An unparseable reply yields {} and a sentence with nothing to extract
        # yields all-null values. With an all-optional schema either one would
        # validate into an empty record, so both are dropped here.
        if not isinstance(data, dict) or all(value is None for value in data.values()):
            continue

        try:
            extracted_needles.append(validate(data))
        except (ValueError, TypeError):
            # Reply does not conform to the schema (pydantic's ValidationError
            # is a ValueError); skip this sentence.
            continue

    return extracted_needles

def extract_keywords(example_needles: List[str], nlp) -> set:
    """
    Collect the lower-cased lemmas of all nouns, proper nouns and adjectives
    found in the example sentences.

    Args:
        example_needles: Example sentences; they are joined with single spaces
            and analysed in one call.
        nlp: Callable that turns a string into an iterable of tokens exposing
            `pos_` and `lemma_`.

    Returns:
        Set of lower-cased lemma strings.
    """
    wanted_pos = {'NOUN', 'PROPN', 'ADJ'}
    parsed = nlp(' '.join(example_needles))

    lemmas = set()
    for tok in parsed:
        if tok.pos_ in wanted_pos:
            lemmas.add(tok.lemma_.lower())
    return lemmas

def split_into_sentences(text: str, nlp) -> List[str]:
    """
    Split `text` into sentences using the supplied pipeline.

    Args:
        text: Text to segment.
        nlp: Callable returning an object whose `sents` attribute iterates
            over spans that expose `.text`.

    Returns:
        The text of each sentence, stripped of surrounding whitespace, in
        document order.
    """
    parsed = nlp(text)
    cleaned = []
    for span in parsed.sents:
        cleaned.append(span.text.strip())
    return cleaned

def find_candidate_sentences(sentences: List[str], keywords: set) -> List[str]:
    """
    Keep the sentences that contain at least one keyword.

    Matching is a plain substring test against the lower-cased sentence, so a
    keyword also matches inside longer words.

    Args:
        sentences: Sentences to filter.
        keywords: Keywords to look for; compared as given against the
            lower-cased sentence.

    Returns:
        The matching sentences, unchanged and in their original order.
    """
    def mentions_keyword(sentence: str) -> bool:
        lowered = sentence.lower()
        for keyword in keywords:
            if keyword in lowered:
                return True
        return False

    return [sentence for sentence in sentences if mentions_keyword(sentence)]

def rank_candidates_by_similarity(example_needles: List[str], candidate_sentences: List[str], num_candidates: int) -> List[str]:
    """
    Order candidate sentences by their best cosine similarity to any example
    and keep the top `num_candidates`.

    Args:
        example_needles: Reference sentences to compare against.
        candidate_sentences: Sentences to rank.
        num_candidates: Maximum number of sentences to return.

    Returns:
        Up to `num_candidates` candidate sentences, most similar first.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    needle_vectors = encoder.encode(example_needles, convert_to_tensor=True)
    candidate_vectors = encoder.encode(candidate_sentences, convert_to_tensor=True)

    # Score every example against every candidate, then keep, for each
    # candidate, its highest score over all examples.
    similarity = util.cos_sim(needle_vectors, candidate_vectors)
    best_per_candidate = similarity.max(axis=0).values.cpu().numpy()

    # Negating the scores makes the ascending argsort yield a descending ranking.
    ranking = np.argsort(-best_per_candidate)
    return [candidate_sentences[position] for position in ranking[:num_candidates]]

def construct_prompt(schema: Type[T], sentence: str) -> str:
    """
    Build the extraction prompt for a single sentence.

    The prompt lists every schema field as "- name (type): description" and
    asks for a JSON object keyed by field name, with null for missing values.

    Args:
        schema: Pydantic model whose fields describe what to extract.
        sentence: The sentence to extract from; embedded verbatim in the prompt.

    Returns:
        The prompt text.
    """
    from typing import get_origin

    def type_label(annotation) -> str:
        # Plain classes are shown by name. Parameterised types (Optional[int],
        # List[str], ...) keep their arguments so the model sees the underlying
        # type instead of a bare "Optional".
        if get_origin(annotation) is None and hasattr(annotation, '__name__'):
            return annotation.__name__
        return str(annotation).replace('typing.', '')

    # Prefer the pydantic v2 attribute; fall back to the legacy name.
    fields = getattr(schema, 'model_fields', None) or schema.__fields__

    field_info = []
    for field_name, field in fields.items():
        description = field.description or ''
        field_info.append(f"- {field_name} ({type_label(field.annotation)}): {description}")
    field_info_text = '\n'.join(field_info)

    prompt = f"""Extract the following information from the sentence:

Sentence: "{sentence}"

Information to extract:
{field_info_text}

Return the information as a JSON object with keys matching the field names.
If a piece of information is not available, use null.

Example format:
{{
    "field1": value1,
    "field2": value2,
    ...
}}
"""
    return prompt

def call_llm_api(prompt: str) -> str:
    """
    Send `prompt` to the Azure OpenAI chat deployment and return the reply text.

    Connection settings are read from the environment variables
    AZURE_OPENAI_VERSION, AZURE_OPENAI_DEPLOYMENT, AZURE_OPENAI_ENDPOINT and
    AZURE_OPENAI_KEY. Only the key has no default.

    Args:
        prompt: User message to send; a fixed system message is prepended.

    Returns:
        The `content` of the model's response.

    Raises:
        RuntimeError: If AZURE_OPENAI_KEY is not set.
    """
    # Fail early with a clear message instead of sending a placeholder key to the service.
    api_key = os.environ.get("AZURE_OPENAI_KEY")
    if not api_key:
        raise RuntimeError("AZURE_OPENAI_KEY is not set; export it before calling call_llm_api.")

    model = AzureChatOpenAI(
        # Default API version matches the one in the default endpoint URL below.
        openai_api_version=os.environ.get("AZURE_OPENAI_VERSION", "2023-03-15-preview"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini"),
        azure_endpoint=os.environ.get(
            "AZURE_OPENAI_ENDPOINT",
            "https://gptmini4o.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2023-03-15-preview"
        ),
        openai_api_key=api_key,
    )
    response = model.invoke(
        input=[
            {'role': 'system', 'content': 'You are an assistant that extracts structured information from text.'},
            {'role': 'user', 'content': prompt}
        ],
        temperature=0,
    )
    return response.content

def parse_llm_response(response: str) -> Dict[str, Any]:
    """
    Parse an LLM reply into a JSON object.

    The whole reply is tried first. If that is not a JSON object, the span
    from the first '{' to the last '}' is tried, which covers replies wrapped
    in prose or Markdown code fences.

    Args:
        response: Raw reply text.

    Returns:
        The decoded object as a dict, or an empty dict when the reply is not a
        string or no JSON object can be decoded from it. Valid JSON that is not
        an object (list, number, null, ...) never leaks through.
    """
    if not isinstance(response, str):
        return {}

    attempts = [response]
    embedded = re.search(r'\{.*\}', response, re.DOTALL)
    if embedded:
        attempts.append(embedded.group(0))

    for text in attempts:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}

