In [32]:
from typing import Union, Literal, List, Any
import logging
import vertexai

logging.basicConfig(level=logging.DEBUG)

vertexai.init(project="sanai-441017", location="us-central1")

In [16]:
import instructor
from instructor.client_vertexai import from_vertexai
from vertexai.generative_models import GenerativeModel
from pydantic import BaseModel, Field

In [None]:
def get_vertext_ai_client(model: str, use_async: bool = False, **kwargs) -> Union[instructor.Instructor, instructor.AsyncInstructor]:
    # return genai.GenerativeModel(LM_VERSION.GEMINI_FLASH.value, **kwargs) 
    gemini_mode = model.lower().__contains__("gemini")
    system_instruction: List[Union[str, Any]] = kwargs["system_instruction"] if "system_instruction" in kwargs else None
    default_gen_config = {
            "max_output_tokens": 8192,
            "temperature": 1,
            "top_p": 0.95,
            # "response_mime_type": "application/json",
            # "response_schema": {"type":"OBJECT","properties":{"response":{"type":"STRING"}}},
        }
    generation_config: dict = kwargs["generation_config"] if "generation_config" in kwargs else default_gen_config
    
    if not isinstance(system_instruction, list):
        raise TypeError(f"System instruction should be list of System prompts and not of type {type(system_instruction)}")
    if gemini_mode:
        mode = kwargs["mode"] if "mode" in kwargs else instructor.Mode.VERTEXAI_TOOLS
    else:
        raise NotImplementedError("Other then Gemini, No other LLM can be setup at this point of time")
    
    
    llm = from_vertexai(
        client=GenerativeModel(
            model_name=model,
            system_instruction=system_instruction
        ),
        # mode=instructor.Mode.VERTEXAI_TOOLS,  # instructor.Mode.VERTEXAI_JSON, VERTEXAI_TOOLS
        mode=mode,
        generation_config=generation_config,
        _async=use_async,
    )
    if not gemini_mode:
        raise NotImplementedError("Only gemini is set up so far")
    return llm
    


In [40]:
client = get_vertext_ai_client(
    # model="gemini-1.5-pro-preview-0409", 
    model="gemini-1.5-flash-002",
    system_instruction=["You are a helpful assistant."]
)

class Response(BaseModel):
    name: str = Field(description="Name of the person")
    template: Literal["Hi {name}!", "Bye {name}!"] = Field(description="Greeting template based on situation")
    
    def speak(self) -> str:
        return self.template.format(name=self.name)


# logging.basicConfig(level=logging.CRITICAL)

query = "My sister named Gisel is coming to my house, what should I say?"
prompt = """
**Query:** {q}"""
response = client.chat.completions.create(
    messages=[
        # system role is not supported in Gemini
        # {
        #     "role": "system",
        #     "content": "You are a helpful assistant.",
        # },
        {"role": "user", "content": prompt.format(q=query)}
    ],
    response_model=Response
)


DEBUG:instructor:Patching `client.chat.completions.create` with mode=<Mode.VERTEXAI_TOOLS: 'vertexai_tools'>
DEBUG:instructor:Instructor Request: mode.value='vertexai_tools', response_model=<class '__main__.Response'>, new_kwargs={'generation_config': {'max_output_tokens': 8192, 'temperature': 1, 'top_p': 0.95}, 'contents': [role: "user"
parts {
  text: "\n**Query:** My sister named Gisel is coming to my house, what should I say?"
}
], 'tools': [function_declarations {
  name: "Response"
  parameters {
    type_: OBJECT
    properties {
      key: "template"
      value {
        type_: STRING
        description: "Greeting template based on situation"
        enum: "Hi {name}!"
        enum: "Bye {name}!"
        title: "Template"
      }
    }
    properties {
      key: "name"
      value {
        type_: STRING
        description: "Name of the person"
        title: "Name"
      }
    }
    required: "name"
    required: "template"
    property_ordering: "name"
    property_orderi

In [42]:
response.speak()

'Hi Gisel!'

In [41]:
response

Response(name='Gisel', template='Hi {name}!')

# RAG

In [108]:
class Extraction(BaseModel):
    topic: str
    summary: str
    hypothetical_questions: list[str] = Field(
        default_factory=list,
        description="Hypothetical questions that this document could answer",
    )
    keywords: list[str] = Field(
        default_factory=list, description="Keywords that this document is about"
    )
    
    
class Response(BaseModel):
    extractions: List[Extraction] = Field(..., description="List of extractions. This list cannot be empty.")

In [58]:
import requests

headers = {
    'sec-ch-ua-platform': '"macOS"',
    'Referer': 'https://in.tradingview.com/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'DNT': '1',
    'sec-ch-ua-mobile': '?0',
}

params = {
    'client': 'overview',
    'lang': 'en',
    'symbol': 'NSE:TATAMOTORS',
}

# response = requests.get(, params=params, headers=headers)

sess = requests.Session()
res = sess.get('https://news-headlines.tradingview.com/v2/view/headlines/symbol',
               params=params, headers=headers)
res.status_code

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): news-headlines.tradingview.com:443
DEBUG:urllib3.connectionpool:https://news-headlines.tradingview.com:443 "GET /v2/view/headlines/symbol?client=overview&lang=en&symbol=NSE%3ATATAMOTORS HTTP/11" 200 None


200

In [59]:
res.content.decode()

'{"items":[{"id":"tag:reuters.com,2024:newsml_L1N3M70P1:0","title":"Companies invest in EV battery factories in Europe","provider":"reuters","sourceLogoId":"reuters","published":1732284790,"source":"Reuters","urgency":2,"permission":"preview","relatedSymbols":[{"symbol":"TSE:7201","logoid":"nissan"},{"symbol":"MIL:STLAM","logoid":"stellantis"},{"symbol":"XETR:MBG","logoid":"daimler"},{"symbol":"EURONEXT:TTE","logoid":"total"},{"symbol":"EURONEXT:EN","logoid":"bouygues"},{"symbol":"EURONEXT:BOL","logoid":"bollore"},{"symbol":"NASDAQ:TSLA","logoid":"tesla"},{"symbol":"SZSE:300750","logoid":"contemporary-amper"},{"symbol":"NASDAQ:MVST","logoid":"microvast"},{"symbol":"MIL:SERI","logoid":"kr-energy-spa"},{"symbol":"KRX:051910","logoid":"lg-display"},{"symbol":"NSE:TATAMOTORS","logoid":"tata"},{"symbol":"EURONEXT:RNO","logoid":"renault"},{"symbol":"XETR:VOW","logoid":"volkswagen"},{"symbol":"SZSE:002074","logoid":"gotion-high-tech-co-ltd"},{"symbol":"XETR:PAH3","logoid":"porsche-automobil-h

In [112]:
from pprint import pprint
from collections.abc import Iterable

text_chunk = res.content.decode()

# with open("./assets/article.txt","r+") as file:
#     article = file.read()
# text_chunk = article
# text_chunk = """
# ## Simple RAG

# **What is it?**

# The simplest implementation of RAG embeds a user query and do a single embedding search in a vector database, like a vector store of Wikipedia articles. However, this approach often falls short when dealing with complex queries and diverse data sources.

# **What are the limitations?**

# - **Query-Document Mismatch:** It assumes that the query and document embeddings will align in the vector space, which is often not the case.
#     - Query: "Tell me about climate change effects on marine life."
#     - Issue: The model might retrieve documents related to general climate change or marine life, missing the specific intersection of both topics.
# - **Monolithic Search Backend:** It relies on a single search method and backend, reducing flexibility and the ability to handle multiple data sources.
#     - Query: "Latest research in quantum computing."
#     - Issue: The model might only search in a general science database, missing out on specialized quantum computing resources.
# - **Text Search Limitations:** The model is restricted to simple text queries without the nuances of advanced search features.
#     - Query: "what problems did we fix last week"
#     - Issue: cannot be answered by a simple text search since documents that contain problem, last week are going to be present at every week.
# - **Limited Planning Ability:** It fails to consider additional contextual information that could refine the search results.
#     - Query: "Tips for first-time Europe travelers."
#     - Issue: The model might provide general travel advice, ignoring the specific context of first-time travelers or European destinations.
# """

client = get_vertext_ai_client(
    # model="gemini-1.5-pro-preview-0409", 
    model="gemini-1.5-flash-002",
    system_instruction=["You are a skillful document analyser. Your role is to extract chunks from the following and create a set of topics."]
)

response = client.chat.completions.create(
    # stream=True,
    response_model=Response,
    messages=[
        # {
        #     "role": "system",
        #     "content": "Your role is to extract chunks from the following and create a set of topics.",
        # },
        {"role": "user", "content": text_chunk},
    ],
)

# data = []
# for extraction in extractions:
#     data.append(extraction.model_dump())
# extracts = list(extractions)

DEBUG:instructor:Patching `client.chat.completions.create` with mode=<Mode.VERTEXAI_TOOLS: 'vertexai_tools'>
DEBUG:instructor:Instructor Request: mode.value='vertexai_tools', response_model=<class '__main__.Response'>, new_kwargs={'generation_config': {'max_output_tokens': 8192, 'temperature': 1, 'top_p': 0.95}, 'contents': [role: "user"
parts {
  text: "{\"items\":[{\"id\":\"tag:reuters.com,2024:newsml_L1N3M70P1:0\",\"title\":\"Companies invest in EV battery factories in Europe\",\"provider\":\"reuters\",\"sourceLogoId\":\"reuters\",\"published\":1732284790,\"source\":\"Reuters\",\"urgency\":2,\"permission\":\"preview\",\"relatedSymbols\":[{\"symbol\":\"TSE:7201\",\"logoid\":\"nissan\"},{\"symbol\":\"MIL:STLAM\",\"logoid\":\"stellantis\"},{\"symbol\":\"XETR:MBG\",\"logoid\":\"daimler\"},{\"symbol\":\"EURONEXT:TTE\",\"logoid\":\"total\"},{\"symbol\":\"EURONEXT:EN\",\"logoid\":\"bouygues\"},{\"symbol\":\"EURONEXT:BOL\",\"logoid\":\"bollore\"},{\"symbol\":\"NASDAQ:TSLA\",\"logoid\":\"tes

In [113]:
response.extractions

[Extraction(topic='Electric Vehicle (EV) Battery Factories Investments in Europe', summary='Several companies are investing in the construction of electric vehicle battery factories in Europe.', hypothetical_questions=[], keywords=['Electric Vehicle', 'EV', 'Battery', 'Factories', 'Europe', 'Investments']),
 Extraction(topic="Jaguar's Rebranding Strategy", summary="Jaguar's rebranding is focused on a pink and diverse image, notably without featuring any cars, as reported by the Wall Street Journal.", hypothetical_questions=[], keywords=['Jaguar', 'Rebranding', 'Pink', 'Diversity', 'Electric Vehicle']),
 Extraction(topic='Indian Automotive Festive Season Sales', summary="India's festive season witnessed a 12% growth in retail auto sales, according to a dealers' body.", hypothetical_questions=[], keywords=['India', 'Automotive', 'Sales', 'Festive Season', 'Growth']),
 Extraction(topic='Jaguar Land Rover Vehicle Recall', summary='Jaguar Land Rover North America is recalling approximately 

In [111]:
len(response.extractions)

1

## RAG - Decomposing Questions

In [71]:
class Question(BaseModel):
    id: int = Field(..., description="A unique identifier for the question")
    query: str = Field(..., description="The question decomposited as much as possible")
    subquestions: list[int] = Field(
        default_factory=list,
        description="The subquestions that this question is composed of. There should be atleast 2 subquestions for sure",
    )


class QueryPlan(BaseModel):
    root_question: str = Field(..., description="The root question that the user asked")
    plan: list[Question] = Field(
        ..., description="The plan to answer the root question and its subquestions"
    )

client = get_vertext_ai_client(
    # model="gemini-1.5-pro-preview-0409", 
    model="gemini-1.5-flash-002",
    system_instruction=["You are a financial query understanding system capable of decomposing a question into subquestions."]
)

query = "What is current floating shares' percentage of TATA MOTORS?"

qplan = client.chat.completions.create(
    response_model=QueryPlan,
    messages=[
        {
            "role": "user",
            "content": query,
        },
    ],
)


DEBUG:instructor:Patching `client.chat.completions.create` with mode=<Mode.VERTEXAI_TOOLS: 'vertexai_tools'>
DEBUG:instructor:Instructor Request: mode.value='vertexai_tools', response_model=<class '__main__.QueryPlan'>, new_kwargs={'generation_config': {'max_output_tokens': 8192, 'temperature': 1, 'top_p': 0.95}, 'contents': [role: "user"
parts {
  text: "What is current floating shares\' percentage of TATA MOTORS?"
}
], 'tools': [function_declarations {
  name: "QueryPlan"
  parameters {
    type_: OBJECT
    properties {
      key: "root_question"
      value {
        type_: STRING
        description: "The root question that the user asked"
        title: "Root Question"
      }
    }
    properties {
      key: "plan"
      value {
        type_: ARRAY
        items {
          type_: OBJECT
          properties {
            key: "subquestions"
            value {
              type_: ARRAY
              items {
                type_: INTEGER
              }
              descrip

DEBUG:instructor:No compatible response.usage found, token usage not updated.
DEBUG:instructor:Instructor Raw Response: candidates {
  content {
    role: "model"
    parts {
      function_call {
        name: "QueryPlan"
        args {
          fields {
            key: "root_question"
            value {
              string_value: "What is current floating shares\' percentage of TATA MOTORS?"
            }
          }
          fields {
            key: "plan"
            value {
              list_value {
                values {
                  struct_value {
                    fields {
                      key: "subquestions"
                      value {
                        list_value {
                        }
                      }
                    }
                    fields {
                      key: "query"
                      value {
                        string_value: "What is the total number of shares of TATA MOTORS?"
                      }
      

In [72]:
qplan

QueryPlan(root_question="What is current floating shares' percentage of TATA MOTORS?", plan=[Question(id=1, query='What is the total number of shares of TATA MOTORS?', subquestions=[]), Question(id=2, query='What is the number of shares held by promoters and other entities in TATA MOTORS?', subquestions=[]), Question(id=3, query='Calculate the percentage of floating shares by dividing the number of floating shares by the total number of shares and multiplying by 100', subquestions=[1, 2])])

In [74]:
qplan.plan

[Question(id=1, query='What is the total number of shares of TATA MOTORS?', subquestions=[]),
 Question(id=2, query='What is the number of shares held by promoters and other entities in TATA MOTORS?', subquestions=[]),
 Question(id=3, query='Calculate the percentage of floating shares by dividing the number of floating shares by the total number of shares and multiplying by 100', subquestions=[1, 2])]

In [75]:
qplan2 = client.chat.completions.create(
    response_model=QueryPlan,
    messages=[
        {
            "role": "user",
            "content": qplan.plan[1].query,
        },
    ],
)
qplan2

DEBUG:instructor:Instructor Request: mode.value='vertexai_tools', response_model=<class '__main__.QueryPlan'>, new_kwargs={'generation_config': {'max_output_tokens': 8192, 'temperature': 1, 'top_p': 0.95}, 'contents': [role: "user"
parts {
  text: "What is the number of shares held by promoters and other entities in TATA MOTORS?"
}
], 'tools': [function_declarations {
  name: "QueryPlan"
  parameters {
    type_: OBJECT
    properties {
      key: "root_question"
      value {
        type_: STRING
        description: "The root question that the user asked"
        title: "Root Question"
      }
    }
    properties {
      key: "plan"
      value {
        type_: ARRAY
        items {
          type_: OBJECT
          properties {
            key: "subquestions"
            value {
              type_: ARRAY
              items {
                type_: INTEGER
              }
              description: "The subquestions that this question is composed of. There should be atleast 2 sub

QueryPlan(root_question='What is the number of shares held by promoters and other entities in TATA MOTORS?', plan=[Question(id=1, query='What is the number of shares held by promoters in TATA MOTORS?', subquestions=[]), Question(id=2, query='What is the number of shares held by other entities in TATA MOTORS?', subquestions=[])])

# Chain of Density Summarization

link: https://python.useinstructor.com/tutorials/6-chain-of-density/#setup-and-dependencies

In [94]:
import nltk
import spacy
nltk.download('punkt')
nltk.download('punkt_tab')

# !python -m spacy download en_core_web_sm --quiet
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/santokalayil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/santokalayil/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [95]:
from pydantic import Field, field_validator
from textwrap import dedent

class InitialSummary(BaseModel):
    """
    This is an initial summary which should be long ( 4-5 sentences, ~80 words)
    yet highly non-specific, containing little information beyond the entities marked as missing.
    Use overly verbose languages and fillers (Eg. This article discusses) to reach ~80 words.
    """

    summary: str = Field(
        ...,
        description="This is a summary of the article provided which is overly verbose and uses fillers. \
        It should be roughly 80 words in length",
    )


class RewrittenSummary(BaseModel):
    """
    This is a new, denser summary of identical length which covers every entity
    and detail from the previous summary plus the Missing Entities.

    Guidelines
    - Make every word count : Rewrite the previous summary to improve flow and make space for additional entities
    - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
    - The new summary should be highly dense and concise yet self-contained, eg., easily understood without the Article.
    - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses"
    - Missing entities can appear anywhere in the new summary

    An Entity is a real-world object that's assigned a name - for example, a person, country a product or a book title.
    """

    summary: str = Field(
        ...,
        description="This is a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities. It should have the same length ( ~ 80 words ) as the previous summary and should be easily understood without the Article",
    )
    absent: list[str] = Field(
        ...,
        default_factory=list,
        description="this is a list of Entities found absent from the new summary that were present in the previous summary",
    )
    missing: list[str] = Field(
        default_factory=list,
        description="This is a list of 1-3 informative Entities from the Article that are missing from the new summary which should be included in the next generated summary.",
    )
        
    
    @field_validator("summary")
    def min_length(cls, v: str):
        tokens = nltk.word_tokenize(v) 
        num_tokens = len(tokens)
        if num_tokens < 60:
            raise ValueError(
                "The current summary is too short. Please make sure that you generate a new summary that is around 80 words long."
            )
        return v
    
    @field_validator("missing")
    def has_missing_entities(cls, missing_entities: list[str]):
        if len(missing_entities) == 0:
            raise ValueError(
                "You must identify 1-3 informative Entities from the Article which are missing from the previously generated summary to be used in a new summary"
            )
        return missing_entities
    
    @field_validator("absent")
    def has_no_absent_entities(cls, absent_entities: list[str]):
        absent_entity_string = ",".join(absent_entities)
        if len(absent_entities) > 0:
            print(f"Detected absent entities of {absent_entity_string}")
            raise ValueError(
                f"Do not omit the following Entities {absent_entity_string} from the new summary"
            )
        return absent_entities
    
    @field_validator("summary")
    def min_entity_density(cls, v: str):
        tokens = nltk.word_tokenize(v)
        num_tokens = len(tokens)
    
        # Extract Entities
        doc = nlp(v) 
        num_entities = len(doc.ents)
    
        density = num_entities / num_tokens
        if density < 0.08: 
            raise ValueError(
                f"The summary of {v} has too few entities. Please regenerate a new summary with more new entities added to it. Remember that new entities can be added at any point of the summary."
            )
    
        return v

In [103]:
client = get_vertext_ai_client(
    # model="gemini-1.5-pro-preview-0409", 
    model="gemini-1.5-flash-002",
    system_instruction=[
        # "You are a financial query understanding system capable of decomposing a question into subquestions.",
        "Write a summary about the article that is long (4-5 sentences) yet highly non-specific. Use overly, verbose language and fillers(eg.,'this article discusses') to reach ~80 words",
    ],
)

rewrite_summary_client = get_vertext_ai_client(
    model="gemini-1.5-pro-preview-0409", 
    # model="gemini-1.5-flash-002",
    system_instruction=[
        dedent("""
        You are going to generate an increasingly concise,entity-dense summary of the following article.

        Perform the following two tasks
        - Identify 1-3 informative entities from the following article which is missing from the previous summary
        - Write a new denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities

        Guidelines
        - Make every word count: re-write the previous summary to improve flow and make space for additional entities
        - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
        - The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article.
        - Missing entities can appear anywhere in the new summary
        - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
        """),
    ],
)

def summarize_article(article: str, summary_steps: int = 3):
    summary_chain = []
    # We first generate an initial summary
    summary: InitialSummary = client.chat.completions.create(  
        # model="gpt-4-1106-preview",
        response_model=InitialSummary,
        messages=[
            # {
            #     "role": "system",
            #     "content": "Write a summary about the article that is long (4-5 sentences) yet highly non-specific. Use overly, verbose language and fillers(eg.,'this article discusses') to reach ~80 words",
            # },
            {"role": "user", "content": f"Here is the Financial Statement Report: {article}"},
            {
                "role": "user",
                "content": "The generated summary should be about 60 words.",
            },
        ],
        max_retries=2,
    )
    prev_summary = None
    summary_chain.append(summary.summary)
    for _i in range(summary_steps):
        missing_entity_message = (
            []
            if prev_summary is None
            else [
                {
                    "role": "user",
                    "content": f"Please include these Missing Entities: {','.join(prev_summary.missing)}",
                },
            ]
        )
        new_summary: RewrittenSummary = rewrite_summary_client.chat.completions.create( 
            # model="gpt-4-1106-preview",
            messages=[
                # {
                #     "role": "system",
                #     "content": """
                # You are going to generate an increasingly concise,entity-dense summary of the following article.

                # Perform the following two tasks
                # - Identify 1-3 informative entities from the following article which is missing from the previous summary
                # - Write a new denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities

                # Guidelines
                # - Make every word count: re-write the previous summary to improve flow and make space for additional entities
                # - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
                # - The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article.
                # - Missing entities can appear anywhere in the new summary
                # - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
                # """,
                # },
                {"role": "user", "content": f"Here is the Article: {article}"},
                {
                    "role": "user",
                    "content": f"Here is the previous summary: {summary_chain[-1]}",
                },
                *missing_entity_message,
            ],
            max_retries=3, 
            # max_tokens=1000,
            response_model=RewrittenSummary,
        )
        summary_chain.append(new_summary.summary)
        prev_summary = new_summary

    return summary_chain

DEBUG:instructor:Patching `client.chat.completions.create` with mode=<Mode.VERTEXAI_TOOLS: 'vertexai_tools'>
DEBUG:instructor:Patching `client.chat.completions.create` with mode=<Mode.VERTEXAI_TOOLS: 'vertexai_tools'>


In [105]:
with open("./assets/article.txt","r+") as file:
    article = file.read()
    
# nlp = spacy.load("en_core_web_sm")

def calculate_entity_density(sentence:str):
    tokens = nltk.word_tokenize(sentence)
    entities = nlp(sentence).ents
    entity_density = round(len(entities)/len(tokens),3)

    return len(tokens),len(entities),entity_density
    
summaries = summarize_article(article)

DEBUG:instructor:Instructor Request: mode.value='vertexai_tools', response_model=<class '__main__.InitialSummary'>, new_kwargs={'generation_config': {'max_output_tokens': 8192, 'temperature': 1, 'top_p': 0.95}, 'contents': [role: "user"
parts {
  text: "Here is the Financial Statement Report: TATA MOTORS LIMITED\nRegd.Office : Bombay House, 24, Homi Mody Street, Mumbai 400 001\nCIN L28920MH1945PLC004520\n(₹ in crores)\nSTATEMENT OF STANDALONE AUDITED FINANCIAL RESULTS FOR THE QUARTER AND SIX MONTHS ENDED SEPTEMBER 30, 2024\nQuarter ended Six months ended Year ended\nParticulars September 30, June 30, September 30, September 30, September 30, March 31,\n2024 2024 2023 2024 2023 2024\nRevenue from operations Audited\n(a) Revenue 15,371 16,732 18,403 32,103 34,136 72,746\n(b) Other operating revenue 147 130 139 277 238 557\nI. Total revenue from operations (a)+(b) 15,518 16,862 18,542 32,380 34,374 73,303\nII. Other income 294 1,989 580 2,283 879 1,150\nIII. Total Income (I+II) 15,812 18,

DEBUG:instructor:No compatible response.usage found, token usage not updated.
DEBUG:instructor:Instructor Raw Response: candidates {
  content {
    role: "model"
    parts {
      function_call {
        name: "InitialSummary"
        args {
          fields {
            key: "summary"
            value {
              string_value: "This article presents a detailed financial statement report for Tata Motors Limited for the quarter and six months ended September 30, 2024.  The report includes comprehensive data on revenue, expenses, profits, cash flows, and key financial ratios.  Significant events, such as a scheme of arrangement for share capital reduction and a proposed demerger of business units, are also discussed.  The report provides insights into the financial health and performance of Tata Motors.  Further analysis of this data is needed to draw specific conclusions."
            }
          }
        }
      }
    }
  }
  finish_reason: STOP
  avg_logprobs: -0.2055130529841

In [None]:
for index, summary in enumerate(summaries):
    tokens, entity, density = calculate_entity_density(summary)
    print(f"Article {index+1} -> Results (Tokens: {tokens}, Entity Count: {entity}, Density: {density})")

Article 1 -> Results (Tokens: 95, Entity Count: 3, Density: 0.032)
Article 2 -> Results (Tokens: 107, Entity Count: 9, Density: 0.084)
Article 3 -> Results (Tokens: 101, Entity Count: 10, Density: 0.099)
Article 4 -> Results (Tokens: 95, Entity Count: 13, Density: 0.137)


In [114]:
summaries[-1]

'Tata Motors Limited\\\'s September 2024 quarterly report reveals six-month revenue from operations at ₹32,380 crore and profit before tax of ₹4,188 crore. Notably, Basic EPS stood at ₹7.88.  The report highlighted a scheme replacing "A" Ordinary Shares with New Ordinary Shares to reduce share capital, with Diluted EPS at ₹7.88. Approvals are pending for this scheme and a proposed demerger of the Commercial Vehicle (CV) business with a merger of Tata Motors Passenger Vehicles Limited into the listed company. Further analysis is recommended.'

# Unstructured

- Installs
    - unstructured[all-docs]==0.13.3
    - unstructured-inference==0.7.27

https://github.com/katanaml/sparrow/blob/main/sparrow-data/parse/sparrow_parse/processors/table_structure_processor.py

# Vector Embeddings

In [117]:
from __future__ import annotations

from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel


def embed_text(texts: List[str]) -> list[list[float]]:
    """Embeds texts with a pre-trained, foundational model.

    Returns:
        A list of lists containing the embedding vectors for each input text
    """

    # # A list of texts to be embedded.
    # texts = ["banana muffins? ", "banana bread? banana muffins?"]
    
    # The dimensionality of the output embeddings.
    dimensionality = 256
    # The task type for embedding. Check the available tasks in the model's documentation.
    task = "RETRIEVAL_DOCUMENT"  # task_type

    model = TextEmbeddingModel.from_pretrained("text-embedding-005")
    inputs = [TextEmbeddingInput(text, task_type=task) for text in texts]
    kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)

    print(embeddings)
    # Example response:
    # [[0.006135190837085247, -0.01462465338408947, 0.004978656303137541, ...], [0.1234434666, ...]],
    return [embedding.values for embedding in embeddings]

embeddings = embed_text(["santo"])

[TextEmbedding(values=[-0.010988702066242695, -0.033970996737480164, -0.029479719698429108, -0.010278915986418724, -0.025875648483633995, 0.07550902664661407, -0.014946348033845425, 0.014330334030091763, -0.0034625923726707697, -0.0160514023154974, -0.005946489050984383, -0.07523702830076218, -0.0010706844041123986, 0.044492222368717194, -0.042652856558561325, -0.009538733400404453, 0.03406780585646629, -0.06588839739561081, 0.00417078472673893, -0.017865842208266258, 0.017006762325763702, -0.10525583475828171, -0.024377765133976936, -0.021084560081362724, 0.040403544902801514, -0.017070451751351357, 0.028878284618258476, -0.006213067099452019, -0.03844127058982849, 0.006999605800956488, 0.032362259924411774, -0.04699324443936348, 0.04065646231174469, -0.025399019941687584, -0.030671216547489166, -0.059795886278152466, 0.06532688438892365, -0.003278624964877963, 0.05675726383924484, 0.0030032244976609945, 0.01750003919005394, 0.007389340084046125, -0.034029148519039154, -0.064244747161

In [119]:
len(embeddings)

1

In [120]:
len(embeddings[0])

256

task_type
The name of the downstream task the embeddings will be used for. Valid values:

    RETRIEVAL_QUERY
        Specifies the given text is a query in a search/retrieval setting.
    RETRIEVAL_DOCUMENT
        Specifies the given text is a document from the corpus being searched.
    SEMANTIC_SIMILARITY
        Specifies the given text will be used for STS.
    CLASSIFICATION
        Specifies that the given text will be classified.
    CLUSTERING
        Specifies that the embeddings will be used for clustering.
    QUESTION_ANSWERING
        Specifies that the embeddings will be used for question answering.
    FACT_VERIFICATION
        Specifies that the embeddings will be used for fact verification.
    CODE_RETRIEVAL_QUERY
        Specifies that the embeddings will be used for code retrieval.