In [2]:
!pip -q install langchain openai google-search-results tiktoken
!pip -q install kor markdownify

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m934.6/934.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m16.9 MB/s[0m

In [1]:
import os

os.environ["OPENAI_API_KEY"] = "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"

In [3]:
!pip show langchain

Name: langchain
Version: 0.0.181
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: kor


## Kor Basics

The basic workflow is the following:

1. Load the document
2. Clean up the document (optional)
3. Split the document into chunks
4. Define a schema for extraction
5. Extract from every chunk of text

In [3]:
from typing import List, Optional

from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

import pandas as pd
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic, create_extraction_chain


from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


## Simple examples

In [4]:
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

In [5]:
schema = Object(
    id="clause_extraction",
    description="Clauses from a given legal document",
    attributes=[
        Text(
            id="clause",
            description="The clause and the number of the clause",
            examples=[("In such an event, the Shareholder and the Affiliate shall constitute an Affiliate Block and shall comply with and be bound by the provisions of Clause 17.4, the provisions of which shall apply on a mutatis mutandis basis.", "Clause 17.4")],
        ),
        Text(
            id="clause_name",
            description="The name of the Clause",
            examples=[("“Affiliate Block” shall have the meaning ascribed to it in Clause 17.5.1.", "Affiliate Block")],
        )
    ],
    examples=[
        (
            "“Company Reserved Matters” shall have the meaning ascribed to it in Clause 12.3.",
            [
                {"clause": " Clause 12.3", "clause_name": "Company Reserved Matters"}
            ],
        ),
        (
            "Tag-Along Event has the meaning set forth in Article 8.4.1 of this Agreement.",
         [
            {'clause': "Article 8.4.1", 'clause_name': "Tag-Along Event"}
         ]
        ),
        
    ],
    many=True,
)


chain = create_extraction_chain(llm, schema)

In [6]:
print(chain.prompt.format_prompt(text="[user input]").to_string())

Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

clause_extraction: Array<{ // Clauses from a given legal document
 clause: string // The clause and the number of the clause
 clause_name: string // The name of the Clause
}>
```


Please output the extracted information in CSV format in Excel dialect. Please use a | as the delimiter. 
 Do NOT add any clarifying information. Output MUST follow the schema above. Do NOT add any additional columns that do not appear in the schema.



Input: “Company Reserved Matters” shall have the meaning ascribed to it in Clause 12.3.
Output: clause|clause_name
 Clause 12.3|Company Reserved Matters

Input: In such an event, the Shareholder and the Affiliate shall constitute an Affiliate Block and shall comply with and be bound

In [7]:
chain.predict_and_parse(text="“Designated Bank Account” shall have the meaning ascribed to it in Clause 4.1.10.")["data"]

{'clause_extraction': [{'clause': 'Clause 4.1.10',
   'clause_name': 'Designated Bank Account'}]}

## Nested Objects and JSON

In [None]:
from_address = Object(
    id="from_address",
    description="Person moved away from this address",
    attributes=[
        Text(id="street"),
        Text(id="city"),
        Text(id="state"),
        Text(id="zipcode"),
        Text(id="country", description="A country in the world; e.g., France."),
    ],
    examples=[
        (
            "100 Main St, Boston, MA, 23232, USA",
            {
                "street": "100 Marlo St",
                "city": "Boston",
                "state": "MA",
                "zipcode": "23232",
                "country": "USA",
            },
        )
    ],
)

to_address = from_address.replace(
    id="to_address", description="Address to which the person is moving"
)

schema = Object(
    id="information",
    attributes=[
        Text(
            id="person_name",
            description="The full name of the person or partial name",
            examples=[("John Smith was here", "John Smith")],
        ),
        from_address,
        to_address,
    ],
    many=True,
)

### JSON encoding
To use nested objects, at least for now we have to swap to the JSON encoder.

In [13]:
chain = create_extraction_chain(
    llm, schema, encoder_or_encoder_class="json", input_formatter=None
)

In [14]:
print(chain.prompt.format_prompt(text="[user input]").to_string())

Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

clause_extraction: Array<{ // Clauses from a given legal document
 clause: string // The clause and the number of the clause
 clause_name: string // The name of the Clause
}>
```


Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please ignore them. All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags.



Input: “Company Reserved Matters” shall have the meaning ascribed to it in Clause 12.3.
Output: <json>{"clause_extraction": [{"clause": " C

In [15]:
chain.predict_and_parse(
    text='''“Option Exercise Date” shall have the meaning ascribed to it in Clause 22.2.
“Mike Athleisure Business” shall have the meaning ascribed to it in Clause 23.1.3 (b).
'''
)["data"]

{'clause_extraction': [{'clause': 'Clause 22.2',
   'clause_name': 'Option Exercise Date'},
  {'clause': 'Clause 23.1.3 (b)', 'clause_name': 'Mike Athleisure Business'}]}

## With Pydantic and validation

In [None]:
!wget -q https://www.dropbox.com/s/gekyuep86zibhl1/conversation-025722052023.txt

#### Load the document

In [8]:
!pip install python-docx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184491 sha256=bf0011e537c96a6360349c18bd9d1bd9249e777936fbb1d70515ecebae173480
  Stored in directory: /root/.cache/pip/wheels/80/27/06/837436d4c3bd989b957a91679966f207bfd71d358d63a8194d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
!unzip /content/SampleDocs.zip
!mv /content/SampleDocs/Testing /content/Testing

In [11]:
!pip install docx2txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3957 sha256=43cfda920323ad103f52e804f385e75c3bae28cbe2411001240db704f68038a1
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


#### Split the text into docs 

In [13]:
doc = Document(page_content=conversation)

In [14]:
split_docs = RecursiveCharacterTextSplitter().split_documents([doc])

langchain.schema.Document

#### Extract Clause Info 


In [16]:
llm = ChatOpenAI(
     model_name="gpt-3.5-turbo",
    temperature=0,
)

In [18]:
class Clause(BaseModel):
    clause: str = Field(
        description="The clause and its number",
    )
    clause_name: str = Field(
        description="The name of the clause",
    )

    @validator("clause", "clause_name")
    def name_must_not_be_empty(cls, v):
        if not v:
            raise ValueError("Name must not be empty")
        return v




In [19]:
schema, extraction_validator = from_pydantic(
    Clause,
    description="Extract information about clauses, its number and its name.",
    examples=[
        (
            "“Company Reserved Matters” shall have the meaning ascribed to it in Clause 12.3.",
            [
                {"clause": " Clause 12.3", "clause_name": "Company Reserved Matters"}
            ],
        )
    ],
    many=True,
)

In [20]:
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="csv",
    validator=extraction_validator,
    input_formatter="triple_quotes",
)

In [None]:
print(chain.prompt.format_prompt(text="[user input]").to_string())

In [67]:
with get_openai_callback() as cb:
    document_extraction_results = await extract_from_documents(
        chain, split_docs, max_concurrency=5, use_uid=False, return_exceptions=True
    )
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 52129
Prompt Tokens: 41958
Completion Tokens: 10171
Successful Requests: 40
Total Cost (USD): $0.10425800000000002


In [None]:
document_extraction_results

#### Let's put it in a human readable format

In [None]:
import json

def extract_restaurant_info(json_data):
    for record in json_data:
        restaurant_list = record.get('data', {}).get('restaurant', [])
        for restaurant in restaurant_list:
            name = restaurant.get('name', '')
            location = restaurant.get('location', '')
            style = restaurant.get('style', '')
            top_dish = restaurant.get('top_dish', '')
            
            # If style is not specified, we'll just say "Cuisine not specified"
            style = style if style else 'Cuisine not specified'

            print(f'Restaurant Name: {name}\nLocation: {location}\nStyle: {style}\nTop Dish: {top_dish}\n')



In [None]:

extract_restaurant_info(document_extraction_results)

Restaurant Name: El Celler de Can Roca
Location: Girona, Spain
Style: Cuisine not specified
Top Dish: forest-inspired dish

Restaurant Name: Noma
Location: Copenhagen, Denmark
Style: Nordic cuisine
Top Dish: fermented berries and ants dessert

Restaurant Name: La Cava del Tequila
Location: Mexico City, Mexico
Style: Mexican
Top Dish: mole

Restaurant Name: Gaggan
Location: Bangkok, Thailand
Style: modern Indian cuisine
Top Dish: Lick It Up course

Restaurant Name: Osteria Francescana
Location: Modena, Italy
Style: modern Italian cuisine
Top Dish: Oops! I Dropped the Lemon Tart

Restaurant Name: Attica
Location: Melbourne, Australia
Style: Australian cuisine
Top Dish: Potato cooked in the earth it was grown



#### Lets put it in a structured DataFrame

In [None]:
import pandas as pd

def generate_dataframe(json_data):
    # Prepare an empty list to store all restaurant data
    data = []

    for record in json_data:
        restaurant_list = record.get('data', {}).get('restaurant', [])
        for restaurant in restaurant_list:
            # Get details for each restaurant and append it to data
            data.append([
                restaurant.get('name', ''),
                restaurant.get('location', ''),
                restaurant.get('style', '') if restaurant.get('style', '') else 'Cuisine not specified',
                restaurant.get('top_dish', '')
            ])

    # Convert the list into a DataFrame
    df = pd.DataFrame(data, columns=['Name', 'Location', 'Style', 'Top Dish'])

    return df

# Usage:
df = generate_dataframe(document_extraction_results)


In [None]:
df

Unnamed: 0,Name,Location,Style,Top Dish
0,El Celler de Can Roca,"Girona, Spain",Cuisine not specified,forest-inspired dish
1,Noma,"Copenhagen, Denmark",Nordic cuisine,fermented berries and ants dessert
2,La Cava del Tequila,"Mexico City, Mexico",Mexican,mole
3,Gaggan,"Bangkok, Thailand",modern Indian cuisine,Lick It Up course
4,Osteria Francescana,"Modena, Italy",modern Italian cuisine,Oops! I Dropped the Lemon Tart
5,Attica,"Melbourne, Australia",Australian cuisine,Potato cooked in the earth it was grown


In [None]:
schema, validator = from_pydantic(Restaurant)

In [None]:
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="csv",
    validator=validator,
    input_formatter="triple_quotes",
)

In [None]:
with get_openai_callback() as cb:
    document_extraction_results = await extract_from_documents(
        chain, split_docs, max_concurrency=5, use_uid=False, return_exceptions=True
    )
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 2666
Prompt Tokens: 2412
Completion Tokens: 254
Successful Requests: 3
Total Cost (USD): $0.005332


In [None]:
document_extraction_results

[{'uid': '0',
  'source_uid': '0',
  'data': {'restaurant': [{'name': '-',
     'location': '-',
     'style': '-',
     'top_dish': '-'},
    {'name': 'El Celler de Can Roca',
     'location': 'Girona, Spain',
     'style': 'Creative and delicious',
     'top_dish': 'Forest-inspired dish'},
    {'name': 'Noma',
     'location': 'Copenhagen, Denmark',
     'style': 'Simple and natural Nordic cuisine',
     'top_dish': 'Dessert made with fermented berries and ants'},
    {'name': 'La Cava del Tequila',
     'location': 'Mexico City',
     'style': 'Authentic and flavorful regional specialties',
     'top_dish': 'Mole'},
    {'name': 'Gaggan',
     'location': 'Bangkok, Thailand',
     'style': 'Creative and playful Indian cuisine',
     'top_dish': 'Lick It Up course'},
    {'name': 'Osteria Francescana',
     'location': 'Modena, Italy',
     'style': 'Modern take on traditional Italian cuisine',
     'top_dish': 'Oops! I Dropped the Lemon Tart'}]},
  'raw': 'name|location|style|top_di

In [None]:

extract_restaurant_info(document_extraction_results)

Restaurant Name: -
Location: -
Style: -
Top Dish: -

Restaurant Name: El Celler de Can Roca
Location: Girona, Spain
Style: Creative and delicious
Top Dish: Forest-inspired dish

Restaurant Name: Noma
Location: Copenhagen, Denmark
Style: Simple and natural Nordic cuisine
Top Dish: Dessert made with fermented berries and ants

Restaurant Name: La Cava del Tequila
Location: Mexico City
Style: Authentic and flavorful regional specialties
Top Dish: Mole

Restaurant Name: Gaggan
Location: Bangkok, Thailand
Style: Creative and playful Indian cuisine
Top Dish: Lick It Up course

Restaurant Name: Osteria Francescana
Location: Modena, Italy
Style: Modern take on traditional Italian cuisine
Top Dish: Oops! I Dropped the Lemon Tart

Restaurant Name: 
Location: ---
Style: ---
Top Dish: ---

Restaurant Name: 
Location: Melbourne, Australia
Style: Australian
Top Dish: Potato cooked in the earth it was grown

Restaurant Name: 
Location: N/A
Style: Mexican
Top Dish: N/A

Restaurant Name: 
Location: N/A

In [43]:
extraction_chain = create_extraction_chain(llm, schema)

In [None]:
split_docs[0]

ValidationError: ignored

In [27]:
def split_conversation(filename, max_tokens=1024):
    """
    Load a conversation from a file and split it into sections of approximately 2048 tokens.

    Parameters:
    filename (str): The name of the file to read the conversation from.
    max_tokens (int): The maximum number of tokens per section.

    Returns:
    list: A list of strings, where each string is a section of the conversation.
    """
    with open(filename, 'r') as f:
        conversation = f.read()

    # Split the conversation into turns
    turns = conversation.split("\n\n")

    sections = []
    section = ""

    for turn in turns:
        # If adding the next turn would exceed the maximum number of tokens,
        # add the current section to the list and start a new section
        if len(section.split()) + len(turn.split()) > max_tokens:
            sections.append(section.strip())
            section = ""

        # Add the turn to the current section
        section += f"{turn}\n\n"

    # Add the last section to the list
    sections.append(section.strip())

    return sections



## Docs splitting

In [71]:
def split_conversation_word(filename, max_tokens=1024):
    """
    Load a conversation from a Word file and split it into sections of approximately 2048 tokens.

    Parameters:
    filename (str): The name of the file to read the conversation from.
    max_tokens (int): The maximum number of tokens per section.

    Returns:
    list: A list of strings, where each string is a section of the conversation.
    """
    # Load the Word file into a Document object
    from docx import Document
    document = Document(filename)

    # Get the text of the conversation
    conversation = ""
    for paragraph in document.paragraphs:
        conversation += paragraph.text + "\n"

    # Split the conversation into turns
    turns = conversation.split("\n\n")

    sections = []
    section = ""

    for turn in turns:
        # If adding the next turn would exceed the maximum number of tokens,
        # add the current section to the list and start a new section
        if len(section.split()) + len(turn.split()) + 2 > max_tokens:
            sections.append(section.strip())
            section = ""

        # Add the turn to the current section
        section += f"{turn}\n\n"

    # Add the last section to the list
    sections.append(section.strip())

    return sections


In [72]:
file = split_conversation_word("/content/SampleDocs/Demo Joint Venture Agreement .docx")

In [None]:
file[2]

In [None]:
extracted = extraction_chain.predict_and_parse(text=file[6])["data"]

print(extracted)

In [None]:
restaurant_schema = Object(
    id="restaurant",
    description=(
        "People are talking about restaurants names and dishes as well as qualities of the restaturant"
    ),
    attributes=[
        Text(
            id="name",
            description="The name of the restaurant"
        )
    ],
    examples=[("We went for a quick bite at McDonalds",[{"name": "McDonalds"}]),
            ("I just love the steaks at Mortons",[{"name": "Mortons"}]),
            ("We already have a booking at The Eatery so can't goto Mortons",[{"name": "The Eatery"},{"name": "Mortons"}])
            ],
    many=True,
)

### with browsing


In [None]:
# Kor!
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# LangChain Models
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

# Standard Helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime

# Text Helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# For token counting
from langchain.callbacks import get_openai_callback

def printOutput(output):
    print(json.dumps(output,sort_keys=True, indent=3))

### Load the text file

## Prepare the model

In [None]:
llm = ChatOpenAI(
     model_name="gpt-3.5-turbo",
    temperature=0,
    # max_tokens=2048,
)

In [None]:
restaurant_schema = Object(
    id="restaurant",
    description=(
        "People are talking about restaurants and dishes as well as qualities of the restaturant"
    ),
    attributes=[
        Text(
            id="name",
            description="The name of the restaurant"
        )
    ],
    examples=[("We went for a quick bite at McDonalds",[{"name": "McDonalds"}]),
                      ("I just love the steaks at Mortons",[{"name": "Mortons"}]),
                      ("We already have a booking at The Eatery so can't goto Mortons",[{"name": "The Eatery"},{"name": "Mortons"}])
                      ],
    many=True,
)

In [None]:
# restaurant_schema = Object(

#     id="restaurant",
    
#     # Natural language description about your object
#     description="Personal information about a person",
    
#     # Fields you'd like to capture from a piece of text about your object.
#     attributes=[
#         Text(
#             id="first_name",
#             description="The first name of a person.",
#         )
#     ],
    
#     # Examples help go a long way with telling the LLM what you need
#     examples=[
#         ("Alice and Bob are friends", [{"first_name": "Alice"}, {"first_name": "Bob"}])
#     ]
# )

In [None]:
chain = create_extraction_chain(llm, restaurant_schema)

In [None]:
sections[0]

'Food lover 2: Instruction: Please describe your first most unforgettable meal, including the location, ambiance, taste, and any unique experiences.\nInput: My first most unforgettable meal was at a restaurant called El Celler de Can Roca in Girona, Spain. The ambiance was elegant and modern, and the food was a creative and delicious 18-course tasting menu. One unique experience was when they brought out a dish that was inspired by the smells of the forest.\n\nFood lover 1: My response: That sounds amazing! The forest-inspired dish must have been a unique experience. My first most unforgettable meal was at a restaurant called Noma in Copenhagen, Denmark. The location was in an old warehouse by the waterfront, and the ambiance was rustic and cozy. The food was presented in a simple and natural way, with many of the ingredients sourced from the surrounding Nordic region. One of the most memorable dishes was a dessert made with fermented berries and ants, which added a surprising and deli

In [None]:
text = sections[0]
output = chain.predict_and_parse(text=(text))["data"]

printOutput(output)

{
   "restaurant": [
      {
         "name": "La Cava del Tequila"
      }
   ]
}


In [None]:
output = chain.predict_and_parse(text=("The dog went to the park"))["data"]
printOutput(output)

{
   "person": []
}


## Multiple Fields

In [None]:
 ("I had the fresh pasta with cream", "fresh pasta with cream"), 
        #                 ("for me the steak frites was a good choice on my diet","steak frites"),
        #                 ("The grilled octopus was so yummy","grilled octopus"), 
        #                 ("I had to send the fish tacos back as they were raw","fish tacos"),
        #             ],
        #     many=True,
        # ),
    ],
    many=True,
)

In [None]:
with get_openai_callback() as cb:
    result = chain.predict_and_parse(text=text)
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 1858
Prompt Tokens: 1847
Completion Tokens: 11
Successful Requests: 1
Total Cost (USD): $0.0037159999999999997


In [88]:
import re

def label_clauses(statement):
  """
  Label all clauses in a statement.

  Args:
    statement (str): The statement to label.

  Returns:
    dict: A dictionary, where the keys are the clause names and the values are the clause numbers.
  """

  clauses = {}

  # Find all the clauses in the statement.
  for match in re.finditer(r"\b[A-Z]+:\b", statement):
    # Get the clause name.
    clause_name = match.group()

    # Get the clause number.
    clause_number = match.start() + 1

    # Create a dictionary entry for the clause.
    clause_dict = {
      "clause_name": clause_name,
      "clause_number": f"Clause {clause_number}",
    }

    # Add the dictionary entry to the dictionary.
    clauses[clause_name] = clause_dict

  # Return the dictionary.
  return clauses


if __name__ == "__main__":
  statement = """
⦁	“Third Party Claim” shall have the meaning ascribed to it in Clause 65 of this Agreement;
⦁	“Validity Date” shall have the meaning ascribed to it in Clause 10 of this Agreement;
⦁	“Works” shall have the meaning ascribed to it in Clause 51 of this Agreement.
"""

  clauses = label_clauses(statement)

  print(clauses)


{}


In [92]:
text = "“Conditions Precedent” shall mean the conditions listed in Clause 3.1 to be fulfilled to the satisfaction of the Investors, unless waived by the Investors in writing."
result = label_clauses(text)

In [93]:
clauses = {}

# Find all the clauses in the statement.
for match in re.finditer(r"\b[A-Z]+:\b", statement):
  # Get the clause name.
  clause_name = match.group()

  # Get the clause number.
  clause_number = match.start() + 1

  # Create a dictionary entry for the clause.
  clause_dict = {
    "clause_name": clause_name,
    "clause_number": f"Clause {clause_number}",
  }

  # Add the dictionary entry to the dictionary.
  clauses[clause_name] = clause_dict

print(clauses)

{}
