In [1]:
import json
from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.staging.base import dict_to_elements

import chromadb

INFO: NumExpr defaulting to 8 threads.


In [2]:
DLAI_API_KEY = ""
DLAI_API_URL = ""

In [3]:
s =  UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,    
)

In [4]:
filename = "winter-sports.epub"
with open(filename,'rb') as f:
    files = shared.Files(
        content =  f.read(),
        file_name=filename
    )
req =  shared.PartitionParameters(files=files)

In [5]:
res = s.general.partition(req)

In [7]:
res.elements

[{'type': 'Title',
  'element_id': '0c6fff519d8892b1783ead469b230320',
  'text': 'The Project Gutenberg eBook of Winter Sports in\nSwitzerland, by E. F. Benson',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'NarrativeText',
  'element_id': 'a704b2f040706162ec4539fe6e0bfae9',
  'text': '\nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online at\n',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'parent_id': '0c6fff519d8892b1783ead469b230320',
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'NarrativeText',
  'element_id': '5b5eebc5d7d1c47189123b2b25b0204e',
  'text': 'www.gutenberg.org. If you are not located\nin th

In [9]:
JSON(json.dumps(res.elements[10:15], indent=2))



<IPython.core.display.JSON object>

In [12]:
[x for x in res.elements if x['type']=='Title' and 'hockey' in x['text'].lower()]

[{'type': 'Title',
  'element_id': 'dcea2070d3d9171c9c817967917ca77b',
  'text': 'ICE-HOCKEY',
  'metadata': {'page_number': 2,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'Title',
  'element_id': '8738b816c128ffcafa1fbfedd3be5f44',
  'text': 'ICE HOCKEY',
  'metadata': {'page_number': 2,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}}]

In [13]:
[x for x in res.elements if x['type']=='Title' ] #and 'hockey' in x['text'].lower()]

[{'type': 'Title',
  'element_id': '0c6fff519d8892b1783ead469b230320',
  'text': 'The Project Gutenberg eBook of Winter Sports in\nSwitzerland, by E. F. Benson',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'Title',
  'element_id': '4a7aedb1f1383e75bb0ee85e359cee60',
  'text': 'Title: Winter Sports in Switzerland',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'Title',
  'element_id': '395a1ca8c9a7512b4ef7910c383f02f0',
  'text': 'Author: E. F. Benson',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'filetype': 'application/epub'}},
 {'type': 'Title',
  'element_id': 'd13cc748dfa64302f1cacc7df9eb8ce6',
  'text': 'Illustrator: C. Fleming Williams',
  'metadata': {'page_number': 1,
   'languages': ['eng'],
   'filename': 'winter-sports.epub',
   'f

In [14]:
chapters = [
    "THE SUN-SEEKER",
    "RINKS AND SKATERS",
    "TEES AND CRAMPITS",
    "ICE-HOCKEY",
    "SKI-ING",
    "NOTES ON WINTER RESORTS",
    "FOR PARENTS AND GUARDIANS",
]

In [17]:
chapter_ids= {}
for element in res.elements:
    for chapter in chapters:
        if element['text'] == chapter and element['type'] == 'Title':
            chapter_ids[element['element_id']] = chapter
            break
        


In [18]:
chapter_ids

{'d9725192dcad4b316b9000b2e0b9d803': 'THE SUN-SEEKER',
 '85a84e38543ad3417cdef25e3616cbbc': 'RINKS AND SKATERS',
 '5229c64d88a2a9bd98581911deedc70c': 'TEES AND CRAMPITS',
 'dcea2070d3d9171c9c817967917ca77b': 'ICE-HOCKEY',
 '0354439dc766447e6406fdd882e8e918': 'SKI-ING',
 '272ffcc12c87aef80e8c6d59aeb699b0': 'NOTES ON WINTER RESORTS',
 'bc0864682405480021aabbd2e39fb6c9': 'FOR PARENTS AND GUARDIANS'}

In [19]:
chapter_to_id = {v:k for k,v in chapter_ids.items()}
chapter_to_id

{'THE SUN-SEEKER': 'd9725192dcad4b316b9000b2e0b9d803',
 'RINKS AND SKATERS': '85a84e38543ad3417cdef25e3616cbbc',
 'TEES AND CRAMPITS': '5229c64d88a2a9bd98581911deedc70c',
 'ICE-HOCKEY': 'dcea2070d3d9171c9c817967917ca77b',
 'SKI-ING': '0354439dc766447e6406fdd882e8e918',
 'NOTES ON WINTER RESORTS': '272ffcc12c87aef80e8c6d59aeb699b0',
 'FOR PARENTS AND GUARDIANS': 'bc0864682405480021aabbd2e39fb6c9'}

In [21]:
[x for x in res.elements if x["metadata"].get("parent_id") == chapter_to_id["ICE-HOCKEY"]][0]

{'type': 'NarrativeText',
 'element_id': 'a4c8c73b30452b62425c99d1fe76425e',
 'text': 'Many of the Swiss winter-resorts can put\ninto the field a very strong ice-hockey team, and fine teams from other\ncountries often make winter tours there; but the ice-hockey which the\nordinary winter visitor will be apt to join in will probably be of the\nmost elementary and unscientific kind indulged in, when the skating day\nis drawing to a close, by picked-up sides. As will be readily\nunderstood, the ice over which a hockey match has been played is\nperfectly useless for skaters any more that day until it has been swept,\nscraped, and sprinkled or flooded; and in consequence, at all Swiss\nresorts, with the exception of St. Moritz, where there is a rink that\nhas been made for the hockey-player, or when an important match is being\nplayed, this sport is supplementary to such others as I have spoken of.\nNobody, that is, plays hockey and nothing else, since he cannot play\nhockey at all till the

In [22]:
client = chromadb.PersistentClient(path="chroma_tmp",settings=chromadb.Settings(allow_reset=True))
client.reset()

INFO: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.


True

In [23]:
collection = client.create_collection(
    name='winter_sports',
    metadata={"hnsw:space": "cosine"}
)

In [25]:
for element in res.elements:
    parent_id = element["metadata"].get("parent_id")
    chapter = chapter_ids.get(parent_id, "")
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
        metadatas=[{"chapter": chapter}]
    )

/Users/praveenreddy/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%
[0;93m2024-05-09 05:35:34.715095 [W:onnxruntime:, helper.cc:67 IsInputSupported] CoreML does not support input dim > 16384. Input:embeddings.word_embeddings.weight, shape: {30522,384}[m
[0;93m2024-05-09 05:35:34.715694 [W:onnxruntime:, coreml_execution_provider.cc:81 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 49 number of nodes in the graph: 323 number of nodes supported by CoreML: 231[m




In [26]:
results = collection.peek()
print(results["documents"])

['The Project Gutenberg eBook of Winter Sports in\nSwitzerland, by E. F. Benson', '\nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online at\n', 'www.gutenberg.org. If you are not located\nin the United States, you’ll have to check the laws of the country where\nyou are located before using this eBook.', 'Title: Winter Sports in Switzerland', 'Author: E. F. Benson', 'Illustrator: C. Fleming Williams', 'Photographer: Mrs. Aubrey Le Blond', 'Release date: August 23, 2019 [EBook #60153]', 'Most recently updated: January 30, 2020', 'Language: English']


In [27]:
result = collection.query(
    query_texts = ['how many plahyers are on team'],
    n_results=2,
    where={'chapter':'ICE-HOCKEY'}
)
print(json.dumps(result))

{"ids": [["392b0250af6c3b3a472b9bba6c9fc813", "3c94727f6879fb85fc32d0c1b2eb5b32"]], "distances": [[0.6144973635673523, 0.8437265157699585]], "metadatas": [[{"chapter": "ICE-HOCKEY"}, {"chapter": "ICE-HOCKEY"}]], "embeddings": null, "documents": [["It is a wonderful and delightful sight to watch the speed and\naccuracy of a first-rate team, each member of which knows the play of\nthe other five players. The finer the team, as is always the case, the\ngreater is their interdependence on each other, and the less there is of\nindividual play. Brilliant running and dribbling, indeed, you will see;\nbut as distinguished from a side composed of individuals, however good,\nwho are yet not a team, these brilliant episodes are always part of a\nplan, and end not in some wild shot but in a pass or a succession of\npasses, designed to lead to a good opening for scoring. There is,\nindeed, no game at which team play outwits individual brilliance so\ncompletely.", "For the rest, everybody knows the 