In [1]:
import chromadb
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser.extractors import KeywordExtractor, MetadataExtractor
from chromadb.utils import embedding_functions

def print_text_with_line_breaks(text, line_length=100):
    """Print ``text`` as consecutive fixed-width slices, one slice per ``print`` call.

    Slicing is purely by character position: words may be cut in the middle, and
    any newline characters already present in ``text`` are printed unchanged.

    Args:
        text: The string to print.
        line_length: Maximum number of characters emitted per ``print`` call.

    Returns:
        None. The slices are written to standard output.
    """
    for start in range(0, len(text), line_length):
        stop = start + line_length
        # Slicing past the end is safe, so the final piece is simply shorter.
        print(text[start:stop])


#### Loading Data

In [51]:
# Only files with these extensions are read from the Wikipedia dump directory.
required_text = [".txt"]
reader = SimpleDirectoryReader(
    input_dir="../src/documents/wikipedia",
    required_exts=required_text,
    recursive=True,
)
documents = reader.load_data()
# Report how many documents the reader returned.
print(f"Loaded {len(documents)}")

Loaded 1395


In [52]:
# Chunking configuration: split on "." with a chunk size of 256 and an overlap of 64.
text_splitter = TokenTextSplitter(
    separator=".",
    chunk_size=256,
    chunk_overlap=64,
)
# Keyword-metadata extraction is currently disabled; the variant is kept here for reference.
# metadata_extractor = MetadataExtractor(extractors=[KeywordExtractor(keywords=10)])
# parser = SimpleNodeParser(text_splitter=text_splitter, metadata_extractor=metadata_extractor)
parser = SimpleNodeParser(text_splitter=text_splitter)
# `nodes` is consumed by the inspection and ingestion cells further down.
nodes = parser.get_nodes_from_documents(documents=documents)

In [2]:
# Sanity check: show the text of the first parsed node, then the total node count.
# NOTE: `nodes` is created by the parsing cell above; running this cell in a fresh
# kernel without re-running that cell raises NameError (as in the recorded output).
print(nodes[0].get_content())
print(len(nodes))

NameError: name 'nodes' is not defined

#### Loading and Initializing ChromaDB

In [2]:
# Chroma client persisted under ../src/representations/.
client = chromadb.PersistentClient(path="../src/representations/")
# The heartbeat value is the cell's displayed result (a large integer in the recorded output).
client.heartbeat()

1691075089119408187

In [3]:
# Embedding function handed to the Chroma collection.
# Alternative model, currently disabled:
# sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Collection holding the Wikipedia chunks, embedded with `sentence_transformer_ef`.
# Alternative collection name, currently disabled:
# collection = client.get_or_create_collection(name="picasso_collection", embedding_function=sentence_transformer_ef)
collection = client.get_or_create_collection(
    name="picasso_collection_wikipedia",
    embedding_function=sentence_transformer_ef,
)


In [56]:
# Add the parsed chunks to the Chroma collection in batches instead of issuing one
# `collection.add` call per node. The same documents and ids are stored, but with
# far fewer calls (the recorded run holds roughly 28k chunks).
# The per-node version also rebound the builtin `id` in the notebook namespace;
# the comprehensions below leave no such names behind.
ADD_BATCH_SIZE = 512  # number of nodes sent per `collection.add` call
for batch_start in range(0, len(nodes), ADD_BATCH_SIZE):
    batch = nodes[batch_start:batch_start + ADD_BATCH_SIZE]
    collection.add(
        documents=[node.get_content() for node in batch],
        ids=[node.id_ for node in batch],
    )

In [7]:
# Re-acquire the same collection (same name and embedding function as the earlier cell)
# and display how many entries it holds.
# Alternative collection name, currently disabled:
# collection = client.get_or_create_collection(name="picasso_collection", embedding_function=sentence_transformer_ef)
collection = client.get_or_create_collection(
    name="picasso_collection_wikipedia",
    embedding_function=sentence_transformer_ef,
)
collection.count()

28292

#### Creating VectorStore from ChromaDB

In [5]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

In [6]:
# Embedding model used on the LlamaIndex side. It names the same
# "all-mpnet-base-v2" model that `sentence_transformer_ef` uses for the Chroma
# collection — presumably so query vectors are comparable with the stored ones.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

  return torch._C._cuda_getDeviceCount() > 0


In [7]:
# Wrap the existing Chroma collection so it can serve as the index's vector store.
vector_store = ChromaVectorStore(chroma_collection=collection)
# Storage context built around that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Service context using the HuggingFace embedding model; no other settings are overridden here.
service_context = ServiceContext.from_defaults(embed_model=embed_model)

In [8]:
# Build the index directly from the existing vector store; no documents are passed in this call.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    service_context=service_context,
)

In [19]:
# Ask a question through the index's default query engine and print the response.
query_engine = index.as_query_engine()
res = query_engine.query(
    "Whats the meaning of the broken sword in the painting Guernica?"
)
print(res)


The broken sword near the bottom of the painting symbolizes the defeat of the people at the hand of their tormentors.


In [21]:
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever

# Retrieve the 5 most similar stored chunks for a free-text query (no answer synthesis).
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
node_reps = retriever.retrieve("Picasso movement surrealism")

# Print each retrieved chunk in 100-character slices (the helper's default width),
# followed by a separator line.
for node_rep in node_reps:
    print_text_with_line_breaks(node_rep.node.get_content())
    print("======")

" See "The Photographic Conditions of Surrealism" (October, winter 1981).

Picasso's collages
Concer
ning Cubist art, she took Picasso's collage breakthrough to be explicable in terms of Saussure's ide
as about the differential relations and non-referentiality of language, rejecting efforts by other s
cholars to tie the pasted newspaper clippings to social history. Similarly, she held Picasso's styli
stic developments in Cubist portraiture to be products of theoretical problems internal to art, rath
er than outcomes of the artist's love life. Later, she explained Picasso's participation in the rapp
el à l'ordre or return to order of the 1920s in similar structuralist terms. See "In the Name of Pic
asso" (October, spring 1981), "The Motivation of the Sign" (in Lynn Zelevansky, ed., Picasso and Bra
que: A Symposium, 1992), and The Picasso Papers (Farrar, Straus and Giroux, 1998)
Kontinent Picasso: ausgewählte Aufsätze aus 2 Jahrzehnten, Munich 1988 ISBN 3-7913-0891-2.Die Welt d
er Collag