In [1]:
import chromadb
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser.extractors import KeywordExtractor, MetadataExtractor
from chromadb.utils import embedding_functions

def print_text_with_line_breaks(text, line_length=100):
    """Print ``text`` as consecutive slices of at most ``line_length`` items.

    The split is purely positional, so a break may land in the middle of a
    word. Nothing is printed when ``text`` is empty.
    """
    offsets = range(0, len(text), line_length)
    for chunk in (text[start:start + line_length] for start in offsets):
        print(chunk)


#### Loading Data

In [51]:
# Load the ".txt" files found under the Wikipedia documents folder
# (recursive=True is passed so nested folders are searched as well).
required_text = [".txt"]
reader = SimpleDirectoryReader(
    input_dir="../src/documents/wikipedia",
    required_exts=required_text,
    recursive=True,
)
documents = reader.load_data()
print(f"Loaded {len(documents)}")

Loaded 1395


In [52]:
# Chunking configuration: split on ".", chunk_size=256, chunk_overlap=64.
text_splitter = TokenTextSplitter(
    separator=".",
    chunk_size=256,
    chunk_overlap=64,
)

# Disabled variant that also passes a keyword MetadataExtractor to the parser:
# metadata_extractor = MetadataExtractor(extractors=[KeywordExtractor(keywords=10)])
# parser = SimpleNodeParser(text_splitter=text_splitter, metadata_extractor=metadata_extractor)
parser = SimpleNodeParser(text_splitter=text_splitter)

# Turn the loaded documents into nodes using the splitter above.
nodes = parser.get_nodes_from_documents(documents=documents)

In [2]:
# Sanity check: the first node's text on one line, then the total node count.
print(nodes[0].get_content(), len(nodes), sep="\n")

NameError: name 'nodes' is not defined

#### Loading and Initializing ChromaDB

In [3]:
# Create a chromadb PersistentClient rooted at ../src/representations/.
client = chromadb.PersistentClient(path="../src/representations/")
# Last expression of the cell, so the heartbeat value is shown as the cell output.
client.heartbeat()

1690842612635429890

In [5]:
# Embedding function passed to the Chroma collection when it is opened.
# Disabled alternative model name: "all-MiniLM-L6-v2".
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

In [6]:
# Open the Wikipedia collection via get_or_create_collection, binding the
# sentence-transformer embedding function to it.
# Disabled alternative collection name: "picasso_collection".
collection = client.get_or_create_collection(
    name="picasso_collection_wikipedia",
    embedding_function=sentence_transformer_ef,
)


In [56]:
# Store every parsed node in the Chroma collection, keyed by the node's own id.
#
# Nodes are added in batches rather than one `collection.add` call per node,
# which avoids the per-call overhead on a large node list. The batch size is
# kept small so a single call stays well within Chroma's batch limits.
#
# The previous version assigned `id = node.id_` at notebook scope, which
# rebinds the builtin `id` for every later cell; no such name is created here.
ADD_BATCH_SIZE = 256
for batch_start in range(0, len(nodes), ADD_BATCH_SIZE):
    node_batch = nodes[batch_start:batch_start + ADD_BATCH_SIZE]
    collection.add(
        documents=[batch_node.get_content() for batch_node in node_batch],
        ids=[batch_node.id_ for batch_node in node_batch],
    )

In [7]:
# Re-bind the collection handle (same name and embedding function as when it
# was first opened), then display how many entries it holds.
# Disabled alternative collection name: "picasso_collection".
collection = client.get_or_create_collection(
    name="picasso_collection_wikipedia",
    embedding_function=sentence_transformer_ef,
)
collection.count()

28292

#### Creating VectorStore from ChromaDB

In [8]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

In [9]:
# Wrap a LangChain HuggingFaceEmbeddings model so it can be handed to llama_index.
# The model name matches the "all-mpnet-base-v2" model used for the Chroma
# embedding function -- presumably so query embeddings are comparable with the
# stored ones; confirm before changing either side.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

  return torch._C._cuda_getDeviceCount() > 0


In [10]:
# Expose the existing Chroma collection to llama_index as a vector store.
vector_store = ChromaVectorStore(chroma_collection=collection)
# Storage context built on that vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Service context that uses the HuggingFace embedding model defined earlier;
# every other setting is left at from_defaults' defaults.
service_context = ServiceContext.from_defaults(embed_model=embed_model)

In [11]:
# Build the index on top of the already-populated vector store.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    service_context=service_context,
)

In [19]:
# Ask a single question through the index's default query engine and print the response.
query_engine = index.as_query_engine()
res = query_engine.query("Whats the meaning of the broken sword in the painting Guernica?")
print(res)


The broken sword near the bottom of the painting symbolizes the defeat of the people at the hand of their tormentors.


In [14]:
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever

# Retrieve nodes directly (no answer synthesis) with similarity_top_k=5.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
node_reps = retriever.retrieve("Whats the meaning of eye in the painting Guernica?")

# Print each retrieved node's text wrapped by the helper, followed by a divider line.
for node_rep in node_reps:
    print_text_with_line_breaks(node_rep.node.get_content())
    print("======")

The Weeping Woman has been described as the most complex, most fragmented and most highly coloured o
f all the weeping women artworks. In addition to the confused mass of hands, mouth, teeth, handkerch
ief and tears in the centre of the painting, Picasso also depicted the eyes with great analytical at
tention. The Tate draws particular attention to the childlike but striking rendition of the eyes, wh
ich have been depicted like boats or overflowing saucers and have been placed on the peaks of the ha
ndkerchief to provide an intense exploration of physical and emotional distress. This element was ex
pressed in earlier works that Picasso produced in the same year, which was most intense between 12 a
nd 18 October 1937. The earlier paintings also featured the symbol of the handkerchief within the co
mposition.The architecture of the weeping woman's face is very distinctive and shares many design el
ements with the four female figures depicted in Guernica. The face is portrayed from mixed 