In [80]:
import chromadb
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.node_parser.extractors import KeywordExtractor, MetadataExtractor
from chromadb.utils import embedding_functions

def print_text_with_line_breaks(text, line_length=100):
    """Print ``text`` as consecutive fixed-width slices, one slice per line.

    Args:
        text: The string to display.
        line_length: Maximum number of characters per printed slice.

    The text is cut purely by character position, so a slice may end in the
    middle of a word, and newlines already present in ``text`` are printed
    unchanged.
    """
    slice_starts = range(0, len(text), line_length)
    pieces = (text[start:start + line_length] for start in slice_starts)
    for piece in pieces:
        print(piece)


#### Loading Data

In [51]:
# Only plain-text files are ingested; the directory tree is walked recursively.
required_text = [".txt"]
reader = SimpleDirectoryReader(
    input_dir="../src/documents/wikipedia",
    required_exts=required_text,
    recursive=True,
)

# Read every matching file and report how many documents came back.
documents = reader.load_data()
print(f"Loaded {len(documents)}")

Loaded 1395


In [52]:
# Split on "." with a chunk size of 256 and an overlap of 64 between
# neighbouring chunks.
text_splitter = TokenTextSplitter(
    separator=".",
    chunk_size=256,
    chunk_overlap=64,
)

# Keyword metadata extraction is currently switched off; the variant below
# is kept for reference:
# metadata_extractor = MetadataExtractor(extractors=[KeywordExtractor(keywords=10)])
# parser = SimpleNodeParser(text_splitter=text_splitter, metadata_extractor=metadata_extractor)
parser = SimpleNodeParser(text_splitter=text_splitter)

# Turn the loaded documents into nodes (one per chunk).
nodes = parser.get_nodes_from_documents(documents=documents)

In [53]:
# Sanity check: show the text of the first chunk, then the total chunk count.
first_node = nodes[0]
print(first_node.get_content())
print(len(nodes))

100 Great Paintings is a British television series broadcast in 1980 on BBC 2, devised by Edwin Mullins. He chose 20 thematic groups, such as war, the Adoration, the language of colour, the hunt, and bathing, picking five paintings from each. The selection ranges from 12th-century China through the 1950s, with an emphasis on European paintings.  He deliberately avoided especially famous paintings, such as Leonardo da Vinci's Mona Lisa or John Constable's The Haywain. The series is available on VHS and DVD.On the basis of the series, Mullins published the book Great Paintings: Fifty Masterpieces, Explored, Explained and Appreciated (1981), which contained about half of the theme groups. A German translation of Mullins' book appeared as 100 Meisterwerke in 1983. In 1985, a second volume came out, only in Germany, which discussed the remaining 50 paintings
30599


#### Loading and Initializing ChromaDB

In [54]:
# Open a Chroma client whose data lives on disk under the given path.
client = chromadb.PersistentClient(
    path="../src/representations/",
)

# Last expression of the cell: the heartbeat value is displayed as a quick
# check that the client responds.
client.heartbeat()

1690125404519743756

In [27]:
# Embedding function handed to the Chroma collection.
# Alternative kept for reference:
# sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2",
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 7.41MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.24MB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 30.0MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 3.94MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 789kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 230kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:54<00:00, 8.08MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 339kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.67MB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 866kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 3.08M

In [55]:
# Fetch the collection if it already exists in the persistent store,
# otherwise create it; documents are embedded with the function above.
collection = client.get_or_create_collection(
    name="picasso_collection",
    embedding_function=sentence_transformer_ef,
)


In [56]:
# Add the chunks to the collection in batches instead of issuing one
# `collection.add` call per node: each call embeds the documents it is given,
# so grouping them cuts the number of calls from len(nodes) to
# len(nodes) / BATCH_SIZE. The per-item variables are also kept inside
# comprehensions so that nothing named `id` is left in the notebook namespace
# shadowing the builtin.
BATCH_SIZE = 256

for batch_start in range(0, len(nodes), BATCH_SIZE):
    batch = nodes[batch_start:batch_start + BATCH_SIZE]
    collection.add(
        documents=[node.get_content() for node in batch],
        ids=[node.id_ for node in batch],
    )

In [58]:
# Re-acquire the collection handle so this cell can run on its own, then
# display how many entries it holds.
collection = client.get_or_create_collection(
    name="picasso_collection",
    embedding_function=sentence_transformer_ef,
)
collection.count()

30599

#### Creating VectorStore from ChromaDB

In [59]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

In [42]:
# Query-side embedding model: a HuggingFace sentence-transformers model
# wrapped for LlamaIndex. It uses the same model name
# ("all-mpnet-base-v2") as the embedding function given to the Chroma
# collection.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
embed_model = LangchainEmbedding(hf_embeddings)

  return torch._C._cuda_getDeviceCount() > 0


In [60]:
# Service context carrying the embedding model used at query time.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
)

# Expose the Chroma collection as a LlamaIndex vector store and build a
# storage context around it.
vector_store = ChromaVectorStore(
    chroma_collection=collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [61]:
# Build the index on top of the existing vector store (no documents are
# passed in here), using the contexts prepared above.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    service_context=service_context,
)

In [69]:
# Ask a question through the index's default query engine.
query_engine = index.as_query_engine()
res = query_engine.query("Whats the meaning of eye in the Guernica?")

# Print the answer explicitly: the cell previously only assigned `res`, so
# the answer shown beneath it would not be reproduced on a fresh
# Restart & Run All.
print(res)


The eye in the Guernica is meant to represent the all-seeing eye of God, symbolizing His omnipresence and divine providence. It is also a reference to the Illuminati, suggesting the presence of a higher power that is watching and judging the events of the painting. The hidden image of the horse's nostrils and upper teeth forming a human skull is meant to further emphasize the idea of divine judgement.


In [86]:
from llama_index.indices.vector_store.retrievers import VectorIndexRetriever

# Retrieve the five best-matching chunks for the same question, bypassing
# answer synthesis, to see which passages are being retrieved.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=5,
)
node_reps = retriever.retrieve("Whats the meaning of eye in the Guernica?")

# Print each retrieved chunk in fixed-width slices, followed by a separator.
for node_rep in node_reps:
    retrieved_text = node_rep.node.get_content()
    print_text_with_line_breaks(retrieved_text)
    print("======")

Seventeenth-century depictions of the Eye sometimes show it surrounded by clouds or sunbursts. The E
ye of God in a triangle is still used in church architecture and Christian art to symbolize the Trin
ity and God's omnipresence and divine providence. 
The Eye of Providence is notably featured on the 
following Eastern Orthodox, Latter-day Saint, and Catholic buildings, among others:

The Kazan Cathe
dral, Saint Petersburg, Russia
The Salt Lake Temple, Salt Lake City, Utah
Jesuit Church, Mannheim, G
ermany
Shio-Mgvime Monastery, Mtskheta, Georgia

Miscellaneous uses
Commonly in the context of a ref
erence to the Illuminati, numerous video games, TV shows, films, books, and websites contain depicti
ons of the Eye
A bare light bulb in the shape of an all-seeing eye blazes over the suffering horse's head.
To the h
orse's upper right the head and extended right arm of a frightened female figure appears to have flo
ated into the room through a window, and she witnesses the scene. In her rig