#### Defining the LLM and the embedding model

In [1]:
import pandas as pd
import numpy as np
from llm_commons.langchain.proxy import ChatOpenAI
from llm_commons.langchain.proxy import OpenAIEmbeddings
from ipywidgets import widgets

# Dropdown listing the proxy deployments that can back the chat model.
# NOTE: this cell does not display the widget and reads `.value` immediately
# below, so the default ("gpt-35-turbo-16k") is used unless the selection is
# changed and the cell is re-run.
llm_model_name = widgets.Dropdown(
    options=[
        "gpt-35-turbo",
        "gpt-35-turbo-16k",
        "gpt-4",
        "gpt-4-32k",
        "gpt-4-turbo",
        "gemini-1.0-pro",
        "gpt-4-vision",
        # "tiiuae--falcon-40b-instruct"
    ],
    value="gpt-35-turbo-16k",
    description="LLM Model Name",
    disabled=False,
)


# LangChain-style chat and embedding clients served through the llm_commons proxy.
llm = ChatOpenAI(proxy_model_name=llm_model_name.value)
embeddings = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002')

from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.langchain import LangChainLLM

# Adapters that expose the LangChain clients through LlamaIndex's interfaces.
llama_llm = LangChainLLM(llm)
llama_emb = LangchainEmbedding(embeddings)

from llama_index.core import Settings

# Register the adapters (not the raw LangChain objects) as the global defaults,
# so the objects LlamaIndex uses are exactly the ones named in this notebook.
Settings.embed_model = llama_emb
Settings.llm = llama_llm

  from llm_commons.langchain.proxy import ChatOpenAI


### Creating LlamaIndex documents from the corpus data - Scifact & nfcorpus

In [35]:
import json
import os
from llama_index.core import Document

# Root folder holding the `scifact` and `nfcorpus` corpora. Set the DATASETS_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
datasets_root = os.environ.get(
    "DATASETS_DIR", "C:/Users/I068117/UT_Machine Learning/datasets"
)

# Cluster label -> rows of the Scifact corpus that receive that label.
scifact_clusters = {
    "cluster1": slice(0, 5),
    "cluster2": slice(5, 10),
    "cluster3": slice(10, 15),
}

Lama_doc = []

# --- Scifact: first 15 rows, five per cluster --------------------------------
data_path = f'{datasets_root}/scifact'
# Explicit encoding: the platform default is not UTF-8 everywhere (e.g. Windows).
with open(f'{data_path}/corpus.json', encoding='utf-8') as file:
    data = json.load(file)

for cluster, rows in scifact_clusters.items():
    for item in data[rows]:
        Lama_doc.append(
            Document(
                text=item['text'],
                metadata=dict(
                    idx=item['_id'],
                    title=item['title'],
                    dataset="Scifact",
                    cluster=cluster,
                ),
            )
        )

# --- nfcorpus: first 15 rows, deliberately WITHOUT a `cluster` key ------------
# (the strict-filter "cons" demo below relies on this key being absent)
data_path = f'{datasets_root}/nfcorpus'
with open(f'{data_path}/corpus.json', encoding='utf-8') as file:
    data = json.load(file)

for item in data[:15]:
    Lama_doc.append(
        Document(
            text=item['text'],
            metadata=dict(idx=item['_id'], title=item['title'], dataset="nfcorpus"),
        )
    )

#Lama_doc[0]  

In [6]:
from llama_index.core.node_parser import SentenceSplitter
# Sentence-based splitter constructed with its default settings.
node_parser = SentenceSplitter()
# Chunk every document in Lama_doc into nodes.
nodes_2= node_parser.get_nodes_from_documents(Lama_doc)
# Spot-check one node, printing its text together with all of its metadata.
print(nodes_2[10].get_content(metadata_mode="all"))

Document(id_='8c1e36d1-4517-45b5-8c1d-4919414aec45', embedding=None, metadata={'idx': '4983', 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.', 'dataset': 'Scifact', 'cluster': 'cluster1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at t

##### ChromaVector store

In [17]:
import nest_asyncio
# Patch asyncio so event loops can be nested — presumably needed because the
# notebook kernel already runs one; TODO confirm it is still required.
nest_asyncio.apply()

In [16]:
import logging
import sys

# Send INFO-level logs to stdout through exactly one handler.
# The earlier version called basicConfig() and then added a second StreamHandler,
# which printed every record twice and added one more copy on each re-run.
# force=True replaces any handlers already on the root logger, so re-running
# this cell is idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [11]:
import chromadb

# In-memory Chroma client; nothing is written to disk.
chroma_client = chromadb.EphemeralClient()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "quickstart" already exists in the client.
chroma_collection = chroma_client.get_or_create_collection("quickstart")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [13]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
#Storing index on the disk
from llama_index.core import (
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)

# Wrap the Chroma collection in LlamaIndex's vector-store adapter ...
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# ... and build a storage context that uses it as the vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build an index on the Chroma-backed storage context and persist it to disk.
# DO NOT RUN as part of a normal pass through the notebook.
# NOTE(review): `nodes` is not defined in the cells shown above (only `nodes_2`
# is) — confirm which node list is intended before running.
index = VectorStoreIndex(nodes, storage_context=storage_context)
index.storage_context.persist(persist_dir='C:/Users/I068117/UT_Machine Learning/datasets/scifact/chromaindex')

### Strict Meta-data filtering

In [None]:
# Dedicated Chroma collection for the strict-filter experiment, so its vectors
# stay separate from any other index built on the shared `storage_context`.
strict_collection = chroma_client.get_or_create_collection("strict_filter")
strict_storage_context = StorageContext.from_defaults(
    vector_store=ChromaVectorStore(chroma_collection=strict_collection)
)

# `storage_context` must be passed by keyword: the second positional parameter
# of VectorStoreIndex(...) is not the storage context, so the earlier positional
# call silently fell back to the default in-memory vector store.
index_strictfilter = VectorStoreIndex(nodes_2, storage_context=strict_storage_context)

In [68]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Hard constraint: only nodes whose `dataset` metadata equals "Scifact" qualify.
filters = MetadataFilters(
    filters=[MetadataFilter(value="Scifact", operator=FilterOperator.EQ, key="dataset")]
)

# Top-10 similarity retriever over the strict-filter index, restricted by `filters`.
retriever = index_strictfilter.as_retriever(similarity_top_k=10, filters=filters)

In [69]:
# Run the query and tabulate [idx, dataset, cluster] for every retrieved node.
context = retriever.retrieve("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS.")
ci = [
    [hit.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for hit in context
]
ci

INFO:httpx:HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"


[['5836', 'Scifact', 'cluster1'],
 ['72159', 'Scifact', 'cluster3'],
 ['19238', 'Scifact', 'cluster1'],
 ['18670', 'Scifact', 'cluster1'],
 ['33370', 'Scifact', 'cluster2'],
 ['92308', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['92499', 'Scifact', 'cluster3'],
 ['79447', 'Scifact', 'cluster3'],
 ['54440', 'Scifact', 'cluster2']]

In [70]:
from llama_index.core.vector_stores import FilterOperator, FilterCondition
# Both conditions must hold (AND): dataset == "Scifact" and cluster "cluster1".
# The cluster filter names no operator, so the library default applies.
filters = MetadataFilters(
    condition=FilterCondition.AND,
    filters=[
        MetadataFilter(value="Scifact", operator=FilterOperator.EQ, key="dataset"),
        MetadataFilter(value="cluster1", key="cluster"),
    ],
)

# Top-10 similarity retriever over the strict-filter index, restricted by `filters`.
retriever = index_strictfilter.as_retriever(similarity_top_k=10, filters=filters)

In [71]:
# Same query as before, now through the dataset+cluster filter; list what came back.
context = retriever.retrieve("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS.")
ci = [
    [meta['idx'], meta['dataset'], meta['cluster']]
    for meta in (hit.node.metadata for hit in context)
]
ci

INFO:httpx:HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"


[['5836', 'Scifact', 'cluster1'],
 ['19238', 'Scifact', 'cluster1'],
 ['18670', 'Scifact', 'cluster1'],
 ['4983', 'Scifact', 'cluster1'],
 ['7912', 'Scifact', 'cluster1']]

### Cons: Strict Metadata Filtering
Not suitable for heterogeneous metadata:
Consider two collections representing two lines of business (LoBs): HR policy documents maintained by SFSF and travel policy documents maintained by Concur. Each LoB might have its own metadata taxonomy with different keys. If a query specifies both concurRole='engineer' and sfsfRole='manager', no document will have both keys and the query result will be empty.

In [72]:
from llama_index.core.vector_stores import FilterOperator, FilterCondition
# Demonstrates the drawback: nfcorpus documents were loaded without a `cluster`
# key, so requiring dataset == "nfcorpus" AND cluster "cluster1" matches nothing.
filters = MetadataFilters(
    condition=FilterCondition.AND,
    filters=[
        MetadataFilter(value="nfcorpus", operator=FilterOperator.EQ, key="dataset"),
        MetadataFilter(value="cluster1", key="cluster"),
    ],
)

retriever = index_strictfilter.as_retriever(similarity_top_k=10, filters=filters)

In [73]:
# Query through the nfcorpus+cluster filter and tabulate the hits.
context = retriever.retrieve("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS.")
ci = [
    [hit.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for hit in context
]
ci

INFO:httpx:HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"


[]

### Non-Strict Meta-data filtering using embedding search

In this approach, the query is embedded together with the nonStrictMetadataFilter, and an embedding search is then run with the combined text.

The embeddings stored in the vector database represent each chunk together with its metadata, i.e. the metadata is embedded along with the document text.

###### Embedding document+meta-data non strict filters

In [34]:
import json
import os
from llama_index.core import Document

# Root folder holding the `scifact` and `nfcorpus` corpora. Set the DATASETS_DIR
# environment variable to run the notebook on another machine; the default is
# the location the notebook was originally written against.
datasets_root = os.environ.get(
    "DATASETS_DIR", "C:/Users/I068117/UT_Machine Learning/datasets"
)

# Cluster label -> rows of the Scifact corpus that receive that label.
scifact_clusters = {
    "cluster1": slice(0, 5),
    "cluster2": slice(5, 10),
    "cluster3": slice(10, 15),
}

Lama_doc_meta = []

# --- Scifact: metadata (id, title, cluster) is appended to the text itself ----
data_path = f'{datasets_root}/scifact'
# Explicit encoding: the platform default is not UTF-8 everywhere (e.g. Windows).
with open(f'{data_path}/corpus.json', encoding='utf-8') as file:
    data = json.load(file)

for cluster, rows in scifact_clusters.items():
    for item in data[rows]:
        # Newline-separated so the id/title/cluster are not fused onto the last
        # word of the text (plain `+` produced e.g. "...imaging.cluster1").
        text_with_meta = "\n".join([item['text'], item['_id'], item['title'], cluster])
        Lama_doc_meta.append(
            Document(
                text=text_with_meta,
                metadata=dict(
                    idx=item['_id'],
                    title=item['title'],
                    dataset="Scifact",
                    cluster=cluster,
                ),
            )
        )

# --- nfcorpus: id and title appended; these rows have no cluster --------------
data_path = f'{datasets_root}/nfcorpus'
with open(f'{data_path}/corpus.json', encoding='utf-8') as file:
    data = json.load(file)

for item in data[:15]:
    text_with_meta = "\n".join([item['text'], item['_id'], item['title']])
    Lama_doc_meta.append(
        Document(
            text=text_with_meta,
            metadata=dict(idx=item['_id'], title=item['title'], dataset="nfcorpus"),
        )
    )

In [None]:
# Index the text+metadata documents on the Chroma-backed storage context.
index_nonstrictfilter_emb = VectorStoreIndex.from_documents(
    Lama_doc_meta,
    storage_context=storage_context,
)

In [74]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Only the dataset is enforced strictly; the cluster preference is expressed in
# the query text of the next retrieval cell instead.
filters = MetadataFilters(
    filters=[MetadataFilter(value="Scifact", operator=FilterOperator.EQ, key="dataset")]
)

# Top-10 similarity retriever over the text+metadata index.
retriever = index_nonstrictfilter_emb.as_retriever(similarity_top_k=10, filters=filters)

#### Embedding the non-strict filter in the query + any meta-data associated with the query

In [64]:
# Do not truncate long strings when pandas displays a column.
pd.set_option('display.max_colwidth', None)
# Show the `new_query` text of the row whose `_id` is 1306.
# NOTE(review): `df` is not defined in any cell shown in this notebook — confirm where it is loaded.
df[df['_id']== 1306]['new_query'] # no metadata associated with this query

745    Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS. 
Name: new_query, dtype: object

In [75]:
# The soft "cluster 1" preference is appended to the query text itself, then the
# hits are tabulated as [idx, dataset, cluster].
context = retriever.retrieve("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS.+ cluster 1")
ci = [
    [hit.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for hit in context
]
ci

INFO:httpx:HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"


[['72159', 'Scifact', 'cluster3'],
 ['5836', 'Scifact', 'cluster1'],
 ['19238', 'Scifact', 'cluster1'],
 ['18670', 'Scifact', 'cluster1'],
 ['92308', 'Scifact', 'cluster3'],
 ['33370', 'Scifact', 'cluster2'],
 ['54440', 'Scifact', 'cluster2'],
 ['36474', 'Scifact', 'cluster2'],
 ['92499', 'Scifact', 'cluster3'],
 ['7912', 'Scifact', 'cluster1']]

#### Cons - Non-Strict Metadata Filtering using Embedding Search
1. Might retrieve irrelevant textual chunks pertaining to the metadata but not the query.
2. Stored chunk embeddings must contain metadata to support nonStrictMetadataFilter. To also support queries without a nonStrictMetadataFilter, we need to store both text-only chunk embeddings and text+metadata chunk embeddings, increasing storage and embedding cost.
Non-Strict Metadata Filter

### Non-strict meta data filtering using reranker model

In this approach, after the embedding search has been done, the query together with the nonStrictMetadataFilter is passed to the re-ranker model, which should take the filter into account when re-ranking.

#### Strict meta-data filter on the embedded search

In [77]:
# First stage for the re-ranking experiments: strict dataset == "Scifact" filter,
# top-10 by similarity from the strict-filter index.
filters = MetadataFilters(
    filters=[MetadataFilter(value="Scifact", operator=FilterOperator.EQ, key="dataset")]
)

retriever = index_strictfilter.as_retriever(similarity_top_k=10, filters=filters)

In [78]:
# Candidate set that the re-rankers below reorder; tabulated as [idx, dataset, cluster].
context = retriever.retrieve("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS.")
ci = [
    [meta['idx'], meta['dataset'], meta['cluster']]
    for meta in (hit.node.metadata for hit in context)
]
ci

INFO:httpx:HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://api.ai.intprod-eu12.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/dcf97822fc795ac1/embeddings?api-version=2023-09-01-preview "HTTP/1.1 200 OK"


[['5836', 'Scifact', 'cluster1'],
 ['72159', 'Scifact', 'cluster3'],
 ['19238', 'Scifact', 'cluster1'],
 ['18670', 'Scifact', 'cluster1'],
 ['33370', 'Scifact', 'cluster2'],
 ['92308', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['92499', 'Scifact', 'cluster3'],
 ['79447', 'Scifact', 'cluster3'],
 ['54440', 'Scifact', 'cluster2']]

In [83]:
from llama_index.core.postprocessor import SentenceTransformerRerank

  from tqdm.autonotebook import tqdm, trange


INFO:datasets:PyTorch version 2.3.1 available.
PyTorch version 2.3.1 available.
PyTorch version 2.3.1 available.




config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [88]:
# Re-ranker backed by the "BAAI/bge-reranker-base" model, configured with top_n=10.
reranker = SentenceTransformerRerank(top_n=10, model="BAAI/bge-reranker-base")

#### Re-ranker model without non-strict meta data filtering

In [89]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
# Baseline: re-rank the retrieved candidates with the bare query (no cluster hint).
reranked_nodes = reranker.postprocess_nodes(
    context,
    query_bundle=QueryBundle("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS"),
)
ci = [
    [ranked.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for ranked in reranked_nodes
]
ci

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[['5836', 'Scifact', 'cluster1'],
 ['72159', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['54440', 'Scifact', 'cluster2'],
 ['18670', 'Scifact', 'cluster1'],
 ['92308', 'Scifact', 'cluster3'],
 ['92499', 'Scifact', 'cluster3'],
 ['33370', 'Scifact', 'cluster2'],
 ['79447', 'Scifact', 'cluster3'],
 ['19238', 'Scifact', 'cluster1']]

#### Re-ranking with non-strict meta-data filter by adding it to the query - introducing bias for cluster 3

In [91]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
# Same candidates, but the query now carries the soft "cluster 3" preference.
reranked_nodes = reranker.postprocess_nodes(
    context,
    query_bundle=QueryBundle("Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS. cluster 3"),
)
ci = [
    [ranked.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for ranked in reranked_nodes
]
ci

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[['5836', 'Scifact', 'cluster1'],
 ['72159', 'Scifact', 'cluster3'],
 ['92308', 'Scifact', 'cluster3'],
 ['79447', 'Scifact', 'cluster3'],
 ['92499', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['19238', 'Scifact', 'cluster1'],
 ['33370', 'Scifact', 'cluster2'],
 ['54440', 'Scifact', 'cluster2'],
 ['18670', 'Scifact', 'cluster1']]

#### Cohere Re-ranker model without non-strict meta data filtering

In [76]:
import os

import cohere

# Initialise the Cohere client from the COHERE_API_KEY environment variable.
# Credentials must never be hard-coded in a notebook: the key that used to be
# written here should be treated as compromised and rotated.
# Raises KeyError if COHERE_API_KEY is not set.
co = cohere.Client(os.environ["COHERE_API_KEY"])

In [116]:
# Plain-text form (content plus all metadata) of each retrieved node, in retrieval
# order. Assumes every item in `context` exposes get_content(metadata_mode=...).
context_doc = [hit.get_content(metadata_mode="all") for hit in context]

In [154]:
# Baseline Cohere re-rank with the bare query (no cluster hint).
rerank_docs = co.rerank(
    model="rerank-english-v2.0",
    query="Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS",
    documents=context_doc,
    top_n=10,
)
# Each result's `index` points back into `context_doc`, hence into `context`.
cohere_ranked_nodes = [context[result.index] for result in rerank_docs.results]
ci = [
    [ranked.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for ranked in cohere_ranked_nodes
]
ci

[['5836', 'Scifact', 'cluster1'],
 ['72159', 'Scifact', 'cluster3'],
 ['92499', 'Scifact', 'cluster3'],
 ['18670', 'Scifact', 'cluster1'],
 ['79447', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['92308', 'Scifact', 'cluster3'],
 ['54440', 'Scifact', 'cluster2'],
 ['19238', 'Scifact', 'cluster1'],
 ['33370', 'Scifact', 'cluster2']]

#### Re-ranking with non-strict meta-data filter by adding it to the query - introducing bias for cluster 3

In [156]:
# Cohere re-rank with the soft "Cluster 3" preference appended to the query.
rerank_docs = co.rerank(
    model="rerank-english-v2.0",
    query="Toll-like receptor (TLR) signaling is involved in the pathogenesis of human MDS. Cluster 3",
    documents=context_doc,
    top_n=10,
)
# Each result's `index` points back into `context_doc`, hence into `context`.
cohere_ranked_nodes = [context[result.index] for result in rerank_docs.results]
ci = [
    [ranked.node.metadata[key] for key in ('idx', 'dataset', 'cluster')]
    for ranked in cohere_ranked_nodes
]
ci

INFO:httpx:HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"


[['72159', 'Scifact', 'cluster3'],
 ['5836', 'Scifact', 'cluster1'],
 ['18670', 'Scifact', 'cluster1'],
 ['92499', 'Scifact', 'cluster3'],
 ['36474', 'Scifact', 'cluster2'],
 ['79447', 'Scifact', 'cluster3'],
 ['92308', 'Scifact', 'cluster3'],
 ['54440', 'Scifact', 'cluster2'],
 ['19238', 'Scifact', 'cluster1'],
 ['33370', 'Scifact', 'cluster2']]

### Metadata extractors for the documents

In [152]:
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

from llama_index.core.schema import MetadataMode

from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

# LLM-backed extractor list: a single QuestionsAnsweredExtractor configured with
# questions=3, the wrapped chat model, and metadata_mode=MetadataMode.EMBED.
extractors_1 = [
    QuestionsAnsweredExtractor(
        questions=3, llm=llama_llm, metadata_mode=MetadataMode.EMBED
    ),
]

class CustomExtractor(BaseExtractor):
    """Toy metadata extractor that tags every node with a fixed cluster label.

    `llama_index.core`'s ``BaseExtractor`` uses the asynchronous ``aextract`` as
    its abstract hook (the pipeline awaits it), so ``aextract`` must be
    implemented for this class to be instantiable. ``extract`` is kept as a
    synchronous entry point with the same result.
    """

    def extract(self, nodes):
        """Return one ``{"custom": "Cluster1"}`` dict per node, in node order."""
        return [{"custom": "Cluster1"} for _ in nodes]

    async def aextract(self, nodes):
        """Asynchronous counterpart of :meth:`extract`; returns the same list."""
        return self.extract(nodes)

In [19]:
from llama_index.core.ingestion import IngestionPipeline

# Process nodes with the custom metadata extractor.
# `transformations` needs extractor *instances*: the earlier `*CustomExtractor`
# tried to star-unpack the class itself, which raises a TypeError.
pipeline = IngestionPipeline(transformations=[node_parser, CustomExtractor()])

# `nodes_2` is the node list produced by the sentence splitter earlier in the
# notebook (`nodes` is not defined there); only the first three nodes are used
# to keep the run short.
nodes_1 = pipeline.run(nodes=nodes_2[:3], in_place=False, show_progress=True)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

  warn_deprecated(
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.35s/it]


### Auto-retrieval filtering using an LLM

In [126]:
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo

# Description of the store's content and filterable metadata that is handed to
# the auto-retriever.
vector_store_info = VectorStoreInfo(
    content_info="Scifact",
    metadata_info=[
        MetadataInfo(
            name="idx",
            type="str",
            description=(
                "it's number index in the metadata"
            ),
        ),
    ],
)

# Use `index_strictfilter`: on a top-to-bottom run the name `index` is either
# undefined (its only assignment is in the "do not run" persist cell) or still
# holds a document-id string left over from the corpus-loading loops.
retriever = VectorIndexAutoRetriever(
    index_strictfilter, vector_store_info=vector_store_info, similarity_top_k=10)