El archivo ```chunks.bin``` fue construido en el notebook ```eda_documents.ipynb```

In [2]:
import pickle
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('chunks.bin', 'rb') as f_in:
    documents = pickle.load(f_in)

print("# de documentos:", len(documents))
print(documents[0])

# de documentos: 563
page_content='baja de tasas de interes facilita la compra de vivienda en 2024
la compra de vivienda para muchos colombianos se ha visto afectada por diferentes factores economicos como la inflacion, devaluacion de la moneda, entre otros. frente a este panorama, muchos sectores, como el financiero, presentan medidas para la reactivacion de la economia.
el banco de la republica viene haciendo reduccion de tasas de interes desde febrero del 2024, llegando en julio a 11.25%, y' metadata={'id': '2a3d5c52', 'title': 'baja de tasas de interes facilita la compra de vivienda en 2024'}


In [4]:
documents = [
    {
        "id_doc": doc.metadata["id"],
        "chunk": doc.page_content   
    }
    for doc in documents
]

In [5]:
documents[0]

{'id_doc': '2a3d5c52',
 'chunk': 'baja de tasas de interes facilita la compra de vivienda en 2024\nla compra de vivienda para muchos colombianos se ha visto afectada por diferentes factores economicos como la inflacion, devaluacion de la moneda, entre otros. frente a este panorama, muchos sectores, como el financiero, presentan medidas para la reactivacion de la economia.\nel banco de la republica viene haciendo reduccion de tasas de interes desde febrero del 2024, llegando en julio a 11.25%, y'}

### Generacion de embeddings con multilingual-e5-large-instruct

In [6]:
# def get_detailed_instruct(task_description: str, query: str) -> str:
#     return f'Instruct: {task_description}\nQuery: {query}'

## Each query must come with a one-sentence instruction that describes the task
# task = 'Given a web search query, retrieve relevant passages that answer the query'
# queries = [
#     get_detailed_instruct(task, 'how much protein should a female eat'),
#     get_detailed_instruct(task, '南瓜的家常做法')
# ]
## No need to add instruction for retrieval documents
# documents = []
# input_texts = queries + documents

model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
for doc in tqdm(documents):    
    doc["embedding_e5_large_instruct"] = model.encode(
        doc["chunk"], 
        convert_to_tensor=True, 
        normalize_embeddings=True
    )

  0%|          | 0/563 [00:00<?, ?it/s]

: 

**Nota:** no dió mi ram para usar este modelo

### Generacion de embeddings con multilingual-e5-small

In [8]:
model = SentenceTransformer('intfloat/multilingual-e5-small')

for doc in tqdm(documents):    
    doc["embedding_e5_small"] = model.encode(
        f"passage: {doc['chunk']}",
        normalize_embeddings=True
    )

100%|██████████| 563/563 [01:12<00:00,  7.72it/s]


In [9]:
del model

### Generacion de embeddings con paraphrase-multilingual-mpnet-base-v2

In [6]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

for doc in tqdm(documents):
    doc["embedding_paraphrase_mpnet_base"] = model.encode(doc['chunk'])

100%|██████████| 563/563 [03:37<00:00,  2.58it/s]


In [7]:
del model

### Storage

In [11]:
from langchain_community.vectorstores import FAISS

In [13]:
documents[0].keys()

dict_keys(['id_doc', 'chunk', 'embedding_paraphrase_mpnet_base', 'embedding_e5_small'])

In [None]:
import numpy as np

In [18]:
# modelo 1
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_func = HuggingFaceEmbeddings()

text_embedding_pairs = zip([doc["chunk"] for doc in documents], [doc["embedding_e5_small"] for doc in documents])
faiss = FAISS.from_embeddings(text_embedding_pairs, embeddings_func)



In [19]:
faiss

<langchain_community.vectorstores.faiss.FAISS at 0x7f98949e3100>

In [50]:
model_name = "intfloat/multilingual-e5-large-instruct"
# "intfloat/multilingual-e5-small",
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings_func = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs    
)

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.2.2. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





FileNotFoundError: [Errno 2] No such file or directory: '/home/sebasjp/.cache/torch/sentence_transformers/intfloat_multilingual-e5-large-instruct/sentence_xlnet_config.json'

In [None]:
np.array(embeddings_func.embed_documents([a["chunk"]]))

In [24]:
# a = documents[0].copy()

In [45]:
# a["embedding_e5_small"]

array([-0.01406619,  0.00473391, -0.0263987 , -0.10681732,  0.03214642,
       -0.04709134,  0.03635797,  0.06220098,  0.09489433,  0.01204683,
        0.07976241,  0.05135557,  0.03074877, -0.03876662, -0.04131197,
       -0.00199246,  0.03582068, -0.06456685, -0.06676188, -0.04475215,
        0.0120186 ,  0.03614866, -0.05122845,  0.0667139 ,  0.0443642 ,
        0.08165304, -0.04998334,  0.00476078,  0.06206318, -0.05130728,
       -0.05314874, -0.00356795,  0.04158295, -0.10153362,  0.06392787,
        0.02498712, -0.03653572, -0.00644967,  0.02698488, -0.04979852,
       -0.06909255, -0.03419831,  0.01600901,  0.0794198 ,  0.06544348,
        0.04306335, -0.06893021,  0.06271015, -0.05432369, -0.05427798,
       -0.04316064,  0.07737124, -0.02671421,  0.02386271,  0.05244727,
       -0.05437637, -0.07344311, -0.00930264, -0.05723023,  0.01546882,
        0.06439141, -0.00023933,  0.01783319,  0.03348359,  0.03487824,
        0.05938034, -0.03243915,  0.0258558 , -0.04050863, -0.03