In [None]:
# Install the packages listed in requirements.txt. The %pip magic (rather than
# !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
%pip install -i https://test.pypi.org/simple/ easy-ingest-text==0.0.4

In [4]:
import copy
import json
import logging
from typing import List

import easy_ingest_text.defaults
from easy_ingest_text.embed_text import Embedder
from easy_ingest_text.enhanced_document import EnhancedDocument
from easy_ingest_text.ingest_text import Ingester
from easy_ingest_text.load_text import Loader

In [5]:
# Set up the root logger: emit INFO and above, and prefix every record with a
# timestamp, the emitting file and line number, and the severity level.
log_line_layout = "%(asctime)s -  %(filename)s:%(lineno)d - %(levelname)s - %(message)s"
log_timestamp_layout = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(
    level=logging.INFO,
    datefmt=log_timestamp_layout,
    format=log_line_layout,
)

In [6]:
class CustomLoader(Loader):
    """Custom logic for converting files to EnhancedDocuments.

    Files whose extension is ``json`` are parsed here; every other file is
    handed to the parent ``Loader`` implementation.
    """

    def file_to_docs(self, file_path: str) -> List[EnhancedDocument]:
        """Convert a single file into a list of EnhancedDocuments.

        Args:
            file_path: Path of the file to convert.

        Returns:
            For a ``.json`` file: a one-element list whose document holds the
            JSON ``"text"`` field as content, or an empty list if the file
            could not be parsed (the failure is printed and the file skipped).
            For any other file: whatever the parent ``Loader`` returns.
        """
        file_extension = file_path.split(".")[-1]
        if file_extension != "json":
            return super().file_to_docs(file_path)

        # Top-level JSON fields copied verbatim into the metadata. A tuple
        # (not a set) keeps the metadata key order the same on every run.
        metadata_keys = ("title", "url", "site_full", "language", "published")

        # JSON text is UTF-8 by specification; do not rely on the platform's
        # default encoding.
        with open(file_path, encoding="utf-8") as fin:
            try:
                data = json.load(fin)
                text = data["text"]
                # TODO(STP): Add the filename to the metadata.
                metadata = {
                    key: data[key] for key in metadata_keys if key in data
                }
                if "source" in data:
                    # HACK(STP): Since source is a reserved keyword for
                    # document metadata, we need to rename it here. The check
                    # must look at `data`: "source" is never copied into
                    # `metadata` above, so testing `metadata` never matched.
                    metadata["source_"] = data["source"]
                metadata["source"] = file_path
                return [EnhancedDocument(page_content=text, metadata=metadata)]
            except Exception as e:
                # Best-effort ingestion: report the offending path (not the
                # file object's repr) and carry on with the remaining files.
                print(f"Failed to parse {file_path}: {e}. Skipping for now")
                return []

In [7]:
# Work on a private deep copy so that the library's module-level defaults are
# not modified for any other code that reads them later in this kernel.
vectorstore_config = copy.deepcopy(
    easy_ingest_text.defaults.DEFAULT_VECTORSTORES_CONFIG
)
# Turn on local saving for the FAISS vectorstore.
vectorstore_config["FAISS"]["save_local_config"]["save_local"] = True
embedder = Embedder(vectorstore_config=vectorstore_config)
ingester = Ingester(loader=CustomLoader(), embedder=embedder)
# NOTE(STP): The zipped dataset must be available at the `input_dir` path below
# (one level above the notebook's working directory) for this to work.
# NOTE(review): an earlier version of this note said to upload the dataset to
# the current directory (`/content`); confirm the intended location.
# The demo dataset used can be found here: https://www.kaggle.com/datasets/jeet2016/us-financial-news-articles
ingester.ingest_dataset(
    input_dir="../financial_dataset.zip",
    is_zipped=True,
    save_intermediate_docs=True,
    output_dir="output_financial_dataset",
    detailed_progress=True,
    max_files=500,
)

2024-08-10 14:50:48 -  SentenceTransformer.py:197 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


Extracted ../financial_dataset.zip into financial_dataset
Extracted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_04_112b52537b67659ad3609a234388c50a.zip into financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_04_112b52537b67659ad3609a234388c50a
Deleted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_04_112b52537b67659ad3609a234388c50a.zip
Extracted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_02_112b52537b67659ad3609a234388c50a.zip into financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_02_112b52537b67659ad3609a234388c50a
Deleted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_02_112b52537b67659ad3609a234388c50a.zip
Extracted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_05_112b52537b67659ad3609a234388c50a.zip into financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_05_112b52537b67659ad3609a234388c50a
Deleted financial_dataset/3811_112b52537b67659ad3609a234388c50a/2018_05_112b52537b676

Ingesting files:  81%|████████  | 406/500 [00:14<00:03, 28.89files/s]2024-08-10 14:52:26 -  embed_text.py:318 - INFO - 
Successfully saved vectorstore of length 3920 to: faiss_index
Ingesting files:  81%|████████  | 406/500 [00:18<00:04, 21.86files/s]


In [3]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

from easy_ingest_text.defaults import DEFAULT_VECTORSTORES_CONFIG, DEFAULT_EMBEDDERS_CONFIG

# Embedding model built from the library's default HuggingFace settings.
hf_embedder = HuggingFaceEmbeddings(**DEFAULT_EMBEDDERS_CONFIG["HuggingFace"])
# Build a fresh kwargs dict instead of inserting the embedder into the
# library's shared default dict, so the module-level defaults stay unmodified
# and re-running this cell always starts from the same inputs.
load_local_config = {
    **DEFAULT_VECTORSTORES_CONFIG["FAISS"]["load_local_args"],
    "embeddings": hf_embedder,
}
vectorstore_instance = FAISS.load_local(**load_local_config)
# One docstore id is kept per stored entry, so the mapping's length is the
# number of documents in the vectorstore.
num_documents = len(vectorstore_instance.index_to_docstore_id)
print(f"Total number of documents stored in FAISS vectorstore: {num_documents}")

  from tqdm.autonotebook import tqdm, trange


Total number of documents stored in FAISS vectorstore: 3920


In [4]:
# Bare last-line expression: shows the repr of the loaded vectorstore object.
vectorstore_instance

<langchain_community.vectorstores.faiss.FAISS at 0x7245984141f0>

In [5]:
# Query the vectorstore for the three entries most similar to the search text,
# then print each hit's content followed by its metadata.
search_query = "agriculture companies in south america"
results = vectorstore_instance.similarity_search(query=search_query, k=3)
for hit in results:
    print(
        f"* Page Content: {hit.page_content}\n ... \nMetadata: {hit.metadata}\n\n-----"
    )

* Page Content: Brazilian producers in the nation’s key center-west agricultural belt usually plant soy in the summer and corn right after the oilseed is harvested in a crop rotation system.
“Soy brings corn together. If soy production expands, corn follows,” Fogaça said.
Along with the new factories, Longping plans to build research centers to improve its corn seeds and to start developing new soy and sorghum seeds, the executives said.
 ... 
Metadata: {'language': 'english', 'title': "China's Longping sees Brazil corn expansion, targets seeds market", 'published': '2018-05-29T21:05:00.000+03:00', 'url': 'https://www.reuters.com/article/us-brazil-corn-lpht/chinas-longping-sees-brazil-corn-expansion-targets-seeds-market-idUSKCN1IU2ER', 'source': 'financial_dataset/2018_05_112b52537b67659ad3609a234388c50a/news_0055330.json', 'content_hash': 'f53152d5-2d4a-504d-9372-5d0ea39e8645', 'metadata_hash': '69afe38f-9cd2-5de3-9d70-cad79d36f2bb', 'document_hash': '624f3268-2d25-55e6-b02a-14d91209d