In [None]:
import hashlib
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings

from langchain.vectorstores import FAISS

In [None]:
# Embedding model handed to FAISS.from_documents below to vectorize the chunks.
# The 'all-MiniLM-L6-v2' name is resolved by SentenceTransformerEmbeddings;
# presumably the weights are fetched on first use — TODO confirm.
model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')


In [51]:
def gen_hash(text: str, size: int = 32):
    """Return a truncated SHA-256 fingerprint of ``text``.

    Args:
        text: String to fingerprint; it is encoded as UTF-8 before hashing.
        size: Number of leading hex characters to keep. The full SHA-256 hex
            digest is 64 characters, so larger values return the whole digest.

    Returns:
        The first ``size`` characters of the lowercase hex digest.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8'))
    full_hex = hasher.hexdigest()
    return full_hex[:size]

In [52]:
# Read the overview text file into a list of Document objects.
# load() is used instead of lazy_load(): lazy_load() hands back a one-shot
# iterator, so once the splitting cell has consumed it, re-running that cell
# on its own would silently see no documents and produce no chunks.
loader = TextLoader('./docs/data_science_overview.txt')
documents = loader.load()

In [53]:
# Break the loaded documents into overlapping chunks (size 500, overlap 100).
# add_start_index=True stores each chunk's offset in the source text under the
# 'start_index' metadata key.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    add_start_index=True,
)
chunks = splitter.split_documents(documents)

In [54]:
# Tag every chunk, in place, with a truncated SHA-256 fingerprint of its text
# under the 'hash' metadata key.
for chunk in chunks:
    chunk.metadata.update({'hash': gen_hash(chunk.page_content)})

In [None]:
# Build a FAISS vector store from the hash-tagged chunks, embedding each one
# with `model`.
db = FAISS.from_documents(chunks, model)

In [56]:
# Show the vector store object to confirm it was created.
db

<langchain_community.vectorstores.faiss.FAISS at 0x35f72b6e0>

In [71]:
# Retrieve up to 3 (document, relevance score) pairs for the query, then
# display only the pairs whose score is above 0.65.
# NOTE(review): this rebinds `documents`, which earlier held the loader output;
# re-running the splitting cell after this one would receive search hits
# instead of loaded documents. Consider a distinct name.
documents = db.similarity_search_with_relevance_scores(
    "What is data science?",
    k=3,
)
[(doc, score) for doc, score in documents if score > 0.65]

[(Document(id='c6991401-f36e-4a26-8b29-118ebb67f860', metadata={'source': './docs/data_science_overview.txt', 'start_index': 348, 'hash': 'd56174581a0ec6ce0a6111a80617024e'}, page_content='What is Data Science?\n\nData science is the practice of deriving actionable insights from data through a combination of:\n- Statistical analysis\n- Machine learning\n- Data visualization\n- Domain knowledge\n- Programming skills\n- Business acumen\n\nThe goal is to transform raw data into meaningful information that can drive decision-making and solve real-world problems.\n\nThe Data Science Process'),
  np.float32(0.7887581)),
 (Document(id='3c94c512-d26d-4d1f-8a84-f0cec53a3ae0', metadata={'source': './docs/data_science_overview.txt', 'start_index': 0, 'hash': '4292541e16c9140bd5390294860997bf'}, page_content='Data Science: A Comprehensive Overview\n\nData science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from st