In [2]:
from langchain.document_loaders import DirectoryLoader, CSVLoader
from langchain.document_loaders import WebBaseLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores.faiss import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.llms.gpt4all import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# Document Loaders
# Collect every file in "data" that matches the "*.csv" glob and parse each
# one with CSVLoader; `raw_docs` holds the resulting documents.
loader = DirectoryLoader(
    "data",
    glob="*.csv",
    loader_cls=CSVLoader,
)
raw_docs = loader.load()

In [5]:
# Document Transformers
# Break the loaded documents into small chunks (chunk_size=100) that do not
# overlap with each other (chunk_overlap=0) before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=100,
)

docs = text_splitter.split_documents(raw_docs)

In [10]:
# Embedding
# Build the FAISS index from the split chunks, embedding them with
# HuggingFaceEmbeddings constructed with its default settings.
# NOTE(review): the default embedding model is not chosen explicitly here;
# since the sample query in this notebook is Japanese, confirm the default
# model handles Japanese text well enough.
embeddings = HuggingFaceEmbeddings()
faiss_db = FAISS.from_documents(docs, embeddings)

In [12]:
# Vector Store
# Persist the index to disk. FAISS_DB_DIR carries its own leading slash, so
# the two parts are joined by plain concatenation: "./vectorstore/faiss_db".
FAISS_DB_PATH = "./vectorstore"
FAISS_DB_DIR = "/faiss_db"
faiss_db.save_local(f"{FAISS_DB_PATH}{FAISS_DB_DIR}")

In [15]:
# Retrievers
# Wrap the FAISS index in a retriever and run one sample query against it.
query = "日本1番のビルは？"  # Japanese: "What is the number-one building in Japan?"
retriever = faiss_db.as_retriever()
context_docs = retriever.get_relevant_documents(query)

# Report how many documents were returned ...
print(f"len={len(context_docs)}")

# ... then show the metadata and text of the first one.
first_doc = context_docs[0]
print(f"metadata={first_doc.metadata}")
print(first_doc.page_content)

len=4
metadata={'source': 'data/building.csv', 'row': 0}
順位: 1
竣工年: 2027
ビル名: トーチタワー
高さ(m): 390
所在地: 東京都千代田区
国名: 日本


hello world
