In [2]:
import os
from dotenv import load_dotenv
import openai
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [5]:
# Point a DirectoryLoader at ./WikiData/ and load its contents.
# The loader gets its own name so `documents` only ever refers to the
# loaded result.
wiki_loader = DirectoryLoader("./WikiData/")
documents = wiki_loader.load()

In [6]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
doc_splits = text_splitter.split_documents(documents)

# Report how many chunks the split produced.
print(len(doc_splits))

101


In [7]:
# Embedding model handed to the vector store below; built from the
# "sentence-transformers/all-MiniLM-L12-v2" checkpoint name.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Create the Milvus vector store from the document chunks, using
# `embed_model` for the embeddings and the "wikiQA" collection.
# Connection target: Milvus at localhost:19530.
milvus_connection = {"host": "localhost", "port": 19530}
vector_store = Milvus.from_documents(
    doc_splits,
    embedding=embed_model,
    collection_name="wikiQA",
    connection_args=milvus_connection,
)

In [9]:
# Run a similarity search against the vector store for the question below.
# NOTE(review): "Hisenberg" looks like a misspelling of "Heisenberg"; the
# recorded output still surfaces the relevant passage — confirm whether the
# spelling is deliberate before changing it.
query = "Who is Hisenberg?"
docs = vector_store.similarity_search(query)

# Dump the retrieved Document objects for inspection.
print(docs)

[Document(page_content='early scripts.The writers also opened up on their collaborative process and how their form of storytelling evolved with the show. According to writer George Mastras,', metadata={'source': 'WikiData\\Breaking Bad.txt'}), Document(page_content='=== Main characters === Bryan Cranston as Walter White, a high-school chemistry teacher who, shortly after his 50th birthday, is diagnosed with Stage III lung cancer and turns to making meth to secure his family\'s finances. As his shady business progresses, Walter gains a notorious reputation under the alias of "Heisenberg". Cranston stated that, though he enjoyed doing comedy, he decided he ... should really focus on doing something else. But I think any good drama worth its weight always has a sprinkling of comedy in it, because you can ease the tension to an audience when it\'s necessary, and then build it back up again. Walt White has no clue he\'s occasionally funny, but as an actor, I recognize when there are comedic

In [18]:
# Build a "map_reduce" QA-with-sources chain on an OpenAI LLM at
# temperature 0, asking for the intermediate steps to be returned too.
source_llm = OpenAI(temperature=0)
chain = load_qa_with_sources_chain(
    source_llm,
    chain_type="map_reduce",
    return_intermediate_steps=True,
)

# Run the chain over the documents retrieved earlier (`docs`).
query = "Who is Hisenberg?"
chain_inputs = {"input_documents": docs, "question": query}
answer = chain(chain_inputs, return_only_outputs=False)

In [21]:
# Show the chain's final answer text.
answer["output_text"]

' Heisenberg is an alias used by the character Walter White in the TV show "Breaking Bad".\nSOURCES: WikiData\\Breaking Bad.txt'

In [9]:
# Assemble a RetrievalQA chain of type "stuff": a gpt-3.5-turbo chat model
# at temperature 0, fed by a retriever obtained from the vector store.
chat_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
wiki_retriever = vector_store.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_llm,
    chain_type="stuff",
    retriever=wiki_retriever,
)

In [17]:
# Pose a question to the retrieval QA chain; the cell's last expression
# displays the returned answer.
query = "How did Jesse's girlfriend die?"
qa_chain.run(query)

"Jesse's girlfriend, Jane, died from choking on her own vomit. Walt, who was present at the time, did not intervene to save her."