# LangChain - MultiQuery Retriever

In [1]:
from pathlib import Path

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [2]:
# Set up a loader that searches Wikipedia for the query 'MKUltra'.
loader = WikipediaLoader(query='MKUltra')

In [3]:
# Run the loader; the result is a list of Document objects whose text lives in
# `page_content` (see the repr displayed a few cells down).
documents = loader.load()



  lis = BeautifulSoup(html).find_all('li')


In [4]:
# Inspect the loaded documents. The full repr is long; `documents[0]` is a
# shorter way to preview a single entry.
documents

[Document(page_content='Project MKUltra (or MK-Ultra) was an illegal human experimentation program designed and undertaken by the U.S. Central Intelligence Agency (CIA) and intended to develop procedures and identify drugs that could be used during interrogations to weaken people and force confessions through brainwashing and psychological torture. It began in 1953 and was halted in 1973. MKUltra used numerous methods to manipulate its subjects\' mental states and brain functions, such as the covert administration of high doses of psychoactive drugs (especially LSD) and other chemicals without the subjects\' consent, electroshocks, hypnosis, sensory deprivation, isolation, verbal and sexual abuse, and other forms of torture.MKUltra was preceded by Project ARTICHOKE. It was organized through the CIA\'s Office of Scientific Intelligence and coordinated with the United States Army Biological Warfare Laboratories. The program engaged in illegal activities, including the use of U.S. and Can

In [5]:
# Number of documents returned by the loader.
len(documents)

9

## Document Transformer

In [6]:
# Build a character splitter whose length function counts tiktoken tokens, with
# a target chunk size of 500.
# NOTE(review): separator and chunk overlap are left at the library defaults —
# confirm those defaults are what this demo wants.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
# Split every loaded document into chunks; `docs` is what gets embedded later.
docs = text_splitter.split_documents(documents)

In [7]:
# Number of chunks produced by the splitter.
len(docs)

18

## Creating Embedding Function

In [8]:
# Read the OpenAI API key from a text file two directories above this notebook,
# so the secret never appears in the notebook itself.
# The `with` block closes the file handle as soon as the key is read, and
# strip() removes surrounding whitespace (typically a trailing newline added by
# an editor) that would otherwise be sent as part of the credential.
with open('../../api_key.txt') as key_file:
    api_key = key_file.read().strip()

In [9]:
# OpenAI embedding wrapper, authenticated with the key read in the previous
# cell; it is passed to Chroma as the embedding function for the chunks.
embedding_functions = OpenAIEmbeddings(openai_api_key=api_key)

In [10]:
# Build the vector store once and reuse it on later runs.
#
# Chroma.from_documents() adds the given chunks to whatever collection already
# lives in `persist_directory`, so re-running this cell (or Restart & Run All)
# would embed and store the same chunks again, leaving duplicate entries that
# crowd out distinct results at query time. Reopening the persisted store
# instead keeps the cell idempotent and avoids paying for the embeddings twice.
#
# To force a rebuild (e.g. after changing the splitter settings), delete the
# ./mkultra directory and re-run this cell.
persist_directory = "./mkultra"

if Path(persist_directory).is_dir():
    # A previous run already persisted the collection: just reopen it.
    db = Chroma(persist_directory=persist_directory,
                embedding_function=embedding_functions)
else:
    # First run: embed every chunk and write the collection to disk.
    db = Chroma.from_documents(docs, embedding=embedding_functions,
                               persist_directory=persist_directory)
    db.persist()

## Multiquery Retriever

In [11]:
# The user question handed to the multi-query retriever.
# NOTE(review): "this" has no explicit subject, and the generated queries logged
# further down are equally generic — consider naming the topic in the question.
question = "When was this declassified?"

In [12]:
# Chat model handed to MultiQueryRetriever.from_llm below. temperature=0 asks
# for the least random sampling, which keeps the rewritten queries stable
# between runs.
llm = ChatOpenAI(openai_api_key=api_key, temperature=0)

In [13]:
# Wrap the Chroma store's retriever in a MultiQueryRetriever driven by `llm`.
# The log output of the retrieval cell shows the alternative queries that the
# LLM generates for a question.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=db.as_retriever(),
)

In [14]:
import logging

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so the queries it generates are printed during retrieval.
logging.basicConfig()
multi_query_logger = logging.getLogger('langchain.retrievers.multi_query')
multi_query_logger.setLevel(logging.INFO)

In [15]:
# Run the multi-query retrieval for `question`. The INFO line logged below
# lists the queries the LLM generated.
# NOTE(review): presumably the hits for all generated queries are merged and
# de-duplicated, as the variable name suggests — confirm against the library.
unique_documents = retriever_from_llm.get_relevant_documents(query=question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the date of declassification for this information?', '2. Can you provide the declassification date for this?', '3. At what time was this information officially declassified?']


In [16]:
# Number of documents returned by the multi-query retriever.
len(unique_documents)

2

In [17]:
# Show the text of the first returned document.
print(unique_documents[0].page_content)

== Background ==
In 1974, a New York Times article was published that accused the CIA of illegal operations committed against US citizens. Authored by Seymour M. Hersh, it documented an intelligence operation against the anti-war movement, as well as "break-ins, wiretapping and the surreptitious inspection of mail" conducted since the 1950s. According to former CIA Official Cord Meyer, these disclosures "Convinced large sections of the American public that the CIA had become a domestic Gestapo and stimulated an overwhelming demand for the wide-ranging congressional investigations that were to follow."Hersh had been tipped off to the possibility of an "in house operation" by an unidentified member of the CIA in spring of 1974. He embarked on an investigation, speaking to sources that included CIA Chief of Counterintelligence James Angleton. Although he was not aware of its existence, Hersh uncovered much information that had been documented in the "Family Jewels", a report ordered by Di