In [1]:
# Import Modules
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

from transformers import AutoTokenizer, pipeline, AutoModelForQuestionAnswering

# Options
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Specify the dataset (from Huggingface)
dataset = "databricks/databricks-dolly-15k"
target_column = 'context'

# Create a loader for dataset
loader = HuggingFaceDatasetLoader(dataset, target_column)

# Load the data
data = loader.load()
data[:5]

[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""'),
 Document(metadata={'instruction': 'Why can camels survive for long without water?', 'resp

In [3]:
# Create an instance of text-splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Transform raw data into split documents
docs = text_splitter.split_documents(data)
docs[:3]

[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""'),
 Document(metadata={'instruction': 'Why can camels survive for long without water?', 'resp

In [4]:
# Choose text embedding model
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"

# Model configuration
model_kwargs = {'device': 'cuda'}
# Encoding option
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model, # pre-trained
  model_kwargs=model_kwargs,
  encode_kwargs=encode_kwargs
)

In [5]:
# Create a DB for searching purpose (To retrieve embedding vectors efficiently)
db = FAISS.from_documents(docs, embeddings)

In [6]:
# Specify model name
model_name = "Intel/dynamic_tinybert"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
  model_name,
  padding=True,
  truncation=True,
  max_length=512
)

# Model pipeline for text generation (Question-Answering)
pipeline = pipeline(
  "question-answering",
  model=model_name,
  tokenizer=tokenizer,
  return_tensors="pt",
  device=0
)

# Create an instance of the HuggingFacePipelin (Wrapper - Arguments)
llm_kwargs = {"temperature": 0.7, "max_length": 512}
llm = HuggingFacePipeline(
  pipeline=pipeline,
  model_kwargs=llm_kwargs
)

In [7]:
# Create an instance of retriever
retriever = db.as_retriever(search_kwargs={"k": 5})

# Create a question_answering instance using the RetrievalQA class
retQA = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

In [8]:
question = "Who is Thomas Jefferson?"
result = retQA.run({"query": question})
print(result["result"])

ValueError: Context information is below. 
------------
"Thomas Jefferson (April 13, 1743 \u2013 July 4, 1826) was an American statesman, diplomat, lawyer, architect, philosopher, and Founding Father who served as the third president of the United States from 1801 to 1809. Among the Committee of Five charged by the Second Continental Congress with authoring the Declaration of Independence, Jefferson was the Declaration's primary author. Following the American Revolutionary War and prior to becoming the nation's third president in 1801, Jefferson was the first United States secretary of state under George Washington and then the nation's second vice president under John Adams."
------------
Given the context information and not prior knowledge, answer the question: Who is Thomas Jefferson?
 argument needs to be of type (SquadExample, dict)