In [4]:
!pip install langchain==0.1.7
# !pip install transformers
# !pip install datasets
# !pip install sentence-transformers
# !pip install faiss-gpu



In [5]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [6]:
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"

loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

data = loader.load()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
data[2]

Document(page_content='""', metadata={'instruction': 'Why can camels survive for long without water?', 'response': 'Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.', 'category': 'open_qa'})

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

In [9]:
docs[0]

Document(page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."', metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'})

In [10]:
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
embeddings = HuggingFaceEmbeddings(model_name=modelPath)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
text = "try this sentense"
emb = embeddings.embed_query(text)
emb

[0.010690158233046532,
 -0.03374125808477402,
 -0.05700172483921051,
 0.0009301999234594405,
 0.037310078740119934,
 -0.02721451036632061,
 -0.08482573926448822,
 0.018253110349178314,
 -0.03181888535618782,
 0.03769144415855408,
 0.0848909467458725,
 -0.051049258559942245,
 -0.02934180572628975,
 0.01872490905225277,
 -0.02110871486365795,
 0.053049709647893906,
 -0.016056811437010765,
 0.11556853353977203,
 -0.001227346365340054,
 -0.011854412965476513,
 0.051640912890434265,
 -0.08269526064395905,
 0.008467809297144413,
 -0.016407152637839317,
 -0.00800519809126854,
 -0.010744532570242882,
 -0.07152892649173737,
 0.06488744914531708,
 -0.024311620742082596,
 -0.04651271924376488,
 0.0009897721465677023,
 0.03265467658638954,
 0.01956331916153431,
 0.0001751721720211208,
 0.0715949684381485,
 0.026427550241351128,
 0.002743661403656006,
 0.05769873410463333,
 -0.06158985197544098,
 -0.061555176973342896,
 -0.09887015074491501,
 0.004315509926527739,
 0.027963194996118546,
 -0.0031929

In [12]:
len(emb)

384

Vector Stores

In [13]:
db = FAISS.from_documents(docs, embeddings)

In [14]:
question = "what is cheesemaking?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\n\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\n\nCulturing\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, added from a culture,


LLM Model

In [15]:
# using Intel/dynamic_tinybert which is a fine-tuned model for the purpose of question-answering.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

tokenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [16]:
model_name = "Intel/dynamic_tinybert"

tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

question_answer = pipeline("question-answering",
                           model = model_name,
                           tokenizer = tokenizer,
                           return_tensor='pt')

llm = HuggingFacePipeline(pipeline = question_answer,
                          model_kwargs = {"temperature":0.7, "max_length":512})

Retrievers

In [17]:
retriever = db.as_retriever()

In [18]:
docs = retriever.get_relevant_documents("What is Cheesemaking")
print(docs[0].page_content)

"The goal of cheese making is to control the spoiling of milk into cheese. The milk is traditionally from a cow, goat, sheep or buffalo, although, in theory, cheese could be made from the milk of any mammal. Cow's milk is most commonly used worldwide. The cheesemaker's goal is a consistent product with specific characteristics (appearance, aroma, taste, texture). The process used to make a Camembert will be similar to, but not quite the same as, that used to make Cheddar.\n\nSome cheeses may be deliberately left to ferment from naturally airborne spores and bacteria; this approach generally leads to a less consistent product but one that is valuable in a niche market.\n\nCulturing\nCheese is made by bringing milk (possibly pasteurised) in the cheese vat to a temperature required to promote the growth of the bacteria that feed on lactose and thus ferment the lactose into lactic acid. These bacteria in the milk may be wild, as is the case with unpasteurised milk, added from a culture,


Retrieval QA Chain

In [19]:
retriever = db.as_retriever(search_kwargs={"k":4})

qa = RetrievalQA.from_chain_type(llm=llm, chain_type='refine', retriever = retriever, return_source_documents=False)

In [22]:
question = "What is Cheesemaking"
result = qa.run({"query":question})
print(result['result'])
