In [116]:
!pip install -qU datasets sentence-transformers mwparserfromhell langchain==0.0.354 openai==1.6.1 pinecone-client==3.0.0 tiktoken==0.5.2

In [119]:
from google.colab import userdata
import time
import os
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
#from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)
from langchain_community.chat_models.huggingface import ChatHuggingFace
from transformers import AutoTokenizer

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = userdata.get('LANGSMITH')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')
os.environ["TAVILY_API_KEY"] = userdata.get('TAVILY_KEY')

# Load Model

In [120]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
    )

llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
        'include_prompt_in_result' : False,
        "return_full_text": False
    },
)

chat_model = ChatHuggingFace(llm=llm)

                    repo_id was transferred to model_kwargs.
                    Please confirm that repo_id is what you intended.
                    task was transferred to model_kwargs.
                    Please confirm that task is what you intended.
                    huggingfacehub_api_token was transferred to model_kwargs.
                    Please confirm that huggingfacehub_api_token is what you intended.


## Building the Knowledge Base

In [121]:
from datasets import load_dataset
data = load_dataset("wikipedia", "20220301.simple", split='train[:1000]')
data[6]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [122]:
# create the length function
def tokenized_len(text):
    return len(tokenizer.encode(text))

# to get encoding of other models. not needed
# import tiktoken
# tiktoken.encoding_for_model('gpt-3.5-turbo')
# tokenizer = tiktoken.get_encoding('cl100k_base')

tokenized_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

30

In [123]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tokenized_len,
    separators=["\n\n", "\n", " ", ""]
)

chunks = text_splitter.split_text(data[6]['text'])
tokenized_len(chunks[0]), tokenized_len(chunks[1]), tokenized_len(chunks[2]) # at most 400

(364, 363, 373)

# Vector Database

In [124]:
from pinecone import Pinecone
from pinecone import ServerlessSpec, PodSpec

api_key = os.getenv("PINECONE_API_KEY") or userdata.get('PINECONE_API_KEY')
pc = Pinecone(api_key=api_key)
spec = PodSpec(environment="gcp-starter")

In [125]:
import time

index_name = 'langchain-retrieval-augmentation'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=768,  # dimensionality of stentence tranformer
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.03524,
 'namespaces': {'': {'vector_count': 3524}},
 'total_vector_count': 3524}

We should see that the new Pinecone index has a `total_vector_count` of `0`, as we haven't added any vectors yet.

## Indexing

We can perform the indexing task using the LangChain vector store object. But for now it is much faster to do it via the Pinecone python client directly. We will do this in batches of `100` or more.

In [111]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100
texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }

    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])

    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embeddings.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embeddings.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [52]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.03172,
 'namespaces': {'': {'vector_count': 3172}},
 'total_vector_count': 3172}

## Creating a Vector Store and Querying

Now that we've build our index we can switch back over to LangChain. We start by initializing a vector store using the same index we just built. We do that like so:

In [127]:
# initialize a langchain vectorstore based off the Pinecone index

from langchain.vectorstores import Pinecone

text_field = "text"
vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)

In [142]:
query = " the “Turing law” grants an automatic pardon to men who died before the law came into force, making it possible for living convicted gay men to seek pardons for offences now no longer on the statute book.\n\nAlas"
query = 'Returned to Cambridge from sabbatical'
query = 'he returned to Cambridge'

vectorstore.similarity_search(
    query,  # our search query
    k=2  # return 1 most relevant docs
)

[Document(page_content="Sources \n Richard B. Todd, ed. (2004). Dictionary of British Classicists, 1500–1960, Bristol: Thoemmes Continuum, 2004 .\n Kelly Boyd, ed. (1999). Encyclopedia of Historians and Historical Writing. London [etc.] : Fitzroy Dearborn \n Lateiner, D. (1989). The historical method of Herodotus. Phoenix, 23. Toronto: University of Toronto Press.\n John Cannon et al., eds. (1988). The Blackwell Dictionary of Historians. Oxford: Blackwell Publishers, 1988 .\n Hartog, F. (1988). The mirror of Herodotus: the representation of the other in the writing of history. Berkeley: University of California Press.\n Erik Christiansen (1970). The Last Hundred Years of the Roman Republic, Odense: Andelsbogtrykkeriet\n Gottschalk, L. R. (1950). Understanding history; a primer of historical method. New York: Knopf\n Barnes, M. S. (1896). Studies in historical method. Heath's pedagogical library. Boston: D.C. Heath & Co.", metadata={'chunk': 1.0, 'source': 'https://simple.wikipedia.org/

# Generative Question-Answering

In [143]:
messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(content="Name a few dictators mentioned?"),
]

# chat model understands the expected prompt template. Interesting because I previously thought it's dumb.
chat_model._to_chat_prompt(messages)

"<|system|>\nYou're a helpful assistant</s>\n<|user|>\nName a few dictators mentioned?</s>\n<|assistant|>\n"

In [144]:
chat_model.invoke(messages)

AIMessage(content="<|system|>\nYou're a helpful assistant</s>\n<|user|>\nName a few dictators mentioned?</s>\n<|assistant|>\nI do not have the context of which text or conversation you are referring to. Please provide more information so I can accurately answer your question.\n\nthat being said, some famous dictators in history include adolf hitler (germany), benito mussolini (italy), joseph stalin (soviet union), franco (spain), pol pot (cambodia), idi amin (uganda), saddam hussein (iraq), and muammar gaddafi (libya).")

In [145]:
# from langchain.cache import InMemoryCache
# from langchain.globals import set_llm_cache

# set_llm_cache(InMemoryCache())

In [146]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa_chat = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

query = 'What are some of the dictators mentioned?'
print(qa.run(query))

 Some of the dictators mentioned include Juan Perón (Argentina), Jorge Rafael Videla (Argentina), Leopoldo Galtieri (Argentina), Augusto Pinochet (Chile), Pol Pot (Cambodia), Ferdinand Marcos (Philippines), Corazon Aquino (Philippines), Fidel Castro (Cuba), and Nicolae Ceaușescu (Romania).


In [147]:
print(qa_chat.run(query)) # run() doesn't return list and just generated text (even if the llm was initalized to return the prompt within response.)

Some of the dictators mentioned are:

1. Jorge Rafael Videla (Argentina) - dictator from 1976 to 1981
2. Leopoldo Galtieri (Argentina) - dictator from 1981 to 1982
3. Augusto Pinochet (Chile) - Chilean dictator from 1973 to 1990
4. Francisco Franco (Spain) - dictator until 1975
5. Muammar al-Gaddafi (Libya) - dictator
6. Saddam Hussein (Iraq) - dictator until 2003
7. Nicolae Ceaușescu (Romania) - dictator until 1989
8. Ayatollah Khomeini (Iran) - leader of Iran

Note: Some of the leaders mentioned, such as Juan Perón and Juan Domingo Perón, were democratically elected presidents of Argentina, but they were also considered authoritarian figures due to their policies and actions.


## Retrieval with Sources

1. The "sources" output relies on text splitting the model output into (answers, sources) pair and isn't very robust. Just rely on source_documents instead.
2. This is true even if we specify return_full_text = False
3. Sometimes the string "source" is missing from model output (because model isn't that good.)
4. Sometimes, model doesn't seem to be responding to the right question and returning the example in template. Maybe a splitting issue, or model context window issue?
5. Sometimes the model output is too big that is stops before actually completing the response (tunable).

In [148]:
from langchain.chains import RetrievalQAWithSourcesChain

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

qa_chat_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

query = 'What are some of the dictators mentioned?'
print(qa_with_sources.invoke(query))
print('====================================================')
print(qa_chat_with_sources.invoke(query))

{'question': 'What are some of the dictators mentioned?', 'answer': ' The dictators mentioned include Juan Perón, Jorge Rafael Videla, and Leopoldo Galtieri from Argentina; Salvador Allende and Augusto Pinochet from Chile; Emiliano Zápata and Pancho Villa from Mexico; José María Velasco Ibarra, León Febres Cordero, and Sixto Durán Ballén from Ecuador; Mao Zedong and Deng Xiaoping from China; Pol Pot from Cambodia; Netaji Subhas Chandra Bose, Mahatma Gandhi, Indira Gandhi, Muhammad Ali Jinnah, Sheikh Mujibur Rahman, Mahathir Mohamad, Jawaharlal Nehru, Deshbandhu Chittaranjan Das, Emperor Hirohito, Ho Chi Minh, Sun Yat-sen, Chiang Kai-shek, Achmad Sukarno, Lee Kuan Yew, King Faisal I, King Ghazi, King Faisal II, W.T. Cosgrave, Éamon de Valera, Taoiseach Éamon de Valera, King Victor Emmanuel III, Prime Minister Benito Mussolini, Emperor Hirohito, Emir Ahmad Al-Jaber Al-Sabah, Prime Minister António de Oliveira Salazar, Sultan Mohammed V, Prime Minister Michael Joseph Savage, President Pau

In [154]:
query = 'What is monarchy? '
print(qa_with_sources(query))
print(qa_chat_with_sources(query))

{'question': 'What is monarchy? ', 'answer': ' A monarchy is a government ruled by a king or queen who inherits their position, while an aristocracy is a government run by the people of a ruling class.\n', 'sources': '', 'source_documents': [Document(page_content='A monarch is the ruler of a monarchy. Monarchs usually get their power by inheritance – from one of their parents. When a ruler dies their child, or nearest relative, takes over. A male monarch is usually called a king or emperor. A female monarch is usually called a queen or empress.\n\nMonarchs were very common in history until the 20th century. Most European countries had monarchs in past centuries, but no longer have them. Some countries that still have monarchs have other leaders that actually have the power. Most monarchies are hereditary monarchies. Some countries elect their monarch instead of using inheritance, like the Holy Roman Empire and Malaysia. Some republics, such as the Dutch Republic or North Korea, had or 

Not able to debug why the AI model is returning the content from the prompt. Some issue with the context window (but a longer text "wahat is monarchy and what is monarchy" works even better).

The issue is not even consistent with "chat" models so maybe can't associate it with a possible templating issue with chat model's template.

In [150]:
query = 'In which year, did Turing return to Cambridge?'
print(qa_with_sources(query))
print(qa_chat_with_sources(query))

{'question': 'In which year, did Turing return to Cambridge?', 'answer': ' Alan Turing returned to Cambridge in 1947 for a sabbatical year.\n', 'sources': 'https://simple.wikipedia.org/wiki/Alan%20Turing', 'source_documents': [Document(page_content='Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward W

In [None]:
# complete contextualizing/adding chat histroy tutorial
# add that here
# add more functioanlity
# deploy
# move towards other use-cases

In [156]:
# pc.delete_index(index_name)

---