In [1]:
%pip install "pinecone-client[grpc]" mwparserfromhell


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

# Pull the first 10,000 articles of the Simple English Wikipedia dump
# (config '20220301.simple'); the trailing expression shows the dataset summary.
data = load_dataset(
    path='wikipedia',
    name='20220301.simple',
    split='train[:10000]',
)
data

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset wikipedia (/Users/jeevagayathri/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [3]:
# Inspect one record to see the fields each article carries
# ('id', 'url', 'title', 'text') before chunking and indexing.
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [4]:
import tiktoken

# Resolve the encoding used by gpt-3.5-turbo once and keep both the encoder
# object and its registered name available to later cells.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
encoding_name = tokenizer.name

def tiktoken_len(text):
    """Return the number of tokens ``text`` encodes to.

    Uses the module-level ``tokenizer``. An empty ``disallowed_special`` is
    passed to ``encode`` so that no special-token strings are treated as
    disallowed input.

    Args:
        text: The string to measure.

    Returns:
        The length of the token sequence produced by ``tokenizer.encode``.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Separators are listed from coarsest (paragraph break) to finest (empty string).
    separators=['\n\n', '\n', ' ', ''],
    # Lengths are measured with tiktoken_len, so the two sizes below are in
    # tokens rather than characters.
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [6]:
# Try the splitter on the sample article inspected earlier and report how many
# chunks it produced.
sample_article = data[6]
chunks = text_splitter.split_text(sample_article['text'])
len(chunks)

4

In [7]:
# Token count of every chunk. Iterating over `chunks` (instead of indexing
# chunks[0]..chunks[3]) keeps this cell correct when the split yields fewer or
# more than four chunks: no IndexError and no silently ignored chunks.
tuple(tiktoken_len(chunk) for chunk in chunks)

(299, 323, 382, 157)

In [8]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client shared by the rest of the notebook: embed_documents() is used
# for the chunks that get indexed and embed_query is handed to the vector store.
# NOTE(review): presumably reads the OpenAI API key from the environment — confirm.
embed = OpenAIEmbeddings(model='text-embedding-ada-002')

In [9]:
# Two throwaway strings, embedded only to check the client works and to learn
# the vector dimensionality. `embeddings` is reused later to size the index.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

embeddings = embed.embed_documents(texts)
# (number of vectors returned, length of each vector)
len(embeddings), len(embeddings[0])

(2, 1536)

In [10]:
# Name of the Pinecone index that is created (if missing), filled, and queried
# by the cells below.
index_name = 'langchain-retrieval-augmentation'

In [11]:
import os
import pinecone

# Credentials are read from the environment so they never appear in the notebook;
# a missing variable raises KeyError here rather than failing later.
pinecone.init(
    environment=os.environ['PINECONE_ENVIRONMENT'],
    api_key=os.environ['PINECONE_API_KEY'],
)

# Create the index only on the first run. Its dimension is taken from the sample
# embeddings computed above so it always matches the embedding model's output.
index_already_exists = index_name in pinecone.list_indexes()
if not index_already_exists:
    pinecone.create_index(
        name=index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
    )

In [12]:
# Open a gRPC handle to the index; this is the handle the upsert loop writes to.
index = pinecone.GRPCIndex(index_name)
# Show the current vector count so it can be compared with the count after indexing.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 15797}},
 'total_vector_count': 15797}

In [13]:
import uuid
from tqdm.auto import tqdm

# Flush to Pinecone once at least this many chunks are buffered. Articles are
# never split across flushes, so a single flush may carry more than batch_size.
batch_size = 100


def upsert_buffer(ids, texts, metadatas):
    """Embed ``texts`` and upsert them into ``index`` with their ids and metadata.

    Args:
        ids: One vector id per chunk, aligned with ``texts``.
        texts: Chunk strings, embedded with ``embed.embed_documents``.
        metadatas: One metadata dict per chunk, stored alongside its vector.

    Returns:
        None. Does nothing when ``texts`` is empty, so it can be called
        unconditionally for the final remainder.
    """
    if not texts:
        return
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))


ids = []
texts = []
metadatas = []

for record in tqdm(data):
    # Metadata shared by every chunk of this article.
    metadata = {
        'wiki-id': record['id'],
        'source': record['url'],
        'title': record['title'],
    }

    for j, chunk in enumerate(text_splitter.split_text(record['text'])):
        # Deterministic id (article id + chunk position) instead of a random
        # uuid4: re-running or resuming this cell then overwrites the same
        # vectors rather than adding duplicates to the index.
        # NOTE: vectors written by earlier runs under random ids are not
        # replaced by this; clear the index once if such duplicates exist.
        ids.append(f"{record['id']}-{j}")
        texts.append(chunk)
        # The chunk text is stored under 'text' so it can be read back at query time.
        metadatas.append({'chunk': j, 'text': chunk, **metadata})

    if len(texts) >= batch_size:
        upsert_buffer(ids, texts, metadatas)
        ids, texts, metadatas = [], [], []

# Flush whatever is left over after the last full batch.
upsert_buffer(ids, texts, metadatas)

 24%|██▍       | 2389/10000 [03:00<09:35, 13.23it/s]


KeyboardInterrupt: 

In [14]:
# Re-check the index stats after the upsert loop to confirm vectors were added.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 23056}},
 'total_vector_count': 23056}

In [16]:
from langchain.vectorstores import Pinecone

# Rebinds `index` from the gRPC handle used for upserting to a standard
# pinecone.Index handle for the LangChain wrapper.
# NOTE(review): presumably the wrapper expects the non-gRPC client — confirm.
index = pinecone.Index(index_name)
# 'text' is the metadata key under which each chunk's text was stored at upsert
# time; embed.embed_query is the function used to embed incoming queries.
vectorstore = Pinecone(index, embed.embed_query, 'text')

In [18]:
# Question reused by the retrieval and QA cells that follow.
query = 'What is English?'

# Retrieve the three chunks most similar to the query.
vectorstore.similarity_search(query, k=3)

[Document(page_content='English is a language that started in Anglo-Saxon England. It is originally from Anglo-Frisian and Old Saxon dialects. English is now used as a global language. There are about 375 million native speakers (people who use it as their first language) in the world.\n\nFrisian is the language closest to English. The vocabulary of English was influenced by other Germanic languages in the early Middle Ages and later by Romance languages, especially French. \n\nEnglish is the only official language or one of the official languages of nearly 60 countries. It is also the main language of more countries in the world than any other. It is the primary language in the United Kingdom, Ireland, the United States, Canada, Australia, and New Zealand. It is one of the official languages in Singapore, India, Hong Kong, and South Africa. It is widely spoken in parts of the Caribbean, Africa, and South Asia.\n\nIn 2005, it was estimated that there were over 2 billion speakers of Eng

In [20]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# temperature=0 is set on the chat model that writes the answers.
llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo')

# Question answering over the chunks retrieved from the vector store.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
)
chain.run(query)

'English is a language that originated in Anglo-Saxon England and is now used as a global language. It is the primary language in the United Kingdom, Ireland, the United States, Canada, Australia, and New Zealand, and is one of the official languages in Singapore, India, Hong Kong, and South Africa. English is the largest language by number of speakers, with about 375 million native speakers and over 2 billion speakers worldwide. It is also the most widely-spoken Germanic language and an official language of the United Nations, European Union, and many other international organizations.'

In [25]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same retrieval setup, but the result also reports which source URLs the answer
# was drawn from. Note that this rebinds `chain` from the previous cell.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
)
chain(query)

{'question': 'What is English?',
 'answer': 'English is a language that started in Anglo-Saxon England and is now used as a global language. It is the primary language in the United Kingdom, Ireland, the United States, Canada, Australia, and New Zealand, and is one of the official languages in Singapore, India, Hong Kong, and South Africa. English is the largest language by number of speakers, with over 2 billion speakers worldwide. English grammar has become simpler and less Germanic over time, and the loss of case in grammar is a classic example of this. English literature has many famous stories and plays, including those by William Shakespeare. The history of the British Empire has added to the spread of English, and many people learn English as an additional language for science, business, and diplomacy. \n',
 'sources': 'https://simple.wikipedia.org/wiki/English%20language'}