# Semantic Search



## Installing Dependencies

In [None]:
! pip install cohere dataset pinecone-client langchain datasets openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cohere
  Downloading cohere-4.6.0-py3-none-any.whl (33 kB)
Collecting dataset
  Downloading dataset-1.6.0-py2.py3-none-any.whl (18 kB)
Collecting pinecone-client
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.181-py3-none-any.whl (934 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m934.6/934.6 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

### Importing libraries

In [None]:
#import libraries
import os

import cohere
import pinecone
import torch
from datasets import load_dataset
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import CohereEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Pinecone

## Loading the Cohere Wikipedia Embeddings Dataset from Hugging Face

In [None]:
# Read the Cohere API key from the COHERE_API_KEY environment variable so the
# secret is never stored in the notebook (raises KeyError when it is not set).
co = cohere.Client(os.environ["COHERE_API_KEY"])

In [None]:
from datasets import load_dataset

#downloading at once
def download_data(path = "Cohere/wikipedia-22-12-simple-embeddings", split="train"):
  """Load one split of a Hugging Face dataset in a single call.

  Args:
    path: Dataset identifier handed to `load_dataset`.
    split: Name of the split to load; defaults to "train".

  Returns:
    Whatever `load_dataset` returns for the requested split.
  """
  return load_dataset(str(path), split=split)

In [None]:
# Load the dataset with the default path and check which type came back.
docs = download_data()
type(docs)



datasets.arrow_dataset.Dataset

In [None]:
#samples in arrow data
def show_samples(docs, limit=11):
  """Print the first `limit` records of `docs`.

  Args:
    docs: Any iterable of records.
    limit: Maximum number of records to print; defaults to 11.
      A value of 0 or less prints nothing.
  """
  if limit <= 0:
    return
  for shown, doc in enumerate(docs, start=1):
    print(doc)
    # Stop right after the last wanted record so no extra item is pulled from `docs`.
    if shown >= limit:
      break

In [None]:
# Print the first records to inspect their fields.
show_samples(docs)

{'id': 0, 'title': '24-hour clock', 'text': 'The 24-hour clock is a way of telling the time in which the day runs from midnight to midnight and is divided into 24 hours, numbered from 0 to 23. It does not use a.m. or p.m. This system is also referred to (only in the US and the English speaking parts of Canada) as military time or (only in the United Kingdom and now very rarely) as continental time. In some parts of the world, it is called railway time. Also, the international standard notation of time (ISO 8601) is based on this format.', 'url': 'https://simple.wikipedia.org/wiki?curid=9985', 'wiki_id': 9985, 'views': 2450.62548828125, 'paragraph_id': 0, 'langs': 30, 'emb': [0.07711287587881088, 0.3197174072265625, -0.2051590085029602, 0.6302579045295715, 0.032093219459056854, 0.200703963637352, 0.16665680706501007, -0.31295087933540344, 0.17575109004974365, 0.5308129191398621, -0.37528499960899353, 0.3338659405708313, -0.046272162348032, 0.07841536402702332, -0.3490406274795532, 0.271

## Creation of Pinecone Vector DB

In [None]:
#creation pinecone vector db
def create_pinecone_index(table_name,
                          dimension=768,
                          metric="cosine",
                          pod_type="p1",
                          api_key=None,
                          environment=None):
  """Return a handle to the Pinecone index `table_name`, creating it when absent.

  Args:
    table_name: Name of the Pinecone index.
    dimension: Vector dimension, used only when the index has to be created.
    metric: Similarity metric, used only when the index has to be created.
    pod_type: Pod type, used only when the index has to be created.
    api_key: Pinecone API key; when omitted, the PINECONE_API_KEY
      environment variable is used.
    environment: Pinecone environment; when omitted, the PINECONE_ENVIRONMENT
      environment variable is used, falling back to "us-west4-gcp-free".

  Returns:
    The `pinecone.Index` object for `table_name`.

  Raises:
    ValueError: If no API key is passed and PINECONE_API_KEY is not set.
  """
  api_key = api_key or os.environ.get("PINECONE_API_KEY")
  if not api_key:
    raise ValueError(
        "Pinecone API key missing: pass api_key or set PINECONE_API_KEY")
  environment = environment or os.environ.get("PINECONE_ENVIRONMENT",
                                              "us-west4-gcp-free")

  pinecone.init(api_key=api_key, environment=environment)
  # Create the index only when it does not exist yet; either way the same
  # handle is returned afterwards.
  if table_name not in pinecone.list_indexes():
    pinecone.create_index(table_name,
                          dimension=dimension,
                          metric=metric,
                          pod_type=pod_type)
  return pinecone.Index(table_name)

In [None]:
# Name of the Pinecone index to create or reuse.
PINECONE_TABLE_NAME = "chat-qa-wikipedia"
# Get a handle to that index (create_pinecone_index creates it when it is missing).
index = create_pinecone_index(PINECONE_TABLE_NAME)

In [None]:
# Inspect the index statistics before uploading anything.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 1.0,
 'namespaces': {'': {'vector_count': 207488}},
 'total_vector_count': 207488}

In [None]:
# Number of rows sent per upsert call.
batch_size = 128
# Total number of rows in the dataset; passed to the upload as its upper bound.
vector_space = len(docs)
vector_space

485859

### Upserting Vectors to Pinecone

In [None]:
from pinecone.core.client.rest import ApiException
def upsert_vectors_to_pine(index, docs, vector_space, batch_size, to_start=0):
  """Upsert the precomputed embeddings in `docs` into `index`, batch by batch.

  Args:
    index: Object exposing `upsert(vectors=...)`.
    docs: Sliceable collection; `docs[a:b]` must return a mapping with the
      columns "id", "emb", "title", "text", "url" and "wiki_id".
    vector_space: Exclusive upper bound of the row offsets to upload.
    batch_size: Number of rows sent per upsert call.
    to_start: Row offset to start from, so an interrupted upload can resume.

  The upload stops at the first `ApiException`. The error is printed together
  with the offset of the failing batch and is not re-raised.
  """
  for start in range(to_start, vector_space, batch_size):
    stop = min(start + batch_size, vector_space)
    batch = docs[start:stop]
    ids = [str(doc_id) for doc_id in batch["id"]]
    meta = [
        {"title": title, "text": text, "url": url, "wiki_id": wiki_id}
        for title, text, url, wiki_id in zip(batch["title"],
                                             batch["text"],
                                             batch["url"],
                                             batch["wiki_id"])
    ]
    # Each vector is an (id, embedding, metadata) tuple.
    to_upsert = list(zip(ids, batch["emb"], meta))
    try:
      index.upsert(vectors=to_upsert)
    except ApiException as error:
      # Report the actual API error instead of assuming the index is full;
      # the printed offset can be passed back as `to_start` to resume.
      print(f"upsert stopped at offset {start}: {error}")
      break
      


In [None]:
# Resume after the vectors already stored in the index instead of relying on a
# hardcoded offset (assumes rows were upserted in dataset order from offset 0).
already_indexed = index.describe_index_stats()["total_vector_count"]
upsert_vectors_to_pine(index, docs, vector_space, batch_size, to_start=already_indexed)

embeds done
index limit approached


In [None]:
# Re-check the index statistics after the upsert attempt.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 1.0,
 'namespaces': {'': {'vector_count': 207744}},
 'total_vector_count': 207744}

## Cohere Embeddings Similarity Search

In [None]:
# embed the query, then look up its nearest neighbours
def similarity_search(co, query, neighbors=5, model='multilingual-22-12'):
  """Print the passages in the module-level `index` closest to `query`.

  Args:
    co: Client whose `embed` method produces the query embedding.
    query: Text to search for.
    neighbors: Number of matches requested from the index.
    model: Embedding model name passed to `co.embed`.
  """
  embed_response = co.embed(texts=[query], model=model, truncate='LEFT')
  query_vectors = embed_response.embeddings
  # The index handle comes from the notebook's global namespace.
  matches = index.query(
      query_vectors,
      top_k=neighbors,
      include_metadata=True,
  )['matches']
  for match in matches:
    print(f"{match['score']:.2f}: {match['metadata']['text']}")

In [None]:
# Example query: prints the closest passages, each prefixed with its score.
similarity_search(co, "what caused the 2008 financial crisis?")

0.93: Total financial losses from lost economic activity and stock market declines have been estimated at $15 trillion.
0.91: In September 2008 many large financial firms in the United States collapsed, merged, or went under conservatorship (a person is assigned to manage a company when it cannot manage itself). The factors that led to the crisis were reported in business journals many months before September 2008.
0.91: The financial crisis of 2007–2008 was a major financial crisis, the worst of its kind since the Great Depression in the 1930s. Until the COVID-19 recession, the financial crisis of 2007-2008 is considered the worst financial crisis during the 21st century.
0.91: In the 2008 American economic failure, creditors lent money to debtors who could not pay back that money. This lowered housing prices and hurt the economy.
0.91: The biggest issue during the campaign was the bad economy. Other issues included health care, the Iraq War, the war on terrorism, and energy independe

# OpenAI LLM using Langchain API

## Pinecone Vectorstore

In [None]:
# LangChain wrapper around the Cohere embedding model. The API key is read from
# the COHERE_API_KEY environment variable instead of being hardcoded.
embeddings = CohereEmbeddings(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model="multilingual-22-12",
)

In [None]:
# Metadata key that holds the passage text (the upsert step stores it under "text").
text_field = "text"

# Wrap the Pinecone index as a LangChain vector store; queries are embedded with `embeddings.embed_query`.

vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)

In [None]:
# Ask the vector store for the 3 passages closest to the query.
query = "tell me about anand mohan"
results = vectorstore.similarity_search(query,k=3)

In [None]:
# Print the text of each retrieved passage.
for res in results:
  print(res.page_content)

"Read... in the name of God Who made man from a drop of blood... God is Most Rewarding... He Who taught man to write with pen... and taught man what he knew not."
Premchand lived a life of financial struggle. Once he took a loan of two-and-a-half rupees to buy some clothes. He had to struggle for three years to pay it back.
"I testify that there is no other god but Allah, and I testify that Muhammad is the Messenger of Allah."


## LLM and Retrieval QA

In [None]:

# completion llm
# The OpenAI API key is read from the OPENAI_API_KEY environment variable so the
# secret is never stored in the notebook (raises KeyError when it is not set).
llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name='gpt-3.5-turbo',
    temperature=0.0
)
# retrieval QA: the "stuff" chain answers from documents returned by the vector store retriever
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)


In [None]:
# Ask the retrieval QA chain the same question used for the vector store search.
qa.run("tell me about anand mohan")

"I'm sorry, I don't have enough context to answer your question. Could you please provide more information about who Anand Mohan is?"

## Custom Template similar to Retrieval QA

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Retrieve the passages the vector store considers closest to the question.
articles = vectorstore.similarity_search("what is capital of france")

# Custom prompt: {context} and {question} are the template's two input variables.
prompt_template = """Use the following pieces of context to answer the question
 at the end. If you don't know the answer, just say that you don't know,
  don't try to make up an answer. Note you are an assitant created by Sham.

{context}

Question: {question}"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# "stuff" question-answering chain driven by the custom prompt.
chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)

# Run the chain on the retrieved passages; with return_only_outputs=False the displayed result also lists the input documents.
chain({"input_documents": articles, "question": "what is capital of france"}, return_only_outputs=False)

{'input_documents': [Document(page_content='The capital of France is Paris. In the course of history, the national capital has been in many locations other than Paris.', metadata={'title': 'Capital of France', 'url': 'https://simple.wikipedia.org/wiki?curid=316223', 'wiki_id': 316223.0}),
  Document(page_content='Its capital, Lyon, is at to the southwest of Paris, the national capital, at to the northwest of Marseille and at to the west of Bordeaux.', metadata={'title': 'Auvergne-Rhône-Alpes', 'url': 'https://simple.wikipedia.org/wiki?curid=542023', 'wiki_id': 542023.0}),
  Document(page_content='The 101 French "départements" are now grouped into 13 metropolitan and five overseas régions. Their capitals are called préfectures.', metadata={'title': 'Departments of France', 'url': 'https://simple.wikipedia.org/wiki?curid=28272', 'wiki_id': 28272.0}),
  Document(page_content='Its capital, Toulouse, is at to the southwest of Paris, the national capital, at to the west of Marseille, at to t

In [None]:
# Display the documents retrieved for the question.
articles

[Document(page_content='The capital of France is Paris. In the course of history, the national capital has been in many locations other than Paris.', metadata={'title': 'Capital of France', 'url': 'https://simple.wikipedia.org/wiki?curid=316223', 'wiki_id': 316223.0}),
 Document(page_content='Its capital, Lyon, is at to the southwest of Paris, the national capital, at to the northwest of Marseille and at to the west of Bordeaux.', metadata={'title': 'Auvergne-Rhône-Alpes', 'url': 'https://simple.wikipedia.org/wiki?curid=542023', 'wiki_id': 542023.0}),
 Document(page_content='The 101 French "départements" are now grouped into 13 metropolitan and five overseas régions. Their capitals are called préfectures.', metadata={'title': 'Departments of France', 'url': 'https://simple.wikipedia.org/wiki?curid=28272', 'wiki_id': 28272.0}),
 Document(page_content='Its capital, Toulouse, is at to the southwest of Paris, the national capital, at to the west of Marseille, at to the southeast of Bordeau

## Output Parser

In [None]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

In [None]:
# Fields the model's reply must contain: the answer and the source URL.
response_schemas = [
    ResponseSchema(name="answer", description="answer to the user's question"),
    ResponseSchema(name="url", description="source used to answer the user's question, should be a website.")
]
# Parser built from those schemas; a later cell uses it to parse the model's reply.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [None]:
# Ask the parser for the formatting instructions the model has to follow. The
# prompt injects them through the `format_instructions` partial variable, which
# was previously never defined and raised a NameError on a fresh kernel.
format_instructions = output_parser.get_format_instructions()
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("answer the users question as best as possible.\n{format_instructions}\n{question}")
    ],
    input_variables=["question"],
    partial_variables={"format_instructions": format_instructions}
)

In [None]:
# Fill the prompt with the question and send the resulting messages to the chat model.
_input = prompt.format_prompt(question="what's the capital of france?")
output = llm(_input.to_messages())

In [None]:
# Parse the model's reply into the fields declared in response_schemas.
output_parser.parse(output.content)

{'answer': 'Paris', 'url': 'https://en.wikipedia.org/wiki/Paris'}