In [1]:
import openai
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone #this below has been replaced by the below import
from langchain_community.vectorstores import Pinecone #Importing Pinecone class, specifically using the alias PineconeStore for convenience.
#from langchain.llms import OpenAI  #this below has been replaced by the below import
from langchain_openai import OpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

  from tqdm.autonotebook import tqdm


In [15]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["HUG_API_KEY"] = os.getenv("HUG_API_KEY")
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")

In [4]:
# Function to read documents

def load_docs(directory):
  loader = PyPDFDirectoryLoader(directory)
  documents = loader.load()
  return documents

In [5]:
# Passing the directory to the 'load_docs' function

directory = 'D:/Lab Setup/DataConnections/Docs'
documents = load_docs(directory)
len(documents)

3

In [6]:
documents

[Document(metadata={'source': 'D:\\Lab Setup\\DataConnections\\Docs\\Doc 1.pdf', 'page': 0}, page_content="India, officially known as the Republic of India, is a diverse and vibrant country located in South\nAsia. With a rich history spanning thousands of years, India is known for its cultural heritage, \nreligious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene\nbackwaters of Kerala in the south, India encompasses a wide range of geographical features, \nincluding deserts, plains, mountains, and coastlines, making it a land of incredible natural \nbeauty.\nIndia is the seventh-largest country by land area and the second-most populous country in the \nworld, with a population exceeding 1.3 billion people. It is a federal parliamentary democratic \nrepublic, with a president as the head of state and a prime minister as the head of government. \nThe country follows a multi-tiered administrative structure, with 28 states and 9 union territories,\neac

In [7]:
#This function will split the documents into chunks

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

In [8]:
docs = split_docs(documents)
print(len(docs))

7


# Generate Text Embeddings

In [9]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [10]:
query_result = embeddings.embed_query("Hello Buddy")
len(query_result)

384

In [11]:
query_result

[-0.069788359105587,
 0.05420626699924469,
 0.07814787328243256,
 0.033901240676641464,
 0.024947505444288254,
 -0.0967373475432396,
 0.05952315405011177,
 0.058978162705898285,
 -0.01789671741425991,
 -0.023178840056061745,
 -0.019000211730599403,
 0.0005969092599116266,
 0.024666039273142815,
 -0.07030832022428513,
 -0.007522563450038433,
 0.010224507190287113,
 -0.011180819943547249,
 -0.02124859392642975,
 -0.0385945588350296,
 0.026550382375717163,
 -0.0650523379445076,
 0.06500021368265152,
 0.009431798942387104,
 -0.06271222978830338,
 -0.023625466972589493,
 -0.030638093128800392,
 0.05996112897992134,
 0.07367486506700516,
 -0.032867807894945145,
 -0.026061033830046654,
 -0.006967142224311829,
 0.03061792254447937,
 0.05939663201570511,
 0.0014719826867803931,
 0.012021631933748722,
 0.028293736279010773,
 -0.059225261211395264,
 -0.07919756323099136,
 0.04896372929215431,
 0.02309011109173298,
 0.055362798273563385,
 -0.02625139430165291,
 -0.01732112467288971,
 0.00551110040

# Pinecone

**Pinecone allows for data to be uploaded into a vector database and true semantic search can be performed.**
**Not only is conversational data highly unstructured, but it can also be complex. Vector search and vector databases allows for similarity searches.**

**We will initialize Pinecone and create a Pinecone index by passing our documents, embeddings model and mentioning the specific INDEX which has to be used**
**Vector databases are designed to handle the unique structure of vector embeddings, which are dense vectors of numbers that represent text. They are used in machine learning to capture the meaning of words and map their semantic meaning.**

**These databases index vectors for easy search and retrieval by comparing values and finding those that are most similar to one another, making them ideal for natural language processing and AI-driven applications.**

In [16]:
# Due to recent changes from Pinecone team, there are some minor changes we have to implement, as a part of this we Initialize the Pinecone client
# Please update your pinecone-client package version >=3.0.1

from pinecone import Pinecone as PineconeClient # Importing the Pinecone class from the pinecone package
from langchain_community.vectorstores import Pinecone


# Set your Pinecone API key
# Recent changes by langchain team, expects ""PINECONE_API_KEY" environment variable for Pinecone usage! So we are creating it here
# we are setting the environment variable "PINECONE_API_KEY" to the value and in the next step retrieving it :)

# Initialize the Pinecone client
PineconeClient(api_key = os.getenv("PINECONE_API_KEY"), environment="gcp-starter")
index_name="dataconnections"
index = Pinecone.from_documents(docs, embeddings, index_name = index_name)

# Retrieve Answers

In [17]:
# This function will help us in fetching the top relevent documents from our vector store - Pinecone

def get_similiar_docs(query, k=2):
    similar_docs = index.similarity_search(query, k=k)
    return similar_docs

**'load_qa_chain' Loads a chain that you can use to do QA over a set of documents.**

**And we will be using Huggingface for the reasoning purpose**

In [18]:
from langchain.chains.question_answering import load_qa_chain

#from langchain.llms import HuggingFaceHub
#The above have been updated recently, so going forward we have to use the below :)

from langchain.llms import HuggingFaceEndpoint

**BigScience Large Open-science Open-access Multilingual Language Model (BLOOM) is a transformer-based large language model.**

**It was created by over 1000 AI researchers to provide a free large language model for everyone who wants to try. Trained on around 366 billion tokens over March through July 2022, it is considered an alternative to OpenAI's GPT-3 with its 176 billion parameters.**

In [None]:
# The earlier mentioned 'HuggingFaceHub' class has been depreciated, so please use the below class'HuggingFaceEndpoint' 
# and the below mentioned model outperforms most of the available open source LLMs

# llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2") # Model link : https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
# llm

In [19]:
llm = OpenAI()

**Different Types Of Chain_Type:**

**"map_reduce": It divides the texts into batches, processes each batch separately with the question, and combines the answers to provide the final answer.**

**"refine": It divides the texts into batches and refines the answer by sequentially processing each batch with the previous answer.**

**"map-rerank": It divides the texts into batches, evaluates the quality of each answer from LLM, and selects the highest-scoring answers from the batches to generate the final answer. These alternatives help handle token limitations and improve the effectiveness of the question-answering process.**

In [20]:
chain = load_qa_chain(llm, chain_type="stuff")

stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm, chain_type="stuff")


In [21]:
# This function will help us get the answer to the question that we raise

def get_answer(query):
  relevant_docs = get_similiar_docs(query)
  print(relevant_docs)
  response = chain.run(input_documents=relevant_docs, question=query)
  return response

**Passing our question to the above created function**

In [22]:
our_query = "How is India's economy?"
answer = get_answer(our_query)
print(answer)

[Document(metadata={'page': 0.0, 'source': 'D:\\Lab Setup\\DataConnections\\Docs\\Doc 2.pdf'}, page_content='However, India also faces various socio-economic challenges. Poverty, income inequality, and \nunemployment are persistent issues that the country strives to address. Efforts are being made\nto improve education, healthcare, infrastructure, and social welfare programs to uplift \nmarginalized sections of society.\nEducation plays a vital role in India, with a strong emphasis on academic excellence. The \ncountry has a vast network of schools, colleges, and universities, producing a large number of \ngraduates every year. Indian professionals have made significant contributions in various fields \nglobally, particularly in science, technology, engineering, and mathematics (STEM).\nThe Indian film industry, popularly known as Bollywood, is a global phenomenon, producing the\nlargest number of films annually. Indian cinema reflects the diversity and cultural richness of \nthe count

  response = chain.run(input_documents=relevant_docs, question=query)


 It is facing various socio-economic challenges, such as poverty, income inequality, and unemployment. However, efforts are being made to improve education, healthcare, infrastructure, and social welfare programs.


# Structuring the Output

In [23]:
import re
import json

In [24]:
# from langchain.chat_models import ChatOpenAI # this has been replaced by the below import
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [26]:
response_schemas = [
    ResponseSchema(name="question", description="Question generated from provided input text data."),
    ResponseSchema(name="choices", description="Available options for a multiple-choice question in comma separated."),
    ResponseSchema(name="answer", description="Correct answer for the asked question.")
]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
output_parser

StructuredOutputParser(response_schemas=[ResponseSchema(name='question', description='Question generated from provided input text data.', type='string'), ResponseSchema(name='choices', description='Available options for a multiple-choice question in comma separated.', type='string'), ResponseSchema(name='answer', description='Correct answer for the asked question.', type='string')])

**This helps us fetch the instructions the langchain creates to fetch the response in desired format**

In [27]:
format_instructions = output_parser.get_format_instructions()
 
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"question": string  // Question generated from provided input text data.
	"choices": string  // Available options for a multiple-choice question in comma separated.
	"answer": string  // Correct answer for the asked question.
}
```


In [28]:
# create ChatGPT object

chat_model = ChatOpenAI()

In [29]:
chat_model

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000001EC71456310>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x000001EC713CB130>, root_client=<openai.OpenAI object at 0x000001EC713B1580>, root_async_client=<openai.AsyncOpenAI object at 0x000001EC71456370>, openai_api_key=SecretStr('**********'), openai_proxy='')

**The below snippet will give out a string that contains instructions for how the response should be formatted, and we then insert that into our prompt.**

In [30]:
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template("""When a text input is given by the user, please generate multiple choice questions 
        from it along with the correct answer. 
        \n{format_instructions}\n{user_prompt}""")  
    ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions": format_instructions}
)

In [31]:
final_query = prompt.format_prompt(user_prompt = answer)
print(final_query)

messages=[HumanMessage(content='When a text input is given by the user, please generate multiple choice questions \n        from it along with the correct answer. \n        \nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple-choice question in comma separated.\n\t"answer": string  // Correct answer for the asked question.\n}\n```\n It is facing various socio-economic challenges, such as poverty, income inequality, and unemployment. However, efforts are being made to improve education, healthcare, infrastructure, and social welfare programs.')]


In [32]:
final_query.to_messages()

[HumanMessage(content='When a text input is given by the user, please generate multiple choice questions \n        from it along with the correct answer. \n        \nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data.\n\t"choices": string  // Available options for a multiple-choice question in comma separated.\n\t"answer": string  // Correct answer for the asked question.\n}\n```\n It is facing various socio-economic challenges, such as poverty, income inequality, and unemployment. However, efforts are being made to improve education, healthcare, infrastructure, and social welfare programs.')]

In [33]:
final_query_output = chat_model.invoke(final_query.to_messages())
print(final_query_output.content)

```json
{
	"question": "What are some of the socio-economic challenges faced by the country?",
	"choices": "A. Poverty, B. Income inequality, C. Unemployment, D. All of the above",
	"answer": "D. All of the above"
}
```


**While working with scenarios like above where we have to process multi-line strings(separated by newline characters – ‘\n’). In such situations, we use re.DOTALL.**

In [34]:
# Let's extract JSON data from Markdown text that we have

markdown_text = final_query_output.content
json_string = re.search(r'{(.*?)}', markdown_text, re.DOTALL).group(1)

In [35]:
print(json_string)


	"question": "What are some of the socio-economic challenges faced by the country?",
	"choices": "A. Poverty, B. Income inequality, C. Unemployment, D. All of the above",
	"answer": "D. All of the above"

