In [33]:
import openai
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone as pinecone
from langchain.llms import OpenAI
import os
from pathlib import Path
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv,find_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain_openai import OpenAI,ChatOpenAI
import re
import json
from langchain.schema import HumanMessage
from langchain.prompts import (HumanMessagePromptTemplate,
                               PromptTemplate,ChatPromptTemplate)
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

<font color="green'>

In [6]:
load_dotenv(dotenv_path=find_dotenv())

True

### Load Documents

In [3]:
def loadDocs(directory):
    loader=PyPDFDirectoryLoader(path=directory)
    documents=loader.load()
    return documents

In [4]:
directoryPath=Path("Docs")
documents=loadDocs(directory=directoryPath)

In [5]:
documents

[Document(metadata={'source': 'Docs\\Doc 1.pdf', 'page': 0}, page_content="India, officially known as the Republic of India, is a diverse and vibrant country located in South\nAsia. With a rich history spanning thousands of years, India is known for its cultural heritage, \nreligious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene\nbackwaters of Kerala in the south, India encompasses a wide range of geographical features, \nincluding deserts, plains, mountains, and coastlines, making it a land of incredible natural \nbeauty.\nIndia is the seventh-largest country by land area and the second-most populous country in the \nworld, with a population exceeding 1.3 billion people. It is a federal parliamentary democratic \nrepublic, with a president as the head of state and a prime minister as the head of government. \nThe country follows a multi-tiered administrative structure, with 28 states and 9 union territories,\neach having its own elected governm

In [6]:
len(documents)

3

### Transform Documents

In [7]:
def splitDocs(documents, chunkSize=1000,chunkOverlap=20):
    textSplitter=RecursiveCharacterTextSplitter(
        chunk_size=chunkSize,
        chunk_overlap=chunkOverlap
        )
    docs=textSplitter.split_documents(documents=documents)
    return docs

In [8]:
docs=splitDocs(documents=documents)

In [9]:
docs[:3]

[Document(metadata={'source': 'Docs\\Doc 1.pdf', 'page': 0}, page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South\nAsia. With a rich history spanning thousands of years, India is known for its cultural heritage, \nreligious diversity, and vast landscapes. From the majestic Himalayas in the north to the serene\nbackwaters of Kerala in the south, India encompasses a wide range of geographical features, \nincluding deserts, plains, mountains, and coastlines, making it a land of incredible natural \nbeauty.\nIndia is the seventh-largest country by land area and the second-most populous country in the \nworld, with a population exceeding 1.3 billion people. It is a federal parliamentary democratic \nrepublic, with a president as the head of state and a prime minister as the head of government. \nThe country follows a multi-tiered administrative structure, with 28 states and 9 union territories,\neach having its own elected governm

In [10]:
len(docs)

7

### Generate Text Embeddings

In [11]:
embeddings=SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2"
)

  warn_deprecated(


In [12]:
queryResult=embeddings.embed_query(text="Hello Buddy")

In [13]:
len(queryResult)

384

In [14]:
queryResult[3:10]

[0.033901240676641464,
 0.024947505444288254,
 -0.0967373475432396,
 0.05952315405011177,
 0.058978162705898285,
 -0.01789671741425991,
 -0.023178840056061745]

In [15]:
pc=Pinecone()

In [16]:
indexName="mcq-create"
if indexName not in pc.list_indexes().names():
    print(f"Creating Index {indexName}")
    pc.create_index(
        name=indexName,
        dimension=len(queryResult),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index Created: {indexName}")
else:
    print(f"Index Name: {indexName} already Exists")

Creating Index mcq-create
Index Created: mcq-create


In [17]:
index=pc.Index(name=indexName)

In [18]:
index

<pinecone.data.index.Index at 0x23c981bf3d0>

In [19]:
vectorStore=pinecone.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=indexName
)

In [20]:
vectorStore

<langchain_community.vectorstores.pinecone.Pinecone at 0x23cf0a372d0>

### Retrieve Answers

In [21]:
def get_similar_docs(vectorStore,query,k=2):
    similarDocs=vectorStore.similarity_search(
        query=query,
        k=3
    )
    return similarDocs

In [22]:
llm=OpenAI(model="gpt-3.5-turbo-instruct",temperature=0.2)


In [34]:
chain=load_qa_chain(llm=llm,chain_type="stuff")

In [35]:
def getAnswer(query,chain,vectorStore):
    relevantDocs=get_similar_docs(vectorStore=vectorStore,query=query)
    print(relevantDocs)
    response=chain.run(input_documents=relevantDocs,question=query)
    return response

In [36]:
query="How is India's Economy?"

In [37]:
answer=getAnswer(query=query,chain=chain,vectorStore=vectorStore)

[Document(metadata={'page': 0.0, 'source': 'Docs\\Doc 2.pdf'}, page_content='However, India also faces various socio-economic challenges. Poverty, income inequality, and \nunemployment are persistent issues that the country strives to address. Efforts are being made\nto improve education, healthcare, infrastructure, and social welfare programs to uplift \nmarginalized sections of society.\nEducation plays a vital role in India, with a strong emphasis on academic excellence. The \ncountry has a vast network of schools, colleges, and universities, producing a large number of \ngraduates every year. Indian professionals have made significant contributions in various fields \nglobally, particularly in science, technology, engineering, and mathematics (STEM).\nThe Indian film industry, popularly known as Bollywood, is a global phenomenon, producing the\nlargest number of films annually. Indian cinema reflects the diversity and cultural richness of \nthe country and has a massive following b

In [38]:
print(answer)

 India's economy is one of the fastest-growing in the world, transitioning from an agrarian economy to a service-oriented and industrialized economy. It is known for its software and information technology services, pharmaceuticals, textiles, agriculture, and manufacturing sectors. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation.


### Structure the Output

In [39]:
responseSchemas=[
    ResponseSchema(
        name="question",
        description="Question generated from provided input text data"
        ),
    ResponseSchema(
        name="choices",
        description="Available options for a multiple-choice question in comma separated"
    ),
    ResponseSchema(
        name="answer",
        description="Correct answer for the asked question"
    )
]

In [40]:
outputParser=StructuredOutputParser.from_response_schemas(
    response_schemas=responseSchemas
)

In [41]:
print(outputParser)

response_schemas=[ResponseSchema(name='question', description='Question generated from provided input text data', type='string'), ResponseSchema(name='choices', description='Available options for a multiple-choice question in comma separated', type='string'), ResponseSchema(name='answer', description='Correct answer for the asked question', type='string')]


In [42]:
formatInstructions=outputParser.get_format_instructions()
print(formatInstructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"question": string  // Question generated from provided input text data
	"choices": string  // Available options for a multiple-choice question in comma separated
	"answer": string  // Correct answer for the asked question
}
```


In [43]:
chatModel=ChatOpenAI(name="gpt-3.5-turbo")

In [44]:
chatModel

ChatOpenAI(name='gpt-3.5-turbo', client=<openai.resources.chat.completions.Completions object at 0x0000023C9C58EE50>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000023C9C4319D0>, openai_api_key=SecretStr('**********'), openai_proxy='')

In [45]:
prompt=ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(
            template=""" 
            When a text input is given by the user, please generate multiple choice questions from
            it along with the correct answer
            \n{user_prompt}\n{format_instructions}
            """)
            ],
    input_variables=["user_prompt"],
    partial_variables={"format_instructions":formatInstructions}
)

In [46]:
finalQuery=prompt.format_prompt(user_prompt=answer)
print(finalQuery)

messages=[HumanMessage(content=' \n            When a text input is given by the user, please generate multiple choice questions from\n            it along with the correct answer\n            \n India\'s economy is one of the fastest-growing in the world, transitioning from an agrarian economy to a service-oriented and industrialized economy. It is known for its software and information technology services, pharmaceuticals, textiles, agriculture, and manufacturing sectors. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation.\nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data\n\t"choices": string  // Available options for a multiple-choice question in comma separated\n\t"answer": string  // Correct answer for the asked question\

In [47]:
finalQuery.to_messages()

[HumanMessage(content=' \n            When a text input is given by the user, please generate multiple choice questions from\n            it along with the correct answer\n            \n India\'s economy is one of the fastest-growing in the world, transitioning from an agrarian economy to a service-oriented and industrialized economy. It is known for its software and information technology services, pharmaceuticals, textiles, agriculture, and manufacturing sectors. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation.\nThe output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"question": string  // Question generated from provided input text data\n\t"choices": string  // Available options for a multiple-choice question in comma separated\n\t"answer": string  // Correct answer for the asked question\n}\n```\n

In [51]:
finalQueryOutput=chatModel.invoke(input=finalQuery.to_messages())

In [49]:
print(finalQueryOutput.content)

```json
{
	"question": "Which sector is India known for in its economy?",
	"choices": "A) Software and information technology services, B) Pharmaceuticals, C) Textiles, D) All of the above",
	"answer": "D) All of the above"
}
```
```json
{
	"question": "Which cities in India are hubs of business and commerce?",
	"choices": "A) Mumbai, B) Delhi, C) Bangalore, D) Chennai",
	"answer": "D) Chennai"
}
```


In [89]:
answer

" India's economy is one of the fastest-growing in the world, transitioning from an agrarian economy to a service-oriented and industrialized economy. It is known for its software and information technology services, pharmaceuticals, textiles, agriculture, and manufacturing sectors. Major cities like Mumbai, Delhi, Bangalore, and Chennai are hubs of business and commerce, attracting investments and fostering innovation."

In [58]:
print(re.search(pattern=r"{(.+)}",string=finalQueryOutput.content,flags=re.DOTALL).group(0))

{
	"question": "Which sector is India known for in its economy?",
	"choices": "A) Software and information technology services, B) Textiles, C) Tourism, D) Mining",
	"answer": "A) Software and information technology services"
}
{
	"question": "Which cities in India are hubs of business and commerce?",
	"choices": "A) Mumbai, B) Delhi, C) Bangalore, D) Chennai",
	"answer": "A) Mumbai, B) Delhi, C) Bangalore, D) Chennai"
}
{
	"question": "What is India transitioning from in terms of its economy?",
	"choices": "A) Agrarian economy, B) Industrialized economy, C) Service-oriented economy, D) All of the above",
	"answer": "D) All of the above"
}
