### Importing bunch of librarires

In [47]:
# LLM and Langchain stuff
import langchain
from langchain.llms import GPT4All

# Document loader and Splitting Text
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Vector DB
import pinecone
import weaviate
from langchain.vectorstores import Pinecone, Weaviate

# Embedding
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

from langchain.chains import RetrievalQA

### Read the document

In [2]:
def readDocument(dirr):
    fileLoader = PyPDFDirectoryLoader(dirr)
    doc = fileLoader.load()
    return doc

In [3]:
doc = readDocument('doc/')
len(doc)

58

In [20]:
print(doc[0].page_content)

GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023


### Split document into small chunks

In [24]:
def docIntoChunk(document, chunk_size=800, chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(document)
    return doc

In [25]:
documents = docIntoChunk(document=doc)

In [26]:
len(documents)

141

### Embedding

In [8]:
embeddings = SentenceTransformer('all-MiniLM-L6-v2')
embeddings


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [17]:
vectors = embeddings.encode("Jai Shree Ganesh")
len(vectors)

384

### Setting up vector DB

In [10]:
from dotenv import load_dotenv
import os

load_dotenv()

WEVIATE_KEY = os.getenv('WEVIATE_KEY')
PINECONE_KEY = os.getenv('PINECONE_KEY')

### Weaviate desnot support gpt4all for ARM chip 

In [11]:

# auth_config = weaviate.AuthApiKey(api_key=WEVIATE_KEY)

# client = weaviate.Client(
#   url="https://notgpt-l76a3avb.weaviate.network",
#   auth_client_secret=auth_config
# )

In [12]:
# client.schema.delete_all()
# client.schema.get()
# schema = {
#   "classes": [
#     {
#       "class": "Chatbot",
#       "description": "Documents for chatbot",
#       "vectorizer": "text2vec-gpt4all",
#       "moduleConfig": {
#         "text2vec-gpt4all": {
#           "vectorizeClassName": False
#         }
#       },
#       "properties": [
#         {
#           "name": "content",
#           "dataType": ["text"],
#           "description": "Content that will be vectorized",
#           "moduleConfig": {
#             "text2vec-gpt4all": {
#               "skip": False,
#               "vectorizePropertyName": False
#             }
#           }
#         }
#       ]
#     }
#   ]
# }

# client.schema.create(schema)

# vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

# text_meta_pair = [(doc.page_content, doc.metadata) for doc in documents]
# texts, meta = list(zip(*text_meta_pair))
# vectorstore.add_texts(texts, meta)

### Pinecone vector DB

In [30]:
pinecone.init(
    api_key=PINECONE_KEY,
    environment="gcp-starter"
)
index_name="notagpt"

In [50]:
db=Pinecone.from_documents(documents, embeddings, index_name=index_name)

### Setting up LLM model (got4all)

In [44]:
PATH = "/Users/rohan/Library/Application_Support/nomic.ai/GPT4All/"

In [45]:
llm = GPT4All(model=PATH, verbose=True )

ValidationError: 1 validation error for GPT4All
__root__
  Invalid model directory: /Users/rohan/Library/Application_Support/nomic.ai/GPT4All/ (type=value_error)

In [51]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    verbose=False,
)

### Ask Qus

In [None]:
query = qa(f"""
    How much the agriculture target will be increased by how many crore?
""")
print(query["result"])