### Importing bunch of librarires

In [1]:
# LLM and Langchain stuff
import langchain
from langchain.llms import gpt4all

# Document loader and Splitting Text
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Vector DB
import pinecone
from langchain.vectorstores import pinecone

# Embedding
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import tqdm


### Read the document

In [2]:
def readDocument(dirr):
    fileLoader = PyPDFDirectoryLoader(dirr)
    doc = fileLoader.load()
    return doc

In [17]:
doc = readDocument('doc/')
len(doc)

21

In [18]:
doc

[Document(page_content='Node.js Interview Questions\nTo view the live version of the\npage, click here.\n© Copyright by Interviewbit', metadata={'source': 'doc/NODEJS.pdf', 'page': 0}),
 Document(page_content='Beginner Node.js Interview Questions\n1.\xa0\xa0\xa0What is a first class function in Javascript?\n2.\xa0\xa0\xa0What is Node.js and how it works?\n3.\xa0\xa0\xa0How do you manage packages in your node.js project?\n4.\xa0\xa0\xa0How is Node.js better than other frameworks most popularly used?\n5.\xa0\xa0\xa0Explain the steps how “Control Flow” controls the functions calls?\n6.\xa0\xa0\xa0What are some commonly used timing features of Node.js?\n7.\xa0\xa0\xa0What are the advantages of using promises instead of callbacks?\n8.\xa0\xa0\xa0What is fork in node JS?\n9.\xa0\xa0\xa0Why is Node.js single-threaded?\n10.\xa0\xa0\xa0How do you create a simple server in Node.js that returns Hello World?\n11.\xa0\xa0\xa0How many types of API functions are there in Node.js?\n12.\xa0\xa0\xa0What

### Split document into small chunks

In [13]:
def docIntoChunk(document, chunk_size=800, chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(document)
    return doc

In [19]:
documents = docIntoChunk(document=doc)

In [20]:
len(documents)

40

### Embedding

In [21]:
embedding = SentenceTransformer('all-MiniLM-L6-v2')
embedding


.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.72MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 667kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 17.5MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.15MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 405kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 208kB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:25<00:00, 3.57MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 64.4kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 150kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 787kB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 769kB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 10.7MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 533kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 744kB/s]


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [24]:
vectors = embedding.encode("Jai Shree Ganesh")
print(vectors)

[-2.73949578e-02  9.53443125e-02  1.34449974e-02 -3.70028801e-02
 -3.86600452e-03  4.48064022e-02  6.55440763e-02 -8.27802718e-02
  1.79681275e-02  3.87205416e-03 -2.00292356e-02 -9.01242271e-02
  4.70864959e-02 -8.23198408e-02  4.72266935e-02 -6.78705126e-02
 -1.31075624e-02  9.27465130e-03  7.09793791e-02 -1.38876334e-01
 -4.97406833e-02  2.98134014e-02 -1.70937628e-02 -2.83141285e-02
 -5.88037781e-02  1.04672564e-02  1.60903912e-02 -5.13635799e-02
 -5.42719439e-02 -1.26373217e-01  1.57906599e-02  4.64421064e-02
 -8.28607678e-02 -1.00579327e-02 -8.88630599e-02 -1.18072075e-03
 -2.23248322e-02  3.39737423e-02 -4.74631414e-02 -2.29368601e-02
  1.14889378e-02 -3.41931768e-02  2.84699202e-02 -1.38607115e-01
  3.75299975e-02 -8.84197950e-02 -7.02269524e-02 -5.56514524e-02
  4.19602245e-02  2.11821385e-02 -6.11359961e-02  6.14755118e-05
 -1.85231548e-02 -1.85014028e-02  9.40959603e-02 -9.43906829e-02
  3.55852954e-02 -3.74081619e-02  1.82257313e-02  2.50587184e-02
 -2.66539659e-02  5.74460

In [25]:
len(vectors)

384

### Setting up vector DB

In [None]:
base = ''