## Load the PDF data from the target directory using PyPDFDirectoryLoader

In [122]:
import pinecone 
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
def read_doc(directory):
    """Load the contents of ``directory`` through ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns for
        that folder.
    """
    return PyPDFDirectoryLoader(directory).load()

# Load everything under ./data; the trailing expression displays how many
# documents the loader returned.
doc=read_doc('./data')
len(doc)

376

## Convert the documents into data chunks using RecursiveCharacterTextSplitter

In [123]:
def data_chunks(docs, chunk_size=500, chunk_overlap=50):
    """Split ``docs`` into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (for example the result of ``read_doc``).
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(docs)

chunked_docs=data_chunks(doc)
# Iterate with a dedicated name: looping with `doc` would rebind the global
# `doc` (the loaded document list) to the last chunk, so re-running the line
# above would no longer receive the full list of documents.
for chunk in chunked_docs:
    print(chunk)

page_content='Python Basics: A Practical Introduction\nto Python 3\nReal Python' metadata={'source': 'data\\python-basics-sample-chapters.pdf', 'page': 1}
page_content='Python Basics: A Practical Introduction to Python 3\nRevised and Updated 4th Edition\nDavid Amos, Dan Bader, Joanna Jablonski, Fletcher Heisler\nCopyright © Real Python (realpython.com ), 2012–2020\nFor online information and ordering of this and other books by Real\nPython, please visit realpython.com . For more information, please\ncontact us at info@realpython.com.\nISBN: 9781775093329 (paperback)\nISBN: 9781775093336 (electronic)\nCover design by Aldren Santos' metadata={'source': 'data\\python-basics-sample-chapters.pdf', 'page': 2}
page_content='Cover design by Aldren Santos\nAdditional editing and proofreading by Jacob Schmitt\n“Python” and the Python logos are trademarks or registered trad e-\nmarks of the Python Software Foundation, used by Real Python with\npermission from the Foundation.\nThankyoufordownloadi

In [124]:
# Display how many chunks data_chunks produced.
len(chunked_docs)

1369

## Create the embeddings with LangChain's SentenceTransformerEmbeddings, using the "all-MiniLM-L6-v2" model

In [125]:
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding model that is passed to Pinecone.from_documents further down.
# NOTE(review): the Pinecone index dimension presumably has to match this
# model's output size — confirm against the "langchain" index settings.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


## Use Pinecone to store the vectors

In [127]:
from langchain_pinecone import Pinecone

import os 
# Fail early with a clear message. Assigning os.getenv()'s result back into
# os.environ is a no-op when the variable is set and raises an opaque
# TypeError (environment values must be str, not None) when it is not.
# NOTE(review): presumably langchain_pinecone reads the key from this
# environment variable — confirm against its documentation.
if not os.getenv('PINECONE_API_KEY'):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before creating the Pinecone index."
    )
index_name = "langchain"
# Build the vector store from the chunks using `embeddings`, targeting the
# index named above.
vector_database_index = Pinecone.from_documents(
    index_name=index_name,
    documents=chunked_docs,
    embedding=embeddings,
)

## Retrieve the most similar documents, optionally with their similarity scores, from the vector store

In [131]:
def get_similar_docs(query, k=1, score=False):
    """Query ``vector_database_index`` for the entries most similar to ``query``.

    Args:
        query: Text to search for.
        k: Forwarded to the search call as ``k``.
        score: When true, call ``similarity_search_with_score``; otherwise
            call ``similarity_search``.

    Returns:
        The result of the selected search call, unchanged.
    """
    # Choose the bound search method first so the call itself is written once.
    if score:
        search = vector_database_index.similarity_search_with_score
    else:
        search = vector_database_index.similarity_search
    return search(query=query, k=k)

# score=True routes the query through similarity_search_with_score, so the
# displayed result pairs each document with its score.
similar_docs=get_similar_docs('what is manipulate string with docs',k=1,score=True)
similar_docs

[(Document(page_content='4.3. Manipulate Strings With Methods\n4.3 ManipulateStringsWithMethods\nStringscomebundledwithspecialfunctionscalled stringmethods\nthat you can use to work with and manipulate strings. There are n u-\nmerousstringmethodsavailable,butwe’llfocusonsomeofthe most\ncommonly used ones.\nIn this section, you’ll learn how to:\n• Convert a string to uppercase or lowercase\n• Remove whitespace from a string\n• Determine if a string begins or ends with certain characters\nLet’s go!\nConvertingStringCase', metadata={'page': 79.0, 'source': 'data\\python-basics-sample-chapters.pdf'}),
  0.556713283)]