In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [3]:
extracted_data = load_pdf("../data/")

In [4]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [5]:
text_chunks = text_split(extracted_data)

In [6]:
print(len(text_chunks))

4359


In [7]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [8]:
embeddings = download_hugging_face_embeddings()



In [26]:
embeddings.embed_documents

<bound method HuggingFaceEmbeddings.embed_documents of HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})>

In [11]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result), type(query_result))



Length 384 <class 'list'>
[-0.03447727486491203, 0.03102312609553337, 0.006734980270266533, 0.026108933612704277, -0.03936205804347992, -0.16030246019363403, 0.06692394614219666, -0.006441438104957342, -0.047450482845306396, 0.014758863486349583, 0.07087534666061401, 0.05552757531404495, 0.019193356856703758, -0.02625126577913761, -0.01010954286903143, -0.026940442621707916, 0.022307462990283966, -0.02222665585577488, -0.14969263970851898, -0.017493024468421936, 0.007676282897591591, 0.054352231323719025, 0.0032544038258492947, 0.03172588348388672, -0.08462139964103699, -0.029405992478132248, 0.051595550030469894, 0.048124078661203384, -0.003314835485070944, -0.05827915295958519, 0.04196925833821297, 0.022210702300071716, 0.1281888633966446, -0.022338951006531715, -0.011656239628791809, 0.06292837113142014, -0.03287634998559952, -0.09122604131698608, -0.03117534890770912, 0.052699536085128784, 0.04703483358025551, -0.08420310169458389, -0.030056182295084, -0.020744839683175087, 0.00951

In [33]:
from langchain.vectorstores import Pinecone as langchain_pinecone
from pinecone import Pinecone

pinecone_api_key = ""

pc = Pinecone(api_key= pinecone_api_key)
index = pc.Index("chatbot")
vectorstore = langchain_pinecone(index, embeddings.embed_query, text_key="content")

In [None]:
vectorstore.add_documents(t for t in text_chunks)

In [44]:
index.describe_index_stats()
# result = vectorstore.similarity_search_with_score(query="What is BERT?", k=4)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4685}},
 'total_vector_count': 4685}

In [55]:
query = "What is BeRT?"
similarity_result = index.query(vector=embeddings.embed_query(query), top_k=4)

{'matches': [{'id': 'ed1837af-859b-42a2-8154-9a3deb913a53',
              'score': 0.582077086,
              'values': []},
             {'id': 'd1bb1eb8-726c-4412-ab53-334a868786a2',
              'score': 0.562841952,
              'values': []},
             {'id': '7156364c-5e21-4f81-a7f2-beed6b321f54',
              'score': 0.528902709,
              'values': []},
             {'id': '2721f573-d2b5-4ac7-8e70-6a9e1c2823de',
              'score': 0.514712095,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

In [68]:
# query_answer = result['vectors'][id1]['metadata']['content']

In [69]:
# print(query_answer)

• It is replaced with the unique vocabulary token [MASK] .
• It is replaced with another token from the vocabulary, randomly sampled
based on token unigram probabilities.
• It is left unchanged.
In BERT, 15% of the input tokens in a training sequence are sampled for learning.
Of these, 80% are replaced with [MASK] , 10% are replaced with randomly selected
tokens, and the remaining 10% are left unchanged.
The MLM training objective is to predict the original inputs for each of the
