# Install dependencies
# These are shell commands: run them in a terminal, or prefix each one with
# `!` (or use `%pip install ...`) when executing them from a notebook cell.
# NOTE(review): the cells below also import `pyembeddings`, and LangChain's
# PyPDFLoader / SentenceTransformersTokenTextSplitter presumably need `pypdf`
# and `sentence-transformers` installed as well — confirm and add if missing.
pip install langchain
pip install tiktoken
pip install transformers

In [31]:
# Location of the PDF handed to PyPDFLoader in the next cell; the same value
# is also stored as the `url` entry of every page's metadata.
pdfUrl = "gpt-4.pdf"

In [32]:
""" Read the pdf file """

from langchain.document_loaders import PyPDFLoader

# Load the PDF at `pdfUrl` and split it into documents; each element of
# `pages` becomes one {"content", "meta_data"} record in `data`.
loader = PyPDFLoader(pdfUrl)
data, all_content = [], []
pages = loader.load_and_split()
if len(pages) == 0:
    raise ValueError("No data found")

for page in pages:
    content = page.page_content
    # content = clean_string(content)
    meta_data = page.metadata
    # Tag the page's own metadata dict (updated in place) with the source path.
    meta_data["url"] = pdfUrl
    data.append({"content": content, "meta_data": meta_data})
    # Raw text is collected separately; only the disabled doc_id line below
    # would consume it.
    all_content.append(content)
# doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()

print(data)

"""
data = [
  {
    content: string;
    meta_data: {
      source: string;
      page: number;
      url: string;
    }
  }
]
"""




'\ndata = [\n  {\n    content: string;\n    meta_data: {\n      source: string;\n      page: number;\n      url: string;\n    }\n  }\n]\n'

In [33]:
""" Tokenize the data """

from transformers import AutoTokenizer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# Token-based splitter: 256 tokens per chunk, no overlap between chunks.
text_splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    tokens_per_chunk=256,
    chunk_overlap=0,
)

# Flatten the pages into one list of chunk records. Every chunk of a page
# references that page's metadata dict (the dict is shared, not copied).
chunked_data = []
for page in data:
    chunks = text_splitter.split_text(page["content"])
    chunked_data.extend(
        {"content": chunk, "meta_data": page["meta_data"]} for chunk in chunks
    )

print(chunked_data)



In [34]:
""" Embed all chunks """

import os

import pyembeddings

# SECURITY NOTE(review): the literal below looks like a credential committed
# to the source. It is kept only as a fallback so existing runs keep working;
# supply the value through the PYEMBEDDINGS_API_KEY environment variable
# instead, then rotate the committed value and delete the fallback.
pyembeddings.init(
    os.environ.get("PYEMBEDDINGS_API_KEY", 'f257bbe3-bbc3-4885-90a1-3bd48e6ec591')
)

gen = pyembeddings.Generator()
gen.set_model("MiniLM")

# Uncomment to embed only the first few chunks while experimenting.
# chunked_data = chunked_data[:10]

# One embed call per chunk; element [0] of each result is kept as the vector.
embeddings = [gen.embed(chunk["content"])[0] for chunk in chunked_data]

# Sanity check: there should be exactly one vector per chunk.
print(len(embeddings))
print(len(chunked_data))

print(embeddings[0])

db = pyembeddings.Database()
collection = db.create_collection('pdf_collection', 'MiniLM')

# Parallel lists, aligned by position with `embeddings`; a chunk's id is its
# index in `chunked_data`, rendered as a string.
documents = [chunk["content"] for chunk in chunked_data]
metadatas = [chunk["meta_data"] for chunk in chunked_data]
ids = [str(index) for index, _ in enumerate(chunked_data)]

print('embeddings', embeddings)

addAll = collection.add(
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
    ids=ids,
)

print(addAll)


342
342
[-0.057403162121772766, -0.04463536664843559, -0.006716681178659201, -0.007699054665863514, 0.044755760580301285, -0.05124007537961006, -0.016915036365389824, 0.07832682132720947, 0.05007052794098854, -0.04052967578172684, -0.05890699848532677, -0.020623521879315376, 0.01638336293399334, 0.10389180481433868, -0.016955144703388214, -0.06533598154783249, 0.004229317419230938, 0.020080504938960075, -0.1000128760933876, -0.05718523636460304, -0.02374526485800743, 0.04605412110686302, 0.06637968122959137, -0.023126501590013504, 0.02326352894306183, 0.04151042178273201, -0.00249940506182611, -0.10326822102069855, -0.014342015609145164, -0.04879685118794441, 0.019547652453184128, -0.03638960048556328, 0.04739947244524956, 0.09669756144285202, -0.11693989485502243, 0.08863189816474915, -0.0512777678668499, -0.028218254446983337, 0.01511848159134388, -0.06708843261003494, -0.005175513215363026, -0.031087461858987808, -0.007732637692242861, 0.014337843284010887, 0.13786906003952026, -0.0

In [36]:
# Remove the 'pdf_collection' collection created by the embedding cell.
# NOTE(review): this cell is numbered In [36], i.e. it was executed after the
# query cell In [35] that appears below it; presumably the collection has to
# be created again before it can be queried — confirm.
db.delete_collection('pdf_collection')

In [35]:
# Embed the question with the generator `gen` that embedded the chunks, then
# ask the collection for five results.
query_string = "What is the performance of GPT-4"
query_embedding = gen.embed(query_string)[0]

query = collection.query(n_results=5, embedding=query_embedding)

print(query)

{'ids': [['207', '335', '32', '26', '231']], 'distances': [[0.29842185974121094, 0.34719158165904773, 0.34865903854370117, 0.37366604804992676, 0.3806065320968628]], 'embeddings': None, 'metadatas': [[{'page': 57, 'source': 'gpt-4.pdf', 'url': 'gpt-4.pdf'}, {'page': 97, 'source': 'gpt-4.pdf', 'url': 'gpt-4.pdf'}, {'page': 10, 'source': 'gpt-4.pdf', 'url': 'gpt-4.pdf'}, {'page': 7, 'source': 'gpt-4.pdf', 'url': 'gpt-4.pdf'}, {'page': 64, 'source': 'gpt-4.pdf', 'url': 'gpt-4.pdf'}]], 'documents': [['. [ 91 ] from conversations with our launch partners, we understand that gpt - 4 makes it easier and more straightforward', 'figure 11 : results on if evaluations across gpt3. 5, gpt3. 5 - turbo, gpt - 4 - launch 98', '7. performance of gpt - 4 on truthfulqa. accuracy is shown on the y - axis, higher is better. we compare gpt - 4 under zero - shot prompting, few - shot prompting, and after rlhf ﬁne - tuning. gpt - 4 signiﬁcantly outperforms both gpt - 3. 5 and anthropic - lm from bai et al. [

In [37]:
""" Put it all together """

from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import pyembeddings

def embedPdf(pdfUrl):
    """Load a PDF, chunk it, embed the chunks and store them in a collection.

    The PDF is loaded with PyPDFLoader, each page's text is split into
    256-token chunks, every chunk is embedded with the "MiniLM" model, and
    the chunks, vectors, metadata and ids are added to a newly created
    'pdf_collection' collection.

    Args:
        pdfUrl: Location of the PDF; passed to PyPDFLoader and stored as the
            ``url`` entry of every page's metadata.

    Returns:
        A ``(gen, collection)`` tuple: the embedding generator and the
        collection that was filled, so the caller can embed a query with the
        same generator and search that collection.

    Raises:
        ValueError: If the loader yields no pages, or the pages yield no
            text chunks.
    """
    import os

    pages = PyPDFLoader(pdfUrl).load_and_split()
    if not pages:
        raise ValueError("No data found")

    data = []
    for page in pages:
        content = page.page_content
        # content = clean_string(content)
        meta_data = page.metadata
        # Tag the page's own metadata dict (updated in place) with its source.
        meta_data["url"] = pdfUrl
        data.append({"content": content, "meta_data": meta_data})

    text_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=0,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        tokens_per_chunk=256,
    )

    # Every chunk of a page references that page's metadata dict.
    chunked_data = [
        {"content": chunk, "meta_data": page["meta_data"]}
        for page in data
        for chunk in text_splitter.split_text(page["content"])
    ]
    if not chunked_data:
        # Without this guard the `embeddings[0]` print below would fail with
        # an uninformative IndexError.
        raise ValueError("No text chunks produced")

    # SECURITY NOTE(review): the literal below looks like a credential
    # committed to the source. It is kept only as a fallback so existing runs
    # keep working; supply the value through the PYEMBEDDINGS_API_KEY
    # environment variable instead, then rotate the committed value and
    # delete the fallback.
    pyembeddings.init(
        os.environ.get(
            "PYEMBEDDINGS_API_KEY", 'f257bbe3-bbc3-4885-90a1-3bd48e6ec591'
        )
    )

    gen = pyembeddings.Generator()
    gen.set_model("MiniLM")

    # Uncomment to embed only the first few chunks while experimenting.
    # chunked_data = chunked_data[:10]

    # One embed call per chunk; element [0] of each result is kept.
    embeddings = [gen.embed(chunk["content"])[0] for chunk in chunked_data]

    # Sanity check: there should be exactly one vector per chunk.
    print(len(embeddings))
    print(len(chunked_data))

    print(embeddings[0])

    db = pyembeddings.Database()
    collection = db.create_collection('pdf_collection', 'MiniLM')

    # Parallel lists, aligned by position with `embeddings`; a chunk's id is
    # its index in `chunked_data`, rendered as a string.
    documents = [chunk["content"] for chunk in chunked_data]
    metadatas = [chunk["meta_data"] for chunk in chunked_data]
    ids = [str(index) for index, _ in enumerate(chunked_data)]

    print('embeddings', embeddings)

    addAll = collection.add(
        documents=documents,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
    )

    print(addAll)

    # Hand both objects back: they were previously local-only, so the query
    # code after the call could only work off globals left by earlier cells.
    return gen, collection


gen, collection = embedPdf('gpt-4.pdf')

query_string = "What is the performance of GPT-4"
query_embedding = gen.embed(query_string)[0]

query = collection.query(
    embedding=query_embedding,
    n_results=5,
)

print(query)

342
342
[-0.057403162121772766, -0.04463536664843559, -0.006716681178659201, -0.007699054665863514, 0.044755760580301285, -0.05124007537961006, -0.016915036365389824, 0.07832682132720947, 0.05007052794098854, -0.04052967578172684, -0.05890699848532677, -0.020623521879315376, 0.01638336293399334, 0.10389180481433868, -0.016955144703388214, -0.06533598154783249, 0.004229317419230938, 0.020080504938960075, -0.1000128760933876, -0.05718523636460304, -0.02374526485800743, 0.04605412110686302, 0.06637968122959137, -0.023126501590013504, 0.02326352894306183, 0.04151042178273201, -0.00249940506182611, -0.10326822102069855, -0.014342015609145164, -0.04879685118794441, 0.019547652453184128, -0.03638960048556328, 0.04739947244524956, 0.09669756144285202, -0.11693989485502243, 0.08863189816474915, -0.0512777678668499, -0.028218254446983337, 0.01511848159134388, -0.06708843261003494, -0.005175513215363026, -0.031087461858987808, -0.007732637692242861, 0.014337843284010887, 0.13786906003952026, -0.0