<a href="https://colab.research.google.com/github/rawlingsnsame/langchain/blob/main/generate_pdf_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain pinecone langchain-openai pypdf langchain_pinecone -q

In [None]:
!pip install langchain_community -q

In [None]:
!pip install "unstructured[pdf]"

In [None]:
!apt-get install poppler-utils -q # Install the poppler library containing pdfinfo utility

In [None]:
import os
import glob
import pinecone
from pypdf import PdfReader
from langchain_community.document_loaders import DirectoryLoader
from langchain_pinecone import Pinecone
from langchain.text_splitter import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [None]:
from google.colab import userdata

# Copy each API key returned by google.colab's userdata.get into the process
# environment; later cells read both keys back through os.environ.
for secret_name in ("OPENAI_API_KEY", "PINECONE_API_KEY"):
  os.environ[secret_name] = userdata.get(secret_name)

In [None]:
# Path of the PDF file to process.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted before this file is readable; no mount call appears in
# this notebook. TODO confirm.
pdf_path = "/content/drive/MyDrive/chatdocs/tax-code-full.pdf"

In [None]:
def extract_text_from_pdf(pdf_path):
  """Read a PDF and return one Document per page that has extractable text.

  Args:
    pdf_path: Location of the PDF file; passed unchanged to PdfReader.

  Returns:
    A list of Document objects. Each one holds the text of a single page in
    page_content and the 1-based page index under the "page_number" metadata
    key. Pages that yield no text (None, empty, or whitespace only) are
    skipped. Returns None if any exception is raised while reading; the
    error is printed instead of being propagated.
  """
  try:
    reader = PdfReader(pdf_path)
    documents = []

    for page_num, page in enumerate(reader.pages, start=1):
      # Normalise a missing result to "" so the emptiness check below is safe.
      text = page.extract_text() or ""
      if not text.strip():
        # Nothing to embed on this page; an empty Document adds no content.
        continue
      documents.append(
          Document(page_content=text, metadata={"page_number": page_num})
      )
  except Exception as e:
    print(f"Error extracting text from PDF: {e}")
    return None

  return documents

In [None]:
def split_documents(documents, chunk_size=250, chunk_overlap=100):
  """Break documents into smaller chunks with RecursiveCharacterTextSplitter.

  Args:
    documents: Documents handed to the splitter's split_documents method.
    chunk_size: Forwarded unchanged as the splitter's chunk_size.
    chunk_overlap: Forwarded unchanged as the splitter's chunk_overlap.

  Returns:
    Whatever the splitter's split_documents call returns, or None if any
    exception is raised (the error is printed rather than propagated).
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  try:
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
  except Exception as err:
    print(f"Error splitting documents: {err}")
    return None
  return chunks

In [None]:
from langchain_pinecone import Pinecone
from pinecone import ServerlessSpec

def create_embeddings_index(documents):
  """Embed documents with OpenAI and upload them to a Pinecone index.

  The index "cameroon-tax-doc" is created first if the Pinecone client does
  not already list it.

  Args:
    documents: Documents passed to PineconeVectorStore.from_documents.

  Returns:
    The object returned by PineconeVectorStore.from_documents on success, or
    None if any step raised (the error is printed rather than propagated), so
    callers can tell a failed upload from a successful one.
  """
  index_name = "cameroon-tax-doc"

  try:
    embeddings = OpenAIEmbeddings(
      model="text-embedding-ada-002",
      openai_api_key=os.environ["OPENAI_API_KEY"]
    )
    # NOTE(review): `environment` is kept as in the original call; confirm it
    # is still needed, since the index below is created with a ServerlessSpec.
    pc = pinecone.Pinecone(api_key=os.environ.get("PINECONE_API_KEY"), environment="us-east-1-aws")

    if index_name not in pc.list_indexes().names():
      pc.create_index(
          name=index_name,
          # Must equal the embedding vector length; 1536 for text-embedding-ada-002.
          dimension=1536,
          metric="euclidean",
          spec=ServerlessSpec(
              cloud="aws",
              region="us-east-1"
          )
      )

    # PineconeVectorStore replaces the deprecated langchain_pinecone.Pinecone alias.
    return PineconeVectorStore.from_documents(
      documents=documents,
      embedding=embeddings,
      index_name=index_name
    )
  except Exception as e:
    print(f"Error creating embeddings index: {e}")
    return None


In [None]:
def main():
  """Run the pipeline: extract the PDF's pages, chunk them, then index them.

  Stops early when extraction or splitting returns None; each of those
  functions prints its own error message before returning None.
  """
  # The original referenced `pdf_path_2`, but the notebook only defines
  # `pdf_path`, so the call raised NameError before doing any work.
  pages = extract_text_from_pdf(pdf_path)
  if pages is None:
    return
  # Report a count instead of dumping every Document into the cell output.
  print(f"Extracted {len(pages)} page documents.")

  chunks = split_documents(pages)
  if chunks is None:
    return
  print(f"Split into {len(chunks)} chunks.")

  # NOTE: the result of create_embeddings_index is not inspected here, so the
  # message below is printed even if that call reported an error.
  create_embeddings_index(chunks)
  print(f"Created embeddings index with {len(chunks)} documents.")

In [None]:
# Run the pipeline defined by main() above.
main()