In [46]:
import os

In [50]:


def process_pdf_with_processor(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str
):
    
    from google.api_core.client_options import ClientOptions
    from google.cloud import documentai
    """
    Uses an existing processor to parse a PDF file via Document AI.
    """
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Build the processor resource name
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    # Read the PDF into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Prepare the raw document
    raw_document = documentai.RawDocument(
        content=image_content, mime_type="application/pdf"
    )

    # Construct the request
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
    )

    # Send the request
    result = client.process_document(request=request)
    document_object = result.document

    print("DOCUMENT TEXT PARSED")
    print("\n\nSENDING FOR TRANSLATION")
    return document_object.text




In [37]:
def chunking_text(full_text: str, chunk_size: int = 1000, overlap: int = 200):
  
  
    chunks = []
    start = 0
    end = chunk_size

    chunk_index = 0
    while start < len(full_text):
        # Extract the chunk
        text_chunk = full_text[start:end]
        
        # Construct metadata. You can add page numbers if you have them.
        metadata = {
            "chunk_index": chunk_index
        }
        
        chunks.append({
            "text": text_chunk,
            "metadata": metadata
        })
        
        # Move the window
        start = end - overlap
        end = start + chunk_size
        chunk_index += 1

    return chunks

In [69]:
def translate_text(target_language: str, text: str, folder: str = "processed_files", filename: str = "translated_doc.txt") -> dict:
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    from google.cloud import translate_v2

    translate_client = translate_v2.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    detection = translate_client.detect_language(text)
    source_lang = detection["language"]

    if source_lang != target_language:
        translation = translate_client.translate(
            text, target_language=target_language
        )
        result =  translation["translatedText"]

    else:
        result = text
        

    os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, filename)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(result)
   

    print(f"Translation saved to {file_path}")
    print(f"Detected source language: {source_lang}")

    


In [70]:
import config
def parse_and_translate_pdf(target_language, file_location):
    parsed_text = process_pdf_with_processor(
        config.PROJECT_ID,
        config.LOCATION,
        config.PROCESSOR_ID,
        file_location
    )
    translate_text(target_language, parsed_text)
    

In [71]:
file_path = "sample.pdf"
parse_and_translate_pdf('en', file_path)

DOCUMENT TEXT PARSED


SENDING FOR TRANSLATION
Translation saved to processed_files/translated_doc.txt
Detected source language: mr


In [72]:
import config


def parse_and_translate_pdf(target_language, file_location):
  
    # Parsing PDF text
    parsed_text = process_pdf_with_processor(
        config.PROJECT_ID,
        config.LOCATION,
        config.PROCESSOR_ID,
        file_location
    )

    # Chunkin the text into smaller pieces
    chunks = chunking_text(parsed_text)

    # translating each chunk
    translated_chunks = []
    for chunk_data in chunks:
        chunk_text = chunk_data["text"]
        metadata = chunk_data["metadata"]
    
        result = translate_text(target_language, chunk_text)
        translated_chunk = {
            "translated_text": result["translatedText"],
            "detected_source_language": result["detectedSourceLanguage"],
            # Include metadata so we keep track of position, page, etc.
            "metadata": metadata
        }
        translated_chunks.append(translated_chunk)

    return translated_chunks

In [41]:
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Replace with your real OpenAI API key

def get_openai_embedding(text: str, file_path: str) -> list:
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    loader = DirectoryLoader(file_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
    documents = loader.load()

    # Use a TextSplitter to split the documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
    split_documents = text_splitter.split_documents(documents)
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )
    embedding_vector = response["data"][0]["embedding"]  # a list[float]
    return embedding_vector

In [2]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

# Set your API keys for OpenAI

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")  # Specify which embedding model

# Load all text files from a directory
directory_path = "./processed_files"  # directory path with all the national weather service documents
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
pinecone_index_name = "real-estate-docs"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("Embeddings from text files residing in the directory, created, and inserted in Pinecone Vector Database successfully!")

Embeddings from text files residing in the directory, created, and inserted in Pinecone Vector Database successfully!


In [3]:
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)

In [None]:
vectorstore.delete(delete_all=True)

In [44]:
def embed_and_index_chunks(
    chunks: list,
    index,   # pinecone.Index object
    document_id: str,
    references: list = None
):

    vectors_to_upsert = []

    # Example of how you might represent relationships across docs
    # in metadata if you have them
    doc_relationships = references or []  # e.g. ["magma-circular-2023", "DCPR-guidelines-v2"]

    for chunk in chunks:
        text = chunk["translated_text"]
        chunk_meta = chunk["metadata"]

        # Create a unique ID for the chunk in this doc
        chunk_index = chunk_meta.get("chunk_index", 0)
        vector_id = f"{document_id}-chunk{chunk_index}"

        # Embed the chunk with OpenAI
        embedding_vector = get_openai_embedding(text)

        # Add any extra metadata you want:
        #  - the text content (if you want to keep it in Pinecone)
        #  - the doc ID
        #  - cross-reference doc relationships
        pinecone_metadata = {
            "chunk_index": chunk_index,
            "document_id": document_id,
            "relationships": doc_relationships, 
            "text": text,  # you can store the chunk text, or a summary
        }

        vectors_to_upsert.append({
            "id": vector_id,
            "values": embedding_vector,
            "metadata": pinecone_metadata
        })

    # Upsert all chunks in a single call
    index.upsert(vectors=vectors_to_upsert, namespace="circulars")
    print(f"Upserted {len(vectors_to_upsert)} chunks from '{document_id}' to Pinecone.")


In [45]:
index = pinecone_init()
file_path = "sample.pdf"

chunks = parse_and_translate_pdf('en',file_path)
embed_and_index_chunks(chunks, index, document_id="circulars")

##################file path is sample.pdf
DOCUMENT TEXT PARSED


SENDING FOR TRANSLATION
Translation: Subject: Brihanmumbai Municipal Corporation Garden Department Circular No.0041/33/2013- JTMC-DMU dated 17/06/13 Trees NoC Cantalo Under Section 8 of the Maharashtra (Urban Areas) Tree Preservation Act, 1975, permission is also given for cutting and replanting of trees coming under various development works. For this, applications are received from various public infrastructure development organizations, as well as private building, professionals, architects. The application for infrastructure from various development organizations is circulated to the Tree Authority officers. In addition, the proposal for cutting of trees coming under development on private land is received from the concerned developer. Before submitting the proposal to the Garden Department, the Building Proposal Department gives permission for the development work against the No Objection Certificate of the Tree Auth

APIRemovedInV1: 

You tried to access openai.Embedding, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
