In [46]:
import os

In [50]:


def process_pdf_with_processor(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str
) -> str:
    """Parse a PDF file with an existing Document AI processor.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; it is also used to build the regional
            API endpoint ``{location}-documentai.googleapis.com``.
        processor_id: Identifier of the already-created processor.
        file_path: Path of the PDF file to read and submit.

    Returns:
        The ``text`` attribute of the document returned by the processor.
    """
    # Function-local imports: the Google Cloud client libraries are only
    # needed when the function is actually called.
    from google.api_core.client_options import ClientOptions
    from google.cloud import documentai

    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Build the processor resource name
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the whole PDF into memory; the request carries the raw bytes.
    with open(file_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    raw_document = documentai.RawDocument(
        content=pdf_bytes, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
    )

    # Synchronous call: blocks until the processor has parsed the document.
    result = client.process_document(request=request)
    document_object = result.document

    print("DOCUMENT TEXT PARSED")
    print("\n\nSENDING FOR TRANSLATION")
    return document_object.text




In [37]:
def chunking_text(full_text: str, chunk_size: int = 1000, overlap: int = 200):
    """Split text into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.

    Args:
        full_text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of ``{"text": str, "metadata": {"chunk_index": int}}`` dicts in
        document order. An empty ``full_text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_size`` would never advance the
            window and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text_length = len(full_text)
    step = chunk_size - overlap

    chunks = []
    for chunk_index, start in enumerate(range(0, text_length, step)):
        end = start + chunk_size
        chunks.append({
            "text": full_text[start:end],
            # Construct metadata. You can add page numbers if you have them.
            "metadata": {"chunk_index": chunk_index},
        })
        # Once a window reaches the end of the text, any further window would
        # contain only characters that were already emitted as overlap.
        if end >= text_length:
            break

    return chunks

In [69]:
def translate_text(target_language: str, text: str, folder: str = "processed_files", filename: str = "translated_doc.txt") -> dict:
    """Translate text into the target language and save it to a file.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    The source language is detected first; when it already equals
    ``target_language`` the text is written out unchanged and no translation
    request is made.

    Args:
        target_language: Language code to translate into.
        text: Text to translate; ``bytes`` input is decoded as UTF-8.
        folder: Directory for the output file; created if it does not exist.
        filename: Name of the output file inside ``folder``. An existing file
            with the same name is overwritten.

    Returns:
        A dict with the keys ``"translatedText"`` (the text that was written
        to the file) and ``"detectedSourceLanguage"`` (the detected code).
    """
    from google.cloud import translate_v2

    translate_client = translate_v2.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    detection = translate_client.detect_language(text)
    source_lang = detection["language"]

    if source_lang != target_language:
        # format_="text" marks the input as plain text; with the API's HTML
        # default, characters such as apostrophes come back as HTML entities.
        translation = translate_client.translate(
            text, target_language=target_language, format_="text"
        )
        result = translation["translatedText"]
    else:
        result = text

    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, filename)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(result)

    print(f"Translation saved to {file_path}")
    print(f"Detected source language: {source_lang}")

    # The function is annotated ``-> dict`` and the chunked pipeline reads
    # these two keys from the result, so return them instead of None.
    return {
        "translatedText": result,
        "detectedSourceLanguage": source_lang,
    }

    


In [70]:
import config
def parse_and_translate_pdf(target_language, file_location):
    """Extract the text of a PDF and pass it to ``translate_text``.

    The Document AI processor is identified by ``config.PROJECT_ID``,
    ``config.LOCATION`` and ``config.PROCESSOR_ID``. Nothing is returned; the
    translation is written to disk by ``translate_text`` with its defaults.

    Args:
        target_language: Language code forwarded to ``translate_text``.
        file_location: Path of the PDF handed to the processor.
    """
    processor_settings = (
        config.PROJECT_ID,
        config.LOCATION,
        config.PROCESSOR_ID,
    )
    extracted_text = process_pdf_with_processor(*processor_settings, file_location)
    translate_text(target_language, extracted_text)
    

In [None]:
# Run the parse-and-translate pipeline on the sample PDF, with 'en' as the
# target language code.
file_path = "sample.pdf"
parse_and_translate_pdf('en', file_path)

In [72]:
import config


def parse_and_translate_pdf(target_language, file_location):
    """Parse a PDF, split its text into chunks and translate each chunk.

    The Document AI processor is identified by ``config.PROJECT_ID``,
    ``config.LOCATION`` and ``config.PROCESSOR_ID``. Each chunk produced by
    ``chunking_text`` is passed to ``translate_text``, whose result is read
    through its ``"translatedText"`` and ``"detectedSourceLanguage"`` keys.

    Args:
        target_language: Language code forwarded to ``translate_text``.
        file_location: Path of the PDF handed to the processor.

    Returns:
        One dict per chunk with the keys ``"translated_text"``,
        ``"detected_source_language"`` and ``"metadata"`` (the chunk's
        original metadata, so position information is preserved).
    """
    # Step 1: extract the raw text of the PDF.
    document_text = process_pdf_with_processor(
        config.PROJECT_ID,
        config.LOCATION,
        config.PROCESSOR_ID,
        file_location,
    )

    # Steps 2 and 3: cut the text into pieces and translate them one by one.
    translated_chunks = []
    for piece in chunking_text(document_text):
        piece_text, piece_metadata = piece["text"], piece["metadata"]
        translation = translate_text(target_language, piece_text)
        translated_chunks.append(
            {
                "translated_text": translation["translatedText"],
                "detected_source_language": translation["detectedSourceLanguage"],
                # Carry the metadata along to keep track of the chunk position.
                "metadata": piece_metadata,
            }
        )

    return translated_chunks

In [41]:
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Replace with your real OpenAI API key

def get_openai_embedding(text: str, file_path: str = None) -> list:
    """Return the OpenAI embedding vector for a piece of text.

    Args:
        text: The text to embed.
        file_path: Unused. Kept, now optional, so existing two-argument calls
            still work while ``get_openai_embedding(text)`` also works. The
            directory it pointed at used to be loaded and split, but the
            result was never used.

    Returns:
        The embedding as a list of floats.
    """
    # Use the LangChain wrapper the notebook already imports rather than the
    # legacy ``openai.Embedding.create`` call, which belongs to the pre-1.0
    # OpenAI SDK interface.
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    return embeddings.embed_query(text)

In [None]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

# NOTE(review): no API key is set in this cell; the OpenAI and Pinecone
# clients presumably read their keys from the environment — confirm.

# Initialize OpenAI Embeddings using LangChain
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")  # Specify which embedding model

# Load all text files from a directory
directory_path = "./processed_files"  # same folder name as translate_text's default output folder
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Use a TextSplitter to split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)
split_documents = text_splitter.split_documents(documents)

# Connect to the Pinecone index using LangChain's Pinecone wrapper
# Add all the split documents into the Pinecone vector database
# NOTE(review): no namespace is passed here, whereas other cells in this
# notebook use namespace 'circulars' — confirm which one is intended.
pinecone_index_name = "real-estate-docs"
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
vectorstore.add_documents(documents=split_documents )

print("Embeddings from text files residing in the directory, created, and inserted in Pinecone Vector Database successfully!")

In [25]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Re-create the vector store handle, this time bound to the 'circulars'
# namespace of the same index.
pinecone_index_name = "real-estate-docs"
from langchain_pinecone import PineconeVectorStore
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") 
vectorstore = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings, namespace='circulars')

In [6]:
# Delete the "split_files" directory and everything inside it.
# NOTE(review): no other visible cell creates this directory, and rmtree
# raises if the path does not exist, so this cell is not safe to re-run.
import shutil
shutil.rmtree("split_files")

In [26]:
# Delete every vector reachable through `vectorstore`. The recorded output of
# this cell is a 404 "Namespace not found" error, i.e. the call fails when the
# target namespace does not exist.
vectorstore.delete(delete_all=True)

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'Date': 'Sun, 12 Jan 2025 14:02:39 GMT', 'Content-Type': 'application/json', 'Content-Length': '55', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '33', 'x-pinecone-request-id': '8680537073599987404', 'x-envoy-upstream-service-time': '33', 'server': 'envoy'})
HTTP response body: {"code":5,"message":"Namespace not found","details":[]}


In [44]:
def embed_and_index_chunks(
    chunks: list,
    index,   # pinecone.Index object
    document_id: str,
    references: list = None,
    batch_size: int = 100,
):
    """Embed translated chunks and upsert them into a Pinecone index.

    Args:
        chunks: Dicts with the keys ``"translated_text"`` and ``"metadata"``.
        index: Object exposing ``upsert(vectors=..., namespace=...)``.
        document_id: Identifier of the source document; it prefixes every
            vector ID and is stored in the metadata.
        references: Optional list of related document IDs stored with every
            chunk, e.g. ``["magma-circular-2023", "DCPR-guidelines-v2"]``.
        batch_size: Maximum number of vectors sent per ``upsert`` call.

    Returns:
        None. Vectors are written to the ``"circulars"`` namespace.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Relationships across documents, stored in the metadata when given.
    doc_relationships = references or []

    vectors_to_upsert = []
    for position, chunk in enumerate(chunks):
        text = chunk["translated_text"]
        chunk_meta = chunk["metadata"]

        # Fall back to the position in the list when the metadata carries no
        # index; a constant fallback would give every such chunk the same ID.
        chunk_index = chunk_meta.get("chunk_index", position)
        vector_id = f"{document_id}-chunk{chunk_index}"

        # Embed the chunk with OpenAI
        embedding_vector = get_openai_embedding(text)

        vectors_to_upsert.append({
            "id": vector_id,
            "values": embedding_vector,
            "metadata": {
                "chunk_index": chunk_index,
                "document_id": document_id,
                "relationships": doc_relationships,
                "text": text,  # the chunk text is kept alongside the vector
            },
        })

    if not vectors_to_upsert:
        # Nothing to send; skip the request entirely.
        print(f"No chunks to upsert for '{document_id}'.")
        return

    # Send the vectors in bounded batches so that a large document does not
    # end up in one oversized request.
    for offset in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(
            vectors=vectors_to_upsert[offset:offset + batch_size],
            namespace="circulars",
        )
    print(f"Upserted {len(vectors_to_upsert)} chunks from '{document_id}' to Pinecone.")


In [None]:
# End-to-end run: parse and translate the sample PDF, then embed the
# resulting chunks and upsert them under the document ID "circulars".
# NOTE(review): `pinecone_init` is not defined in any visible cell — confirm
# where it comes from before running on a fresh kernel.
index = pinecone_init()
file_path = "sample.pdf"

chunks = parse_and_translate_pdf('en',file_path)
embed_and_index_chunks(chunks, index, document_id="circulars")

In [23]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import ConfigurableField
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector

In [1]:
import config
import os 
import getpass

# Connection settings are taken from the environment when present. The
# password is never stored in the notebook: it comes from NEO4J_PASSWORD or,
# failing that, from an interactive prompt.
# NOTE(review): a password was previously committed in this cell; it should be
# treated as compromised and rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+ssc://5b0e3ff0.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")


# Export the settings so that code reading them from the environment (for
# example showGraph) sees the same values.
os.environ["OPENAI_API_KEY"] = config.OPENAI_API_KEY
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USERNAME
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD
     

In [None]:
# Use the `parse_and_translate_pdf` from the local `parsing` module. This
# import rebinds the name, and the call passes a third positional argument
# that the two-parameter definitions in this notebook do not accept.
from parsing import parse_and_translate_pdf

parse_and_translate_pdf('en', "./temp_files/20130617 Tree-Cuting and replant Circular Act 1975.pdf", "test.txt")

In [None]:
%pip install --upgrade --quiet  langchain langchain-community langchain-openai langchain-experimental neo4j tiktoken yfiles_jupyter_graphs


In [13]:
from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
from langchain_openai import ChatOpenAI

from langchain_experimental.graph_transformers import LLMGraphTransformer
# Load every .txt file from the directory of processed documents.
# NOTE(review): DirectoryLoader, TextLoader and RecursiveCharacterTextSplitter
# are not imported in this cell; they must already be in the kernel.
directory_path = "./processed_files"
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Split the documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini-2024-07-18")
llm_transformer = LLMGraphTransformer(llm=llm)

# Extract the graph from the chunks. The chunks were previously computed but
# left unused, so each whole file was sent to the LLM in a single request.
graph_documents = llm_transformer.convert_to_graph_documents(split_documents)

# Store the extracted nodes and relationships, together with their source
# documents, in Neo4j.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)


In [None]:
# Build a question-answering chain over `graph` and ask it one question.
# `allow_dangerous_requests=True` is an explicit opt-in on the chain;
# NOTE(review): presumably it acknowledges that LLM-generated Cypher is run
# against the database — confirm the database user has limited permissions.
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0), graph=graph, verbose=True, allow_dangerous_requests=True
)

chain.invoke({"query": "What is it about tree cutting?"})

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_neo4j import  Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain.text_splitter import TokenTextSplitter
from typing import Tuple, List, Optional
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_neo4j.vectorstores.neo4j_vector import remove_lucene_chars

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

# Load every .txt file from the directory of processed documents.
directory_path = "./processed_files"
loader = DirectoryLoader(directory_path, glob="*.txt", loader_cls=TextLoader)  # Load only .txt files
documents = loader.load()

# Split the documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(documents)

llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini-2024-07-18")
llm_transformer = LLMGraphTransformer(llm=llm)

# Extract the graph from the chunks. The chunks were previously computed but
# left unused, so each whole file was sent to the LLM in a single request.
graph_documents = llm_transformer.convert_to_graph_documents(split_documents)

# Store the extracted nodes and relationships, together with their source
# documents, in Neo4j.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)


# Default Cypher query used by showGraph: up to 50 relationships of any type
# other than MENTIONS, with their start and end nodes.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"



# Vector index over the existing `Document` nodes: their `text` property is
# embedded with OpenAI embeddings and stored in the `embedding` property.
# NOTE(review): no connection arguments are passed, so the Neo4j settings
# presumably come from the NEO4J_* environment variables — confirm.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)



# Extract entities from text
# Schema handed to `llm.with_structured_output(...)` in this cell.
# NOTE(review): the class docstring and the field description are presumably
# forwarded to the model as part of the schema — treat them as prompt text.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required field (the `...` default): the entity names found in the text.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Two-message chat prompt for entity extraction; the text to analyse is
# substituted for `{question}`.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)
     

# Pipe the prompt into the LLM configured to answer with the `Entities` schema.
entity_chain = prompt | llm.with_structured_output(Entities)

def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text query string from free text.

    Characters stripped by ``remove_lucene_chars`` are removed, the remainder
    is split on whitespace, and every word is suffixed with ``~2`` and joined
    with ``AND``, e.g. ``"tree cutting"`` -> ``"tree~2 AND cutting~2"``.

    Args:
        input: Free text, typically an entity name.

    Returns:
        The query string, or ``""`` when no word is left after cleaning
        (previously this case raised ``IndexError``).
    """
    words = remove_lucene_chars(input).split()
    if not words:
        return ""
    return " AND ".join(f"{word}~2" for word in words)

     

# Fulltext index query
def structured_retriever(question: str) -> str:
    """Collect graph relationships around the entities named in a question.

    Entities are extracted with ``entity_chain``. For each one, the ``entity``
    full-text index is queried (at most 2 matching nodes) and the non-MENTIONS
    relationships of the matches are rendered as
    ``"<source id> - <TYPE> -> <target id>"`` lines, at most 50 per entity.

    Args:
        question: The user question to extract entities from.

    Returns:
        All relationship lines joined with newlines; ``""`` if none are found.
    """
    entities = entity_chain.invoke({"question": question})

    relationship_lines = []
    for entity in entities.names:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable in this entity name; skip the index lookup.
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": full_text_query},
        )
        relationship_lines.extend(el['output'] for el in response)

    # Join once at the end: appending one joined block per entity fused the
    # last line of one entity with the first line of the next.
    return "\n".join(relationship_lines)



In [None]:

# Create the full-text index named `entity` over the `id` property of
# `__Entity__` nodes, unless it already exists. structured_retriever queries
# this index by name, so this cell must run before it is called.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

In [None]:
# Quick manual check of the graph-based retriever with a sample question.
structured_retriever("Tree cutting")

In [None]:
def retriever(question: str):
    """Assemble the retrieval context for a question.

    The context combines the output of ``structured_retriever`` with the page
    contents of the documents returned by ``vector_index.similarity_search``.

    Args:
        question: The user question; it is also printed as the search query.

    Returns:
        A single string with a "Structured data:" section followed by an
        "Unstructured data:" section.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    similar_documents = vector_index.similarity_search(question)
    passages = [document.page_content for document in similar_documents]
    joined_passages = "#Document ".join(passages)
    return f"""Structured data:
                {graph_context}
                Unstructured data:
                {joined_passages}
                 """

In [None]:
from langchain_core.output_parsers import StrOutputParser
# Question to answer. The same string is used for retrieval and for the final
# LLM call, so the typo "Whst" is corrected to "What".
# NOTE(review): this rebinds `prompt`, which an earlier cell uses for the
# entity-extraction ChatPromptTemplate.
prompt = "What is the policy about?"
prompt_template = PromptTemplate(
    template="""
    Use the following context to answer the question as accurately as possible:
    Context: {context}
    Question: {question}
    Answer:""",
    input_variables=["context", "question"]
    )
# Prompt -> LLM -> plain string.
chain = prompt_template | llm | StrOutputParser()
context = retriever(prompt)
print(context)
output = chain.invoke({"context": context, "question": prompt})
print(output)

In [None]:
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
     

# Enable custom widgets when running inside Google Colab; outside Colab the
# import fails and there is nothing to do.
try:
    import google.colab
    from google.colab import output
    output.enable_custom_widget_manager()
except ImportError:
    # Only the "not running in Colab" case is ignored; a bare `except:` would
    # also hide unrelated failures, including KeyboardInterrupt.
    pass
     

def showGraph(cypher: str = default_cypher):
    """Run a Cypher query and display its result as an interactive graph.

    Connection settings are read from the ``NEO4J_URI``, ``NEO4J_USERNAME``
    and ``NEO4J_PASSWORD`` environment variables.

    Args:
        cypher: Query whose result graph is shown; defaults to
            ``default_cypher``.

    Returns:
        The ``GraphWidget`` that was displayed.

    Raises:
        KeyError: If one of the environment variables is not set.
    """
    # The context managers close the session and the driver on every path;
    # previously both were left open on each call.
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"],
              os.environ["NEO4J_PASSWORD"]),
    ) as driver:
        with driver.session() as session:
            # The widget is built while the session is still open.
            widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    display(widget)
    return widget
     

# Render the graph returned by the default Cypher query.
showGraph()