In [None]:
# May 2025
# Transforming documents into graph-based representations using a large language model (LLM).

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the process
# environment, then read the Groq key from there. Keeping the key out of the
# notebook avoids committing a secret by accident.
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")

# Fail fast with an actionable message instead of a less obvious client error
# later on. If you prefer an interactive prompt, set
# os.environ["GROQ_API_KEY"] from getpass.getpass(...) before this cell;
# never paste the key into the notebook itself.
if not api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to a .env file or export it in the "
        "environment before running this notebook."
    )

from langchain_groq import ChatGroq

# NOTE(review): no key is passed explicitly; this relies on ChatGroq picking
# it up from the GROQ_API_KEY environment variable - confirm against the
# installed langchain-groq version.
llm = ChatGroq(model="llama3-8b-8192")

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
# Prompt asking the model to list entities first and then relationships in an
# arrow notation. The template is spelled out line by line so that every
# newline (and the trailing space after "text.") is explicit.
prompt = PromptTemplate(
    template=(
        "\n"
        "Extract entities and relationships from the following text. \n"
        "Return them in a structured format like this:\n"
        "\n"
        "Entities:\n"
        "- Entity1\n"
        "- Entity2\n"
        "...\n"
        "\n"
        "Relationships:\n"
        "- (Entity1) --[relationship_type]--> (Entity2)\n"
        "...\n"
        "\n"
        "Text:\n"
        "{text_chunk}\n"
    ),
    input_variables=["text_chunk"],
)


In [None]:
# Extraction chain: fills `prompt`, sends it to the `llm` created earlier and
# parses the reply with StrOutputParser.
# (To experiment with OpenAI instead, rebind `llm` to ChatOpenAI(temperature=0).)
output_parser = StrOutputParser()
chain = LLMChain(
    prompt=prompt,
    llm=llm,
    output_parser=output_parser,
)


In [None]:
# Read the source document from disk.
loader = TextLoader("my_doc2.txt")
docs = loader.load()

# Cut it into chunks of size 1000 with an overlap of 100 between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)


In [None]:
# One LLM call per chunk; the raw replies are kept in chunk order.
results = [chain.run(text_chunk=chunk.page_content) for chunk in chunks]


In [None]:
# Display the raw extraction output, one entry per chunk.
results

In [None]:
import networkx as nx

import re


def _clean_entity(raw: str) -> str:
    """Normalise one side of a relationship line into a bare entity name.

    Removes a leading list marker ("- ", "* ", "1. "), sentence punctuation
    that follows a closing parenthesis, and one pair of wrapping parentheses.
    Parentheses inside the name (e.g. "Foo (bar)") are left untouched.
    """
    text = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", raw).strip()
    if text.endswith((").", "),", ");")):
        text = text[:-1]
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1].strip()
    return text


# Directed graph: nodes are entity names, each edge carries the relationship
# text in its "label" attribute.
G = nx.DiGraph()

for res in results:
    for line in res.splitlines():
        # Expected shape: "- (Entity1) --[relationship_type]--> (Entity2)".
        # partition() never raises, so malformed lines are skipped instead of
        # aborting the whole cell.
        head, open_marker, remainder = line.partition("--[")
        relation, close_marker, tail = remainder.partition("]-->")
        if not (open_marker and close_marker):
            continue  # not a relationship line

        entity1 = _clean_entity(head)
        relation = relation.strip()
        entity2 = _clean_entity(tail)

        # Ignore lines where any of the three parts came out empty.
        if entity1 and relation and entity2:
            G.add_edge(entity1, entity2, label=relation)


In [None]:
import matplotlib.pyplot as plt

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run of the notebook.
pos = nx.spring_layout(G, seed=42)

# Explicit figure/axes so the drawing calls and the title target the same plot.
fig, ax = plt.subplots(figsize=(12, 8))
nx.draw(G, pos, ax=ax, with_labels=True, node_color='lightblue', edge_color='gray')

# Relationship text is stored in the "label" edge attribute.
edge_labels = nx.get_edge_attributes(G, 'label')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, ax=ax)

ax.set_title("Entities and relationships extracted from the document")
plt.show()


In [None]:
# The rest is rubbish. The best path from here is to implement a Knowledge Graph RAG.
# Still studying that...

In [None]:
# --- 1. Imports ---
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document, BaseRetriever
from typing import List
import networkx as nx
import os

# # --- 2. Set API Key ---
# os.environ["OPENAI_API_KEY"] = "your-openai-key"

# --- 3. Load and Split Document ---
# Read the text file, then cut it into chunks of size 1000 with overlap 100.
loader = TextLoader("my_doc2.txt")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

# --- 4. Triple Extraction Prompt ---
# Asks the model for one "(Subject) --[Relation]--> (Object)" line per fact.
triple_prompt = PromptTemplate(
    template=(
        "\n"
        "Extract subject-predicate-object triples from the following text.\n"
        "Return in format: (Subject) --[Relation]--> (Object)\n"
        "\n"
        "Text:\n"
        "{text_chunk}\n"
    ),
    input_variables=["text_chunk"],
)

# Uses the `llm` created earlier in the notebook.
# (Alternative backend: llm = ChatOpenAI(temperature=0))
extractor_chain = LLMChain(prompt=triple_prompt, llm=llm)

# --- 5. Extract Triples ---
# Run the extractor on every chunk and flatten the replies into one list of
# lines (the reply is trimmed, then split on "\n").
triples = [
    reply_line
    for chunk in chunks
    for reply_line in extractor_chain.run(text_chunk=chunk.page_content).strip().split("\n")
]

# --- 6. Build Knowledge Graph ---
import re


def _clean_entity(raw: str) -> str:
    """Normalise one side of a triple into a bare entity name.

    Removes a leading list marker ("- ", "* ", "1. "), sentence punctuation
    that follows a closing parenthesis, and one pair of wrapping parentheses.
    Parentheses inside the name (e.g. "Foo (bar)") are left untouched.
    """
    text = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", raw).strip()
    if text.endswith((").", "),", ");")):
        text = text[:-1]
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1].strip()
    return text


# Directed graph: subject -> object, with the relation stored as "label".
G = nx.DiGraph()
for triple in triples:
    # Expected shape: "(Subject) --[Relation]--> (Object)". partition() never
    # raises, so no blanket try/except is needed; lines that do not contain
    # both markers in order are simply skipped.
    head, open_marker, remainder = triple.partition("--[")
    pred, close_marker, tail = remainder.partition("]-->")
    if not (open_marker and close_marker):
        continue

    subj = _clean_entity(head)
    pred = pred.strip()
    obj = _clean_entity(tail)

    # Skip incomplete triples rather than adding empty-string nodes.
    if subj and pred and obj:
        G.add_edge(subj, obj, label=pred)

# --- 7. Graph-Based Retriever ---
class GraphRetriever(BaseRetriever):
    """Keyword-based retriever over a directed knowledge graph.

    A node is considered relevant when the whole query is a substring of its
    name (the original rule) or when the node name shares at least one
    non-trivial word with the query. Every edge touching a relevant node,
    outgoing or incoming, is returned once as a "<subject> <relation> <object>."
    sentence.

    NOTE(review): recent LangChain versions expect subclasses to implement
    `_get_relevant_documents`; overriding the public method relies on the
    library's legacy compatibility path - confirm against the installed version.
    """

    graph: nx.DiGraph

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the graph facts related to `query`.

        Args:
            query: Free-text question or keyword.

        Returns:
            One Document per matching edge, in node order, without duplicates.
        """
        # Words too generic to identify an entity on their own.
        stopwords = {
            "a", "an", "the", "is", "are", "was", "were", "be", "do", "does", "did",
            "what", "why", "who", "whom", "when", "where", "which", "how",
            "of", "in", "on", "at", "to", "for", "by", "with", "and", "or",
        }
        punctuation = "?!.,;:\"'()[]"

        query_lc = query.lower()
        keywords = {word.strip(punctuation) for word in query_lc.split()}
        keywords -= stopwords
        keywords.discard("")

        def matches(node) -> bool:
            node_lc = node.lower()
            if query_lc in node_lc:  # original rule, kept for compatibility
                return True
            node_words = {word.strip(punctuation) for word in node_lc.split()}
            return bool(keywords & node_words)

        results = []
        seen_edges = set()  # an edge can be reached from both of its endpoints
        for node in self.graph.nodes:
            if not matches(node):
                continue
            touching = list(self.graph.out_edges(node, data=True))
            touching += list(self.graph.in_edges(node, data=True))
            for subj, obj, data in touching:
                if (subj, obj) in seen_edges:
                    continue
                seen_edges.add((subj, obj))
                relation = data["label"]
                sentence = f"{subj} {relation} {obj}."
                results.append(Document(page_content=sentence))
        return results

# Retriever backed by the knowledge graph built above.
retriever = GraphRetriever(graph=G)

# --- 8. Custom Prompt (to limit LLM hallucination) ---
# The model is told to answer from the retrieved context only.
qa_prompt = PromptTemplate(
    template=(
        "\n"
        "Use ONLY the information below to answer the question.\n"
        "If the answer is not contained, say \"I don't know.\"\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "Answer:\n"
    ),
    input_variables=["context", "question"],
)

# --- 9. RAG Pipeline ---
# chain_type="stuff" with the custom QA prompt.
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": qa_prompt},
    retriever=retriever,
    llm=llm,
)

# --- 10. Run RAG QA ---
question = " billion freezing?"
answer = rag_chain.run(question)

# Blank line, then the question and the answer on consecutive lines.
print(f"\nQ: {question}\nA: {answer}")


In [None]:
# Dump every extracted line, one per row, for manual inspection.
for extracted_line in triples:
    print(extracted_line)

In [None]:
# Create a Graph Retriever
from langchain.schema import BaseRetriever, Document
from typing import List
import networkx as nx

class GraphRetriever(BaseRetriever):
    """Keyword-based retriever over a directed knowledge graph.

    A node is considered relevant when the whole query is a substring of its
    name (the original rule) or when the node name shares at least one
    non-trivial word with the query. Every edge touching a relevant node,
    outgoing or incoming, is returned once in the arrow notation
    "(subject) --[relation]--> (object)".

    NOTE(review): recent LangChain versions expect subclasses to implement
    `_get_relevant_documents`; overriding the public method relies on the
    library's legacy compatibility path - confirm against the installed version.
    """

    graph: nx.DiGraph  # declared explicitly so Pydantic accepts it as a field

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the graph facts related to `query`.

        Args:
            query: Free-text question or keyword.

        Returns:
            One Document per matching edge, in node order, without duplicates.
        """
        # Words too generic to identify an entity on their own.
        stopwords = {
            "a", "an", "the", "is", "are", "was", "were", "be", "do", "does", "did",
            "what", "why", "who", "whom", "when", "where", "which", "how",
            "of", "in", "on", "at", "to", "for", "by", "with", "and", "or",
        }
        punctuation = "?!.,;:\"'()[]"

        query_lc = query.lower()
        keywords = {word.strip(punctuation) for word in query_lc.split()}
        keywords -= stopwords
        keywords.discard("")

        def matches(node) -> bool:
            node_lc = node.lower()
            if query_lc in node_lc:  # original rule, kept for compatibility
                return True
            node_words = {word.strip(punctuation) for word in node_lc.split()}
            return bool(keywords & node_words)

        relevant_docs = []
        seen_edges = set()  # an edge can be reached from both of its endpoints
        for node in self.graph.nodes:
            if not matches(node):
                continue
            touching = list(self.graph.out_edges(node, data=True))
            touching += list(self.graph.in_edges(node, data=True))
            for subj, obj, data in touching:
                if (subj, obj) in seen_edges:
                    continue
                seen_edges.add((subj, obj))
                relation = data['label']
                doc = Document(page_content=f"({subj}) --[{relation}]--> ({obj})")
                relevant_docs.append(doc)
        return relevant_docs


In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Instantiate the retriever on the current graph (works now that `graph` is a
# declared field on the class).
retriever = GraphRetriever(graph=G)

# Retrieval QA chain, chain_type="stuff", without a custom prompt.
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=llm,
)


In [None]:
# Ask one question through the graph-backed RAG chain and print the answer.
query = "What Trump did?"
response = rag_chain.run(query)
print(response)

In [None]:
import os
import networkx as nx
from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document, BaseRetriever
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# --- 1. Set your OpenAI API key ---
# os.environ["OPENAI_API_KEY"] = "your-api-key"  # Replace with your actual key

# --- 2. Load and split document ---
# Read the text file, then cut it into chunks of size 1000 with overlap 100.
loader = TextLoader("my_doc2.txt")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

# --- 3. Triple extraction prompt ---
# One-shot prompt: shows a single worked example of the arrow notation before
# the text to analyse.
triple_prompt = PromptTemplate(
    template=(
        "\n"
        "Extract clear subject-verb-object triples from the following text.\n"
        "\n"
        "Example:\n"
        'Text: "Einstein developed the theory of relativity."\n'
        "Output: (Einstein) --[developed]--> (theory of relativity)\n"
        "\n"
        "Text:\n"
        "{text_chunk}\n"
    ),
    input_variables=["text_chunk"],
)
# Uses the `llm` created earlier in the notebook.
# (Alternative backend: llm = ChatOpenAI(temperature=0))
extractor_chain = LLMChain(prompt=triple_prompt, llm=llm)
from langchain.embeddings import HuggingFaceEmbeddings

# --- 4. Extract triples ---
# Run the extractor on every chunk and flatten the trimmed replies into one
# list of lines.
triples = [
    reply_line
    for chunk in chunks
    for reply_line in extractor_chain.run(text_chunk=chunk.page_content).strip().splitlines()
]

# --- 5. Build the graph ---
import re


def _clean_entity(raw: str) -> str:
    """Normalise one side of a triple into a bare entity name.

    Removes a leading list marker ("- ", "* ", "1. "), sentence punctuation
    that follows a closing parenthesis, and one pair of wrapping parentheses.
    Parentheses inside the name (e.g. "Foo (bar)") are left untouched.
    """
    text = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", raw).strip()
    if text.endswith((").", "),", ");")):
        text = text[:-1]
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1].strip()
    return text


# Directed graph: subject -> object, with the relation stored as "label".
G = nx.DiGraph()
facts = []  # For FAISS vector store

for triple in triples:
    # Expected shape: "(Subject) --[Relation]--> (Object)". partition() never
    # raises, so no blanket try/except is needed; lines that do not contain
    # both markers in order are simply skipped.
    head, open_marker, remainder = triple.partition('--[')
    pred, close_marker, tail = remainder.partition(']-->')
    if not (open_marker and close_marker):
        continue

    subj = _clean_entity(head)
    pred = pred.strip()
    obj = _clean_entity(tail)

    # Skip incomplete triples rather than indexing half-empty facts.
    if not (subj and pred and obj):
        continue

    G.add_edge(subj, obj, label=pred)
    # The same fact as a plain sentence, for embedding.
    fact = f"{subj} {pred} {obj}."
    facts.append(Document(page_content=fact))

# --- 6. Embed the graph facts using FAISS ---
# Building a FAISS index from an empty list fails with an unhelpful error, so
# stop here with an explicit message when no triple could be parsed.
if not facts:
    raise ValueError(
        "No triples were parsed from the LLM output, so there is nothing to "
        "index. Inspect `triples` to see what the model returned."
    )

embedding_model = HuggingFaceEmbeddings()
vectorstore = FAISS.from_documents(facts, embedding_model)

# --- 7. Semantic retriever using FAISS over graph facts ---
class GraphSemanticRetriever(BaseRetriever):
    """Retriever that runs a similarity search over the embedded graph facts.

    NOTE(review): recent LangChain versions expect subclasses to implement
    `_get_relevant_documents`; overriding the public method relies on the
    library's legacy compatibility path - confirm against the installed version.
    """

    vectorstore: FAISS
    # Number of facts returned per query; 5 keeps the previous behaviour.
    k: int = 5

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the `k` stored facts most similar to `query`."""
        return self.vectorstore.similarity_search(query, k=self.k)

# Retriever backed by the FAISS index of graph facts.
retriever = GraphSemanticRetriever(vectorstore=vectorstore)

# --- 8. Constrained prompt to reduce hallucinations ---
# The model is told to answer from the retrieved context only.
qa_prompt = PromptTemplate(
    template=(
        "\n"
        "Answer the question using ONLY the context below.\n"
        "If the answer is not in the context, say \"I don't know.\"\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "Answer:\n"
    ),
    input_variables=["context", "question"],
)

# --- 9. Build the RAG QA chain ---
# chain_type="stuff" with the custom QA prompt.
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": qa_prompt},
    retriever=retriever,
    llm=llm,
)

# --- 10. Ask a question ---
question = "What did Einstein develop?"
answer = rag_chain.run(question)

# Blank line, then the question and the answer on consecutive lines.
print(f"\nQuestion: {question}\nAnswer: {answer}")


In [None]:
# --- 1. Imports ---
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema import Document, BaseRetriever
from typing import List
import networkx as nx
import os


# --- 3. Load and Split Document ---
# Note: this cell reads "my_doc.txt", not "my_doc2.txt" as the earlier cells do.
loader = TextLoader("my_doc.txt")
docs = loader.load()

# Chunks of size 1000 with an overlap of 100 between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = splitter.split_documents(docs)

# --- 4. Triple Extraction Prompt ---
# Asks the model for one "(Subject) --[Relation]--> (Object)" line per fact.
triple_prompt = PromptTemplate(
    template=(
        "\n"
        "Extract subject-predicate-object triples from the following text.\n"
        "Return in format: (Subject) --[Relation]--> (Object)\n"
        "\n"
        "Text:\n"
        "{text_chunk}\n"
    ),
    input_variables=["text_chunk"],
)

# Uses the `llm` created earlier in the notebook.
# (Alternative backend: llm = ChatOpenAI(temperature=0))
extractor_chain = LLMChain(prompt=triple_prompt, llm=llm)

# --- 5. Extract Triples from Chunks ---
# One raw (unsplit) LLM reply per chunk, kept in chunk order.
all_triples = [
    extractor_chain.run(text_chunk=chunk.page_content)
    for chunk in chunks
]

# --- 6. Build Knowledge Graph ---
import re


def _clean_entity(raw: str) -> str:
    """Normalise one side of a triple into a bare entity name.

    Removes a leading list marker ("- ", "* ", "1. "), sentence punctuation
    that follows a closing parenthesis, and one pair of wrapping parentheses.
    Parentheses inside the name (e.g. "Foo (bar)") are left untouched.
    """
    text = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", raw).strip()
    if text.endswith((").", "),", ");")):
        text = text[:-1]
    if text.startswith("(") and text.endswith(")"):
        text = text[1:-1].strip()
    return text


# Directed graph: subject -> object, with the relation stored as "label".
G = nx.DiGraph()

for response in all_triples:
    for line in response.splitlines():
        # Expected shape: "(Subject) --[Relation]--> (Object)". partition()
        # never raises, so no blanket try/except is needed; lines that do not
        # contain both markers in order are simply skipped.
        head, open_marker, remainder = line.partition('--[')
        pred, close_marker, tail = remainder.partition(']-->')
        if not (open_marker and close_marker):
            continue

        subj = _clean_entity(head)
        pred = pred.strip()
        obj = _clean_entity(tail)

        # Skip incomplete triples rather than adding empty-string nodes.
        if subj and pred and obj:
            G.add_edge(subj, obj, label=pred)

# --- 7. Custom GraphRetriever with Rich Sentences ---
class GraphRetriever(BaseRetriever):
    """Keyword-based retriever over a directed knowledge graph.

    A node is considered relevant when the whole query is a substring of its
    name (the original rule) or when the node name shares at least one
    non-trivial word with the query. Every edge touching a relevant node,
    outgoing or incoming, is returned once as a full sentence of the form
    "<subject> is related to <object> via '<relation>'."

    NOTE(review): recent LangChain versions expect subclasses to implement
    `_get_relevant_documents`; overriding the public method relies on the
    library's legacy compatibility path - confirm against the installed version.
    """

    graph: nx.DiGraph

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the graph facts related to `query`.

        Args:
            query: Free-text question or keyword.

        Returns:
            One Document per matching edge, in node order, without duplicates.
        """
        # Words too generic to identify an entity on their own.
        stopwords = {
            "a", "an", "the", "is", "are", "was", "were", "be", "do", "does", "did",
            "what", "why", "who", "whom", "when", "where", "which", "how",
            "of", "in", "on", "at", "to", "for", "by", "with", "and", "or",
        }
        punctuation = "?!.,;:\"'()[]"

        query_lc = query.lower()
        keywords = {word.strip(punctuation) for word in query_lc.split()}
        keywords -= stopwords
        keywords.discard("")

        def matches(node) -> bool:
            node_lc = node.lower()
            if query_lc in node_lc:  # original rule, kept for compatibility
                return True
            node_words = {word.strip(punctuation) for word in node_lc.split()}
            return bool(keywords & node_words)

        relevant_docs = []
        seen_edges = set()  # an edge can be reached from both of its endpoints
        for node in self.graph.nodes:
            if not matches(node):
                continue
            touching = list(self.graph.out_edges(node, data=True))
            touching += list(self.graph.in_edges(node, data=True))
            for subj, obj, data in touching:
                if (subj, obj) in seen_edges:
                    continue
                seen_edges.add((subj, obj))
                relation = data['label']
                # Rich sentence instead of raw triple
                sentence = f"{subj} is related to {obj} via '{relation}'."
                relevant_docs.append(Document(page_content=sentence))
        return relevant_docs

# Retriever backed by the knowledge graph built above.
retriever = GraphRetriever(graph=G)

# --- 8. Constrained Prompt to Force Use of Retrieved Facts ---
# The model is told to answer from the retrieved context only.
qa_prompt = PromptTemplate(
    template=(
        "\n"
        "You are a helpful assistant. Use ONLY the information below to answer the question.\n"
        "If the answer is not present, say \"I don't know.\"\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "Answer:\n"
    ),
    input_variables=["context", "question"],
)

# --- 9. RAG Chain Setup ---
# chain_type="stuff" with the custom QA prompt.
rag_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": qa_prompt},
    retriever=retriever,
    llm=llm,
)

# --- 10. Ask Questions ---
query = "Why student visas are retracted?"
answer = rag_chain.run(query)

# Blank line, then the question and the answer on consecutive lines.
print(f"\nQuestion: {query}\nAnswer: {answer}")
