In [None]:
import json
import os
from langchain.llms import Ollama
from langchain.text_splitter import CharacterTextSplitter
import openai
from neo4j import GraphDatabase
from langchain_openai import ChatOpenAI

In [None]:
uri = os.environ["NEO4J_URI"]
user=os.environ["NEO4J_USERNAME"]
password = os.environ["NEO4J_PASSWORD"]
OPENAI_API_KEY = os.getenv["OPENAI_API_KEY"] 

driver = GraphDatabase.driver(uri, auth=(user, password))
llm = ChatOpenAI(temperature=0, model_name="gpt-4o", api_key=OPENAI_API_KEY)


In [14]:
# Initialize OpenAI client (new method)
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def ask_openai(question, model="gpt-4o"):
    """Sends a question to OpenAI's API and returns the response."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": question}]
    )
    return response.choices[0].message.content

# Example Usage
question = "What is differential privacy?"
answer = ask_openai(question)
print("OpenAI Response:", answer)


OpenAI Response: Differential privacy is a concept and set of techniques designed to ensure that the privacy of individuals' data is protected when datasets are analyzed and shared. It provides a mathematical framework for quantifying and controlling the privacy risks associated with releasing information derived from data that includes personal information.

Here are the key aspects of differential privacy:

1. **Privacy Guarantee**: Differential privacy provides a guarantee that the removal or addition of a single individual's data from a dataset does not significantly affect the output of any analysis, thereby protecting the individual's privacy. This means that attackers cannot infer whether a particular individual's data was included in the dataset, even with access to all other information.

2. **Mathematical Definition**: The differential privacy definition involves a parameter typically denoted by ε (epsilon), which quantifies the level of privacy guarantee. Smaller values of ε

In [15]:
ollama = Ollama(base_url='http://localhost:11434', model="llama3.1:70b")

In [16]:
# system_prompt = """
# # Knowledge Graph Instructions for llama
# ## 1. Overview
# You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
# - **Nodes** represent entities and concepts.
# - **Relationships** represent the links between nodes.
# - Ensure numerical values (e.g., episode count, dates, measurements, or other quantifiable details) are captured and assigned appropriately.
# - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
# - Your goal is to provide **clear and structured JSON output** that captures all relevant relationships, numerical attributes, and entities explicitly.


# ## 2. Formatting the Output
# Your output must follow this strict JSON structure:
# json
# {
#   "nodes": [
#     {
#       "id": "unique_node_id",
#       "label": "nodetype",
#       "attributes": {
#         "key1": "value1",
#         "key2": "value2"
#       }
#     }
#   ],
#   "relationships": [
#     {
#       "source": "source_node_id",
#       "target": "target_node_id",
#       "type": "RELATIONSHIP_TYPE",
#       "attributes": {
#         "key1": "value1",
#         "key2": "value2"
#       }
#     }
#   ]
# }


# 3. Extraction Guidelines
# 3.1 Entity Nodes
# Extract all key entities such as people, places, events, works (e.g., TV shows, movies), and dates.
# Numerical data must always be associated with its corresponding entity as a property or attribute.
# Example: For "23 episodes", create a node for the entity Chicago Fire Season 4 and attach the episodeCount: 23 property.
# 3.2 Relationship Nodes
# Establish connections (relationships) between entities. Relationships must have meaningful labels and attributes where applicable.
# Use ALL CAPS for relationship type names.
# If numerical data involves a relationship (e.g., "contained 23 episodes"), encode it as an attribute in the relationship.
# 3.3 Numerical Data and Dates
# Always extract numerical data (e.g., "23", "May 17, 2016") and attach it as an attribute to the appropriate node or relationship.
# DO NOT create separate nodes for numerical values. Instead, add them as properties.
# 3.4 Coreference Resolution
# Maintain entity consistency throughout the graph. For example, "Chicago Fire Season 4" must remain consistent across all relationships and nodes.

# ## 4. Example Output. Following is just sample exmaple. DO NOT take it as it is.
# Input Text:
# The fourth season of Chicago Fire, an American drama television series with executive producer Dick Wolf, and producers Derek Haas, Michael Brandt, and Matt Olmstead, was ordered on February 5, 2015, by NBC, and premiered on October 13, 2015, and concluded on May 17, 2016. The season contained 23 episodes.

# Output:
# {
#   "nodes": [
#     {
#       "id": "chicago_fire_season_4",
#       "label": "television_series",
#       "attributes": {
#         "name": "chicago fire season 4",
#         "episodeCount": 23,
#         "premiereDate": "2015-10-13",
#         "endDate": "2016-05-17",
#         "documentId": "doc10"
#       }
#     },
#     {
#       "id": "elvis_presley",
#       "label": "person",
#       "attributes": {
#         "name": "elvis presley",
#         "occupation": "singer",
#         "documentId": "doc1"
#       }
#     },
#     {
#       "id": "nbc",
#       "label": "network",
#       "attributes": {
#         "name": "nbc",
#         "documentId": "doc12"
#       }
#     },
#     {
#       "id": "dick_wolf",
#       "label": "person",
#       "attributes": {
#         "name": "dick wolf",
#         "role": "executive producer",
#         "documentId": "doc0"
#       }
#     },
#     {
#       "id": "derek_haas",
#       "label": "person",
#       "attributes": {
#         "name": "derek haas",
#         "role": "producer",
#         "documentId": "doc8"
#       }
#     }
#   ],
#   "relationships": [
#     {
#       "source": "nbc",
#       "target": "chicago_fire_season_4",
#       "type": "BROADCASTED_BY",
#       "attributes": {"documentId": "doc10"}
#     },
#     {
#       "source": "dick_wolf",
#       "target": "chicago_fire_season_4",
#       "type": "PRODUCED",
#       "attributes": {
#         "role": "executive producer",
#         "documentId": "doc11"
#       }
#     },
#     {
#       "source": "chicago_fire_season_4",
#       "target": "23",
#       "type": "CONTAINS_EPISODES",
#       "attributes": {
#         "count": 23,
#         "documentId": "doc4"
#       }
#     }
#   ]
# }

# ## 5. Do not give any other explaination to output. for exmaple: Here is the output in JSON format:
# Strictly follow required output format.

# ## 6. Do not impute missing values. If found nothing do not return anything
        
# ## 7. Coreference Resolution
# - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
# If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
# always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
# Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 

# ## 8. - **Property Format**: Properties must be in a key-value format.
# - **Quotation Marks**: Never use escaped single or double quotes within property values.
# - **Naming Convention**: Use camelCase for property keys, e.g., birthDate.
# - Do not add any spaces to keys names.

# ## 9. Labeling Nodes
# - **Consistency**: Ensure you use basic or elementary types for node labels.
#   - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".

# ## 10. Strict Compliance
# Adhere to the rules strictly. Non-compliance will result in termination. 

# ## 11. Key Instructions
# Numerical Values: Always include numerical information (like episode counts, years, or measurements) as node or relationship attributes.
# Consistency: Maintain uniform entity IDs and relationship types.
# Strict Formatting: Any deviation from the specified format will be treated as an error.
# Precision: Only extract information explicitly present in the text. Avoid assumptions.

# ## 12. In json while storing relationship use 'source' and 'target' instead of 'from' and 'to'.

# ## 13. Relationship type should be in capitals for example: 'CONTAINS_EPISODES', 'BROADCASTED_BY'.

# ## 14. Coreference Resolution and Entity Normalization
# - **Normalize Entities**: Ensure that all entities are consolidated to their singular or canonical forms. For example:
#   - "point" and "points" should be normalized to "point".
#   - Use lemmatization or a similar linguistic technique to achieve normalization.
# - **Avoid Duplicates**: Before adding a new entity, check if a similar entity (by name or attributes) already exists and use the same if appropriate.
# - **Consistency**: All node labels should follow the same naming convention (e.g., singular form).

# ## 15. Labeling Nodes: Vey Important
# - **Consistency**: Use **lowercase** for node labels (e.g., "person", "song") and properties.
#   - Convert all properties and labels to lowercase for uniformity.
# - **Relationship Types**: Always use **uppercase** for relationship types.

# ## 16. Keep All attribute values in lower case only. Striclty follow this.

# ## 17. Use Suggested Labels
# - The following label list will be provided as input: {labels_list}. Always attempt to use these labels before creating new ones.
# - New labels should only be introduced if absolutely necessary and must be in lowercase.

# ## 18. Only extract from following context.
# context is: 

# """

In [17]:
system_prompt = """
# Knowledge Graph Extraction Instructions

## Role
You are an advanced information extraction agent specialized in generating clean, structured knowledge graphs from natural language. Your output helps build entity-relation-based systems.

## Objective
Extract **entities as nodes** and **connections as relationships** from the given context. Numerical values and dates must be assigned as attributes, not as separate nodes. The graph must be:
- **Simple and human-readable**
- **Normalized** (avoid duplication, plural/singular mismatch)
- **Strictly JSON-compliant** (no comments or markdown)

## Output Format (JSON)
Strictly follow this structure:
{
  "nodes": [
    {
      "id": "unique_node_id",
      "label": "nodetype",
      "attributes": {
        "key1": "value1",
        "key2": "value2"
      }
    }
  ],
  "relationships": [
    {
      "source": "source_node_id",
      "target": "target_node_id",
      "type": "RELATIONSHIP_TYPE",
      "attributes": {
        "key1": "value1",
        "key2": "value2"
      }
    }
  ]
}

## Key Rules for Extraction

### 1. Nodes
- Represent people, places, events, organizations, works (e.g., songs, shows), concepts, etc.
- Must include key descriptive attributes (e.g., name, title, role, episodeCount).
- Include `"documentId": "docX"` inside `attributes` for every node.
- Labels must be lowercase and generic (e.g., `person`, `organization`, not `musician` or `tech_company`).
- Normalize entity names: e.g., “points” → “point”.

### 2. Relationships
- Represent meaningful connections (e.g., PRODUCED, BROADCASTED_BY).
- Use ALL CAPS for the `"type"` value.
- Attach attributes if available (e.g., role, percentage).
- Include `"documentId": "docX"` inside `attributes` for every relationship.

### 3. Attributes
- Must be key-value pairs using **camelCase** keys and **lowercase** string values.
- Avoid escaped characters and markdown syntax.
- Do not create properties like `"value": "23"` — use a clear descriptive key: `"episodeCount": 23`.

### 4. Dates and Numbers
- Store as attributes (e.g., `premiereDate: "2015-10-13"`, `ownershipPercentage: "less than 50"`).
- Never create separate nodes for numbers or dates.

### 5. Consistency & Coreference
- Use consistent IDs (e.g., "Chicago Fire Season 4" → `"chicago_fire_season_4"`).
- Resolve pronouns and aliases to full identifiers (e.g., "he" → "John Doe").

### 6. Strict Compliance
- DO NOT return explanations, markdown, or extra text — only valid JSON.
- If nothing relevant is found, return an empty graph:
  {
    "nodes": [],
    "relationships": []
  }

### 7. Label Suggestion
- Labels for node must be chosen from this predefined set of possible labels:
  { "person", "organization", "event", "television_series", "song","location", "work", "concept","other"}


### 8. Example Output (DO NOT mimic directly — just for structure)

Input Text:
The fourth season of Chicago Fire, an American drama television series with executive producer Dick Wolf, and producers Derek Haas, Michael Brandt, and Matt Olmstead, was ordered on February 5, 2015, by NBC, and premiered on October 13, 2015, and concluded on May 17, 2016. The season contained 23 episodes.

Output:
{
  "nodes": [
    {
      "id": "chicago_fire_season_4",
      "label": "television_series",
      "attributes": {
        "name": "chicago fire season 4",
        "episodeCount": 23,
        "premiereDate": "2015-10-13",
        "endDate": "2016-05-17",
        "documentId": "doc10"
      }
    },
    {
      "id": "dick_wolf",
      "label": "person",
      "attributes": {
        "name": "dick wolf",
        "role": "executive producer",
        "documentId": "doc10"
      }
    }
  ],
  "relationships": [
    {
      "source": "nbc",
      "target": "chicago_fire_season_4",
      "type": "BROADCASTED_BY",
      "attributes": {
        "documentId": "doc10"
      }
    }
  ]
}

## Final Reminder
- Output must be valid JSON, no markdown.
- Document-specific metadata (`documentId`) must be attached to **every node and relationship** as an attribute.
- Follow formatting, naming, and structural rules precisely.
- If no extractable info is found, return an empty graph.

## Context to Analyze:
"""


In [18]:
# Function to read text from a file
def read_text_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def split_text(input_text, chunk_size=1000, overlap=100):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        length_function=len,
    )
    return text_splitter.split_text(input_text)

In [19]:
def extract_knowledge_graph(input_text_chunk, labels_list):
    #updated_prompt = system_prompt.replace("{labels_list}", json.dumps(labels_list))
    #print(f"Labels List:\n{labels_list}\n")
    prompt = f"{system_prompt}\n\nInput Text:\n{input_text_chunk}"
    response = ask_openai(prompt)
    return response

In [20]:
def process_single_document(doc, combined_graph, labels_list, failed_chunks_file, max_retries=3):
    doc_id = doc["_id"]
    text = doc["text"]
    retries = 0

    while retries < max_retries:
        try:
            response = extract_knowledge_graph(text, list(labels_list))
            extracted_graph = json.loads(response)

            for node in extracted_graph.get("nodes", []):
                node.setdefault("attributes", {})
                node["attributes"]["documentId"] = doc_id
                if "label" in node:
                    labels_list.add(node["label"])
                if node not in combined_graph["nodes"]:
                    combined_graph["nodes"].append(node)

            for rel in extracted_graph.get("relationships", []):
                rel.setdefault("attributes", {})
                rel["attributes"]["documentId"] = doc_id
                if rel not in combined_graph["relationships"]:
                    combined_graph["relationships"].append(rel)

            break  # success
        except json.JSONDecodeError as e:
            retries += 1
            print(f"[ERROR] JSONDecodeError on doc {doc_id}, retry {retries}/{max_retries}: {e}")
            if retries >= max_retries:
                with open(failed_chunks_file, "r+", encoding="utf-8") as f:
                    failed_responses = json.load(f)
                    failed_responses.append({
                        "doc_id": doc_id,
                        "text": text,
                        "response": response
                    })
                    f.seek(0)
                    json.dump(failed_responses, f, indent=4)


In [21]:
def process_jsonl_file(input_path, output_path, failed_chunks_file):
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            combined_graph = json.load(f)
    else:
        combined_graph = {"nodes": [], "relationships": []}

    if not os.path.exists(failed_chunks_file):
        with open(failed_chunks_file, "w", encoding="utf-8") as f:
            json.dump([], f, indent=4)

    labels_list = {node["label"] for node in combined_graph["nodes"] if "label" in node}

    with open(input_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            try:
                doc = json.loads(line)
                print(f"\nProcessing document {line_num}: {doc.get('_id')}")
                process_single_document(doc, combined_graph, labels_list, failed_chunks_file)
                with open(output_path, "w", encoding="utf-8") as out_f:
                    json.dump(combined_graph, out_f, indent=4)
            except Exception as e:
                print(f"[ERROR] Failed to process document {line_num}: {e}")

    print(f"\nCompleted. Graph saved at {output_path}.")

In [22]:
# File paths
input_file_path = "/home/sbhavsar/PoisonedRAG/datasets/nq/combined_by_title_from after_seminar.jsonl"
output_file_path = "/home/sbhavsar/PoisonedRAG/after_seminar_small_kg/jsons/23_04_2025_knowledge_graph_new_sys.json"
failed_chunks_file_path = "/home/sbhavsar/PoisonedRAG/after_seminar_small_kg/jsons/23_04_2025_failed_chunks_new_sys.json"

# Run the processor
process_jsonl_file(input_file_path, output_file_path, failed_chunks_file_path)


Processing document 1: merged_doc0



Processing document 2: merged_doc1

Processing document 3: merged_doc2

Processing document 4: merged_doc3
[ERROR] JSONDecodeError on doc merged_doc3, retry 1/3: Expecting value: line 1 column 1 (char 0)

Processing document 5: merged_doc4

Processing document 6: merged_doc5

Processing document 7: merged_doc6

Processing document 8: merged_doc7

Processing document 9: merged_doc8

Processing document 10: merged_doc9
[ERROR] JSONDecodeError on doc merged_doc9, retry 1/3: Expecting value: line 1 column 1 (char 0)
[ERROR] JSONDecodeError on doc merged_doc9, retry 2/3: Expecting value: line 1 column 1 (char 0)
[ERROR] JSONDecodeError on doc merged_doc9, retry 3/3: Expecting value: line 1 column 1 (char 0)

Processing document 11: merged_doc10

Processing document 12: merged_doc11
[ERROR] JSONDecodeError on doc merged_doc11, retry 1/3: Expecting value: line 1 column 1 (char 0)
[ERROR] JSONDecodeError on doc merged_doc11, retry 2/3: Expecting value: line 1 column 1 (char 0)
[ERROR] JSONDec