In [1]:
import os

In [2]:
%pwd

'd:\\infosys\\notebook'

In [3]:
os.chdir("../")
%pwd

'd:\\infosys'

In [4]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class DataTransformationConfig:
    input_json: Path
    entities_output: Path
    relationships_output: Path
    triples_output: Path
    neo4j_uri: str
    neo4j_username: str
    neo4j_password: str

In [5]:
from src.knowledge_graph.utils.common import read_yaml
from src.knowledge_graph.constants import *


In [None]:
class ConfigurationManager:
    def __init__(self, config_path=CONFIG_FILE_PATH):
            self.config = read_yaml(config_path)

    def get_transform_data_config(self) -> DataTransformationConfig:
        config = self.config.transform_data

        return DataTransformationConfig(
            input_json=config.input_json,
            entities_output=config.entities_output,
            relationships_output=config.relationships_output,
            triples_output=config.triples_output,
            neo4j_uri=config.neo4j.uri,
            neo4j_username=config.neo4j.username,
            neo4j_password=config.neo4j.password,
        )


In [None]:
import spacy
from neo4j import GraphDatabase
from src.knowledge_graph.utils.common import read_json, write_json
from src.knowledge_graph.logger.logging import logger
import re
import sys

class DataTransformation:

    def __init__(self, config):
        self.config = config
        self.docs = read_json(config.input_json)
        self.nlp = spacy.load("en_core_web_sm")
        
        # Storage
        self.entity_map = {} 
        self.triples = []

    def clean_text(self, text):
        return text.strip().lower().replace('"', '').replace("'", "")

    def clean_relation(self, text):
        clean = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return clean.strip().replace(" ", "_").upper()

    # 1️⃣ PROCESS TEXT (Matches Cypher Keys)
    def process_text(self):
        logger.info("Starting NLP Processing...")
        
        all_relationships = []
        
        for doc in self.docs:
            spacy_doc = self.nlp(doc["text"])
            
            # --- Map spans to entities ---
            span_to_entity = {} 
            for ent in spacy_doc.ents:
                clean_key = f"{self.clean_text(ent.text)}_{ent.label_}"
                
                # Create Dictionary with keys matching Cypher (id, name, label)
                if clean_key not in self.entity_map:
                    self.entity_map[clean_key] = {
                        "id": clean_key,           # FIXED: Matches row.id
                        "name": ent.text.strip(),  # FIXED: Matches row.name
                        "label": ent.label_,       # FIXED: Matches row.label
                        "doc_id": doc.get("id")
                    }
                
                for token in ent:
                    span_to_entity[token.i] = self.entity_map[clean_key]

            # --- Extract Relations ---
            for token in spacy_doc:
                if token.pos_ == "VERB":
                    subj, obj = None, None
                    for child in token.children:
                        if "subj" in child.dep_: subj = child
                        if "obj" in child.dep_: obj = child
                    
                    if subj and obj:
                        subj_ent = span_to_entity.get(subj.i)
                        obj_ent = span_to_entity.get(obj.i)
                        
                        # Ensure both are entities and distinct
                        if subj_ent and obj_ent and subj_ent["id"] != obj_ent["id"]:
                            
                            clean_predicate = self.clean_relation(token.lemma_)
                            
                            # 1. Triples for Graph
                            self.triples.append({
                                "head_id": subj_ent["id"],
                                "relation": clean_predicate,
                                "tail_id": obj_ent["id"],
                                "doc_id": doc.get("id")
                            })

                            # 2. Detailed Relationships for JSON
                            rel_id = f"{subj_ent['id']}|{clean_predicate}|{obj_ent['id']}"
                            all_relationships.append({
                                "relation_id": rel_id,
                                "subject": subj_ent["name"],
                                "predicate": clean_predicate,
                                "object": obj_ent["name"],
                                "document_id": doc.get("id")
                            })

        # Save Files
        write_json(self.config.entities_output, list(self.entity_map.values()))
        write_json(self.config.relationships_output, all_relationships)
        write_json(self.config.triples_output, self.triples)

        logger.info(f"Processed: {len(self.entity_map)} Entities, {len(self.triples)} Triples")

    # 2️⃣ BUILD GRAPH 
    def build_graph(self):
        logger.info("Starting Graph Ingestion (Standard Cypher)...")
        neo = self.config
        driver = GraphDatabase.driver(
            neo.neo4j_uri, 
            auth=(neo.neo4j_username, neo.neo4j_password)
        )

        with driver.session() as session:
            
            # A. Create Uniqueness Constraint
            try:
                session.run("CREATE CONSTRAINT FOR (n:Entity) REQUIRE n.id IS UNIQUE")
            except Exception:
                pass 

            # B. Batch Insert Entities
            logger.info("Inserting Entities...")
            
            # Query Logic:
            # 1. Create node with generic :Entity label
            # 2. Set 'type' property (since we can't set dynamic labels without APOC)
            entity_query = """
            UNWIND $batch AS row
            MERGE (e:Entity {id: row.id})
            SET e.name = row.name, 
                e.type = row.label
            """
            
            batch_size = 1000
            entities_list = list(self.entity_map.values())
            
            for i in range(0, len(entities_list), batch_size):
                batch = entities_list[i:i+batch_size]
                session.run(entity_query, batch=batch)

            # C. Batch Insert Relationships
            logger.info("Inserting Relationships...")
            
            # Group by relationship type
            triples_by_type = {}
            for t in self.triples:
                r_type = t["relation"]
                if r_type not in triples_by_type: triples_by_type[r_type] = []
                triples_by_type[r_type].append(t)

            for rel_type, batch_data in triples_by_type.items():
                
                # Safe F-String because rel_type is sanitized (A-Z only)
                rel_query = f"""
                UNWIND $batch AS row
                MATCH (h:Entity {{id: row.head_id}})
                MATCH (t:Entity {{id: row.tail_id}})
                MERGE (h)-[:{rel_type}]->(t)
                """
                
                for i in range(0, len(batch_data), batch_size):
                    sub_batch = batch_data[i:i+batch_size]
                    session.run(rel_query, batch=sub_batch)

        logger.info("Graph Successfully Loaded")

In [None]:
config = ConfigurationManager()
config = config.get_transform_data_config()
obj = DataTransformation(config)
obj.extract_entities()
obj.extract_relationships()
obj.create_triples()
obj.build_graph()


[2025-12-30 12:01:35,626: INFO: common: YAML file: config\config.yaml loaded successfully]
[2025-12-30 12:01:36,379: INFO: 1510705564: Entity Extraction Initialised]
[2025-12-30 12:01:56,351: INFO: 1510705564: Entity extraction completed]
[2025-12-30 12:01:56,352: INFO: 1510705564: Relationship Extraction Initialised]
[2025-12-30 12:02:12,886: INFO: 1510705564: Relationship extraction completed]
[2025-12-30 12:02:12,888: INFO: 1510705564: Triple Creation Initialised]
[2025-12-30 12:02:12,900: INFO: 1510705564: Triple creation completed]
[2025-12-30 12:02:12,902: INFO: 1510705564: Graph Building Intialised]
[2025-12-30 12:02:22,838: INFO: 1510705564: Graph construction completed]
