In [5]:
# 25 Feb Main Code
import json
import logging
import re
import time
from typing import Any, Dict, List, Optional, Tuple

import google.generativeai as genai
import numpy as np
import pandas as pd
from neo4j import GraphDatabase

# Configure the root logger once at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function and method in this file.
logger = logging.getLogger(__name__)

class GraphRAGSystem:
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str, gemini_api_key: str):
        """Initialize the Graph RAG system with necessary connections"""
        self.setup_database(neo4j_uri, neo4j_user, neo4j_password)
        self.setup_llm(gemini_api_key)
        self.last_api_call = 0
        self.min_delay = 2.0  # Minimum delay between API calls
        self.knowledge_base = {}  # Store extracted knowledge

    def setup_database(self, uri: str, user: str, password: str) -> None:
        """Setup the Neo4j connection and verify that the server is reachable.

        Args:
            uri: Bolt URI of the Neo4j server.
            user: Neo4j user name.
            password: Neo4j password.

        Raises:
            Exception: Whatever the driver raises when the server cannot be
                reached or rejects the credentials; the driver is closed
                before the error propagates.
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        try:
            # Building the driver object alone does not prove the server is
            # reachable, so check before reporting success.
            self.driver.verify_connectivity()
        except Exception:
            self.driver.close()
            raise
        logger.info("✅ Connected to Neo4j database")

    def setup_llm(self, api_key: str) -> None:
        """Configure the Gemini client, build the model and open a chat session.

        Stores the model on `self.model` and a chat session with empty
        history on `self.chat`.
        """
        genai.configure(api_key=api_key)

        generation_settings = {
            "temperature": 0.7,
            "max_output_tokens": 800,
        }
        gemini_model = genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config=generation_settings,
        )

        self.model = gemini_model
        self.chat = gemini_model.start_chat(history=[])
        logger.info("✅ Initialized Gemini LLM")

    def load_datasets(self, ecommerce_path: str, chat_logs_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load both datasets and perform initial preprocessing"""
        try:
            # Load e-commerce data
            ecommerce_df = pd.read_csv(ecommerce_path)
            logger.info(f"Loaded e-commerce data: {ecommerce_df.shape}")

            # Load chat logs
            chat_logs_df = pd.read_csv(chat_logs_path)
            logger.info(f"Loaded chat logs: {chat_logs_df.shape}")

            # Store original dataframes for later use
            self.ecommerce_df = ecommerce_df
            self.chat_logs_df = chat_logs_df
            
            # Convert data to knowledge graph
            self._convert_to_knowledge_graph(ecommerce_df, chat_logs_df)

            return ecommerce_df, chat_logs_df

        except Exception as e:
            logger.error(f"Error loading datasets: {str(e)}")
            raise

    def _convert_to_knowledge_graph(self, ecommerce_df: pd.DataFrame, chat_logs_df: pd.DataFrame) -> None:
        """Convert data directly to knowledge graph without predefined structure"""
        try:
            # First, clear existing database
            self._clear_database()
            
            # Create basic constraints and indices for performance
            self._create_basic_indices()
            
            # Extract entities and relationships using LLM
            self._discover_entities_relationships(ecommerce_df, chat_logs_df)
            
            # Create additional relationships and insights
            self._enrich_knowledge_graph()
            
            logger.info("✅ Successfully converted data to knowledge graph")
            
        except Exception as e:
            logger.error(f"Error converting to knowledge graph: {str(e)}")
            raise

    def _clear_database(self) -> None:
        """Clear existing graph database"""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            logger.info("Cleared existing database")

    def _create_basic_indices(self) -> None:
        """Create basic indices without restricting node types"""
        with self.driver.session() as session:
            # Create some basic indices for common properties
            indices = [
                "CREATE INDEX entity_id_index IF NOT EXISTS FOR (n) ON (n.id)",
                "CREATE INDEX entity_name_index IF NOT EXISTS FOR (n) ON (n.name)",
                "CREATE INDEX entity_type_index IF NOT EXISTS FOR (n) ON (n.entity_type)",
                "CREATE INDEX relationship_type_index IF NOT EXISTS FOR ()-[r]-() ON TYPE(r)"
            ]
            
            for idx in indices:
                try:
                    session.run(idx)
                except Exception as e:
                    logger.warning(f"Index creation warning: {str(e)}")
            
            logger.info("Created basic indices")

    def _discover_entities_relationships(self, ecommerce_df: pd.DataFrame, chat_logs_df: pd.DataFrame) -> None:
        """Use LLM to discover entities and relationships in the data"""
        # First, analyze the dataframes to understand their structure
        ecommerce_columns = ecommerce_df.columns.tolist()
        chat_columns = chat_logs_df.columns.tolist()
        
        # Get sample data
        ecommerce_sample = ecommerce_df.head(5).to_dict('records')
        chat_sample = chat_logs_df.head(5).to_dict('records')
        
        # Use LLM to identify entities and relationships
        schema_prompt = f"""
        Analyze these datasets to identify entities and relationships for a knowledge graph:
        
        E-commerce Dataset Columns: {ecommerce_columns}
        E-commerce Sample Data: {ecommerce_sample}
        
        Chat Logs Dataset Columns: {chat_columns}
        Chat Logs Sample Data: {chat_sample}
        
        Please identify:
        1. All potential entity types (nodes) in this data
        2. Key properties for each entity type
        3. Potential relationships between entities
        4. Complex patterns or insights that might emerge from analyzing this data
        
        Return your analysis as a JSON object with:
        - "entity_types": a list of entity types and their key identifying properties
        - "relationships": potential relationships between entities
        - "insights": complex patterns or insights that could be extracted
        """
        
        try:
            schema_response = self._safe_llm_call(schema_prompt)
            schema_analysis = json.loads(schema_response)
            
            # Extract entity types and relationships from LLM response
            entity_types = schema_analysis.get('entity_types', [])
            relationship_types = schema_analysis.get('relationships', [])
            
            # Process e-commerce data to create entities
            self._process_ecommerce_data(ecommerce_df, entity_types)
            
            # Process chat data to create entities
            self._process_chat_data(chat_logs_df, entity_types)
            
            # Create the discovered relationships
            self._create_discovered_relationships(ecommerce_df, chat_logs_df, relationship_types)
            
            logger.info(f"Created {len(entity_types)} entity types and {len(relationship_types)} relationship types")
            
        except json.JSONDecodeError:
            logger.error("Failed to parse LLM schema analysis as JSON")
            # Fallback: Use a simpler approach with predefined entities
            self._fallback_schema_creation(ecommerce_df, chat_logs_df)
            
        except Exception as e:
            logger.error(f"Error discovering entities and relationships: {str(e)}")
            self._fallback_schema_creation(ecommerce_df, chat_logs_df)

    def _process_ecommerce_data(self, df: pd.DataFrame, entity_types: List[Dict]) -> None:
        """Process e-commerce data to create entity nodes based on discovered types"""
        with self.driver.session() as session:
            # Process each entity type that can be found in the e-commerce data
            for entity_type in entity_types:
                type_name = entity_type.get('name')
                id_property = entity_type.get('id_property')
                properties = entity_type.get('properties', [])
                
                if not type_name or not id_property:
                    continue
                
                # Check if the id_property exists in the dataframe
                if id_property not in df.columns:
                    continue
                
                # Find valid properties that exist in dataframe
                valid_properties = [p for p in properties if p in df.columns]
                
                # Group by the ID property to create unique entities
                unique_entities = df.groupby(id_property).first().reset_index()
                
                # Create entities in batches to avoid memory issues
                batch_size = 1000
                for i in range(0, len(unique_entities), batch_size):
                    batch = unique_entities.iloc[i:i+batch_size]
                    
                    # Create a parameter list for batch processing
                    params_list = []
                    for _, row in batch.iterrows():
                        entity_id = row[id_property]
                        # Create properties dict with non-null values
                        props = {
                            'id': str(entity_id),
                            'entity_type': type_name
                        }
                        
                        for prop in valid_properties:
                            if prop in row and not pd.isna(row[prop]):
                                props[prop] = str(row[prop])
                        
                        params_list.append(props)
                    
                    # Create entities in batch
                    if params_list:
                        query = f"""
                        UNWIND $params AS param
                        MERGE (n:{type_name} {{id: param.id}})
                        SET n += param
                        """
                        session.run(query, params=params_list)
                
                logger.info(f"Created {len(unique_entities)} {type_name} entities")

    def _process_chat_data(self, df: pd.DataFrame, entity_types: List[Dict]) -> None:
        """Process chat data to create entity nodes based on discovered types"""
        with self.driver.session() as session:
            # Process each entity type that can be found in the chat data
            for entity_type in entity_types:
                type_name = entity_type.get('name')
                id_property = entity_type.get('id_property')
                properties = entity_type.get('properties', [])
                message_property = entity_type.get('message_property')
                
                # Skip entity types that don't use message data or lack required properties
                if not type_name or not message_property or message_property not in df.columns:
                    continue
                
                # For chat data, we need to extract entities from messages
                # Use LLM to detect entities in the messages
                
                # Sample messages for entity extraction
                sample_messages = df[message_property].sample(min(100, len(df))).tolist()
                
                entity_prompt = f"""
                Analyze these customer service messages to identify {type_name} entities:
                
                Messages: {sample_messages}
                
                Extract all instances of {type_name} entities mentioned in these messages.
                For each entity, identify key properties: {properties}
                
                Return as a JSON list of entities with their properties.
                """
                
                try:
                    entity_response = self._safe_llm_call(entity_prompt)
                    detected_entities = json.loads(entity_response)
                    
                    # Create the detected entities
                    if detected_entities and isinstance(detected_entities, list):
                        # Prepare batch parameters
                        params_list = []
                        for entity in detected_entities:
                            # Ensure the entity has a unique ID
                            if id_property not in entity:
                                continue
                            
                            entity_id = entity[id_property]
                            entity['id'] = str(entity_id)
                            entity['entity_type'] = type_name
                            params_list.append(entity)
                        
                        # Create entities in batch
                        if params_list:
                            query = f"""
                            UNWIND $params AS param
                            MERGE (n:{type_name} {{id: param.id}})
                            SET n += param
                            """
                            session.run(query, params=params_list)
                            
                        logger.info(f"Created {len(params_list)} {type_name} entities from chat data")
                        
                except json.JSONDecodeError:
                    logger.warning(f"Failed to parse entity extraction for {type_name} as JSON")
                except Exception as e:
                    logger.warning(f"Error processing chat data for {type_name}: {str(e)}")

    def _create_discovered_relationships(self, ecommerce_df: pd.DataFrame, chat_df: pd.DataFrame, 
                                         relationship_types: List[Dict]) -> None:
        """Create relationships based on discovered patterns"""
        with self.driver.session() as session:
            # For each relationship type discovered by the LLM
            for rel_type in relationship_types:
                source_type = rel_type.get('source_type')
                target_type = rel_type.get('target_type')
                relationship = rel_type.get('relationship')
                source_id = rel_type.get('source_id')
                target_id = rel_type.get('target_id')
                dataset = rel_type.get('dataset', 'ecommerce')
                
                if not all([source_type, target_type, relationship, source_id, target_id]):
                    continue
                
                # Select the appropriate dataset
                df = ecommerce_df if dataset == 'ecommerce' else chat_df
                
                # Check if required columns exist
                if source_id not in df.columns or target_id not in df.columns:
                    continue
                
                # For related entities in the same dataset
                try:
                    # Extract unique relationship pairs
                    rel_data = df[[source_id, target_id]].drop_duplicates()
                    rel_data = rel_data.dropna()
                    
                    # Create relationships in batches
                    batch_size = 1000
                    for i in range(0, len(rel_data), batch_size):
                        batch = rel_data.iloc[i:i+batch_size]
                        
                        # Prepare relationship batch parameters
                        params_list = []
                        for _, row in batch.iterrows():
                            source_value = str(row[source_id])
                            target_value = str(row[target_id])
                            params_list.append({
                                'source_id': source_value, 
                                'target_id': target_value,
                            })
                        
                        # Create relationships in batch
                        if params_list:
                            query = f"""
                            UNWIND $params AS param
                            MATCH (a:{source_type} {{id: param.source_id}})
                            MATCH (b:{target_type} {{id: param.target_id}})
                            MERGE (a)-[r:{relationship}]->(b)
                            """
                            session.run(query, params=params_list)
                    
                    logger.info(f"Created {len(rel_data)} {relationship} relationships from {source_type} to {target_type}")
                
                except Exception as e:
                    logger.warning(f"Error creating {relationship} relationships: {str(e)}")

    def _enrich_knowledge_graph(self) -> None:
        """Analyze the graph to discover additional relationships and insights"""
        try:
            # Get summary of current graph
            with self.driver.session() as session:
                # Count nodes by label
                node_count = session.run("""
                    CALL db.labels() YIELD label
                    MATCH (n:`$label`)
                    RETURN label, count(n) AS count
                    ORDER BY count DESC
                """).data()
                
                # Count relationships by type
                rel_count = session.run("""
                    MATCH ()-[r]->()
                    RETURN type(r) AS type, count(r) AS count
                    ORDER BY count DESC
                """).data()
            
            # Use LLM to identify additional insights
            enrich_prompt = f"""
            Analyze this knowledge graph structure to identify additional relationships and insights:
            
            Node counts by type: {node_count}
            Relationship counts by type: {rel_count}
            
            Suggest:
            1. Additional relationship types that could be inferred from existing ones
            2. Potential property patterns that could be used to create new relationships
            3. Complex graph patterns that could reveal insights (like product co-purchasing)
            
            Return as a JSON object with:
            - "new_relationships": a list of Cypher queries to create new relationship types
            - "property_patterns": a list of Cypher queries to set new properties
            - "complex_patterns": a list of Cypher queries to detect and create complex patterns
            """
            
            enrich_response = self._safe_llm_call(enrich_prompt)
            
            try:
                enrichment = json.loads(enrich_response)
                
                with self.driver.session() as session:
                    # Execute the suggested queries to create new relationships
                    for query in enrichment.get('new_relationships', []):
                        try:
                            session.run(query)
                        except Exception as e:
                            logger.warning(f"Error executing enrichment query: {str(e)}")
                    
                    # Set new properties based on patterns
                    for query in enrichment.get('property_patterns', []):
                        try:
                            session.run(query)
                        except Exception as e:
                            logger.warning(f"Error setting properties: {str(e)}")
                    
                    # Create complex pattern-based relationships
                    for query in enrichment.get('complex_patterns', []):
                        try:
                            session.run(query)
                        except Exception as e:
                            logger.warning(f"Error creating complex patterns: {str(e)}")
                
                logger.info("✅ Enriched knowledge graph with additional relationships and patterns")
                
            except json.JSONDecodeError:
                logger.warning("Failed to parse enrichment suggestions as JSON")
                
        except Exception as e:
            logger.error(f"Error enriching knowledge graph: {str(e)}")

    def _fallback_schema_creation(self, ecommerce_df: pd.DataFrame, chat_logs_df: pd.DataFrame) -> None:
        """Create a basic schema if the LLM-based discovery fails"""
        logger.info("Using fallback schema creation")
        
        with self.driver.session() as session:
            # Create basic Product nodes
            if 'SKU' in ecommerce_df.columns:
                for _, row in ecommerce_df.drop_duplicates('SKU').iterrows():
                    props = {'id': str(row['SKU']), 'entity_type': 'Product'}
                    for col in ecommerce_df.columns:
                        if col != 'SKU' and not pd.isna(row[col]):
                            props[col] = str(row[col])
                    
                    session.run("""
                        MERGE (p:Product {id: $id})
                        SET p += $props
                    """, id=props['id'], props=props)
            
            # Create basic Category nodes
            if 'Category' in ecommerce_df.columns:
                for category in ecommerce_df['Category'].dropna().unique():
                    session.run("""
                        MERGE (c:Category {id: $id, name: $name, entity_type: 'Category'})
                    """, id=str(category), name=str(category))
            
            # Create product-category relationships
            if 'SKU' in ecommerce_df.columns and 'Category' in ecommerce_df.columns:
                for _, row in ecommerce_df[['SKU', 'Category']].dropna().drop_duplicates().iterrows():
                    session.run("""
                        MATCH (p:Product {id: $sku})
                        MATCH (c:Category {id: $category})
                        MERGE (p)-[:BELONGS_TO]->(c)
                    """, sku=str(row['SKU']), category=str(row['Category']))
            
            # Extract entities from chat messages
            if 'message' in chat_logs_df.columns:
                sample_messages = chat_logs_df['message'].sample(min(50, len(chat_logs_df))).tolist()
                
                # Create basic CustomerIssue nodes
                issue_prompt = f"""
                Extract customer issues from these messages:
                {sample_messages}
                
                Return a list of unique customer issues in JSON format with the issue type and description.
                """
                
                try:
                    issues_response = self._safe_llm_call(issue_prompt)
                    issues = json.loads(issues_response)
                    
                    for i, issue in enumerate(issues):
                        issue_type = issue.get('type', f"Unknown{i}")
                        description = issue.get('description', '')
                        
                        session.run("""
                            CREATE (i:CustomerIssue {
                                id: $id,
                                type: $type,
                                description: $description,
                                entity_type: 'CustomerIssue'
                            })
                        """, id=str(issue_type), type=str(issue_type), description=str(description))
                except:
                    logger.warning("Failed to create customer issues from chat data")
            
            logger.info("Created basic fallback schema")

    def _safe_llm_call(self, prompt: str) -> str:
        """Make LLM API call with rate limiting"""
        current_time = time.time()
        time_since_last = current_time - self.last_api_call
        if time_since_last < self.min_delay:
            time.sleep(self.min_delay - time_since_last)
        
        try:
            response = self.chat.send_message(prompt).text
            self.last_api_call = time.time()
            return response
        except Exception as e:
            logger.error(f"Error in LLM call: {str(e)}")
            return "{}"  # Return empty JSON as fallback

    def process_query(self, query: str) -> Dict:
        """Process user query through Graph RAG pipeline"""
        try:
            # Step a: Use LLM to understand query and transform to graph query pattern
            cypher_query = self._query_to_cypher(query)
            logger.info(f"Generated Cypher query: {cypher_query}")
            
            # Step b: Execute query on knowledge graph
            graph_results = self._execute_knowledge_graph_query(cypher_query)
            logger.info(f"Found {len(graph_results)} results in knowledge graph")
            
            # Step c: Use LLM to analyze and enhance graph results with additional context
            enhanced_results = self._augment_results_with_llm(query, graph_results)
            
            return enhanced_results
            
        except Exception as e:
            logger.error(f"Error processing query: {str(e)}")
            return {"error": str(e), "query": query}

    def _query_to_cypher(self, query: str) -> str:
        """Convert natural language query to Cypher query using LLM"""
        try:
            # First, get schema information
            with self.driver.session() as session:
                # Get node labels
                labels = session.run("CALL db.labels() YIELD label RETURN collect(label) AS labels").single()["labels"]
                
                # Get relationship types
                rel_types = session.run("CALL db.relationshipTypes() YIELD relationshipType RETURN collect(relationshipType) AS types").single()["types"]
                
                # Get a sample of node properties for each label
                node_properties = {}
                for label in labels:
                    sample = session.run(f"MATCH (n:{label}) RETURN n LIMIT 1").data()
                    if sample:
                        props = sample[0]['n']
                        node_properties[label] = list(props.keys())
            
            # Use LLM to generate Cypher query
            cypher_prompt = f"""
            Convert this natural language query to a Cypher query for Neo4j:
            
            Query: "{query}"
            
            Available node labels: {labels}
            Available relationship types: {rel_types}
            Node properties by label: {node_properties}
            
            Return ONLY the Cypher query, no explanation, no markdown, no code blocks.
            The query should always include LIMIT to prevent excessive returns.
            """
            
            cypher_text = self._safe_llm_call(cypher_prompt)
            
            # Clean up the response - remove markdown code blocks if present
            cypher_text = re.sub(r'```(?:cypher)?\s*([\s\S]*?)\s*```', r'\1', cypher_text).strip()
            
            # Ensure the query has a LIMIT if RETURN is present
            if "RETURN" in cypher_text.upper() and "LIMIT" not in cypher_text.upper():
                cypher_text += " LIMIT 25"
                
            return cypher_text
            
        except Exception as e:
            logger.error(f"Error converting query to Cypher: {str(e)}")
            # Return a simple fallback query
            return "MATCH (n) RETURN n LIMIT 10"

    def _execute_knowledge_graph_query(self, cypher_query: str) -> List[Dict]:
        """Execute query on Neo4j knowledge graph"""
        try:
            # Clean any markdown or code blocks that might be in the query
            cleaned_query = re.sub(r'```(?:cypher)?\s*([\s\S]*?)\s*```', r'\1', cypher_query).strip()
            
            with self.driver.session() as session:
                result = session.run(cleaned_query)
                return [record.data() for record in result]
        except Exception as e:
            logger.error(f"Error executing knowledge graph query: {str(e)}")
            return [{"error": str(e)}]

    def _augment_results_with_llm(self, original_query: str, graph_results: List[Dict]) -> Dict:
        """Augment graph results with LLM analysis to produce final answer"""
        try:
            # Use LLM to analyze graph results and generate answer
            augment_prompt = f"""
            Original query: "{original_query}"
            
            Results from knowledge graph: {graph_results}
            
            Based on these results from the knowledge graph:
            1. Provide a direct answer to the original query
            2. Include relevant context and insights from the graph results
            3. If the results seem incomplete, explain what might be missing
            
            Return your response as JSON with:
            - "answer": A direct, comprehensive answer to the query
            - "insights": Additional insights or context from the graph data
            - "confidence": Your confidence in the answer (high, medium, low)
            """
            
            augment_response = self._safe_llm_call(augment_prompt)
            
            try:
                final_response = json.loads(augment_response)
            except json.JSONDecodeError:
                # Create a fallback response if JSON parsing fails
                final_response = {
                    "answer": f"Based on the knowledge graph, here's what I found: {graph_results[:5] if graph_results else 'No results found.'}",
                    "insights": "The knowledge graph provided the above information based on your query.",
                    "confidence": "medium"
                }
            
            # Include the original results and query
            final_response["query"] = original_query
            final_response["graph_results"] = graph_results
            
            return final_response
            
        except Exception as e:
            logger.error(f"Error augmenting results with LLM: {str(e)}")
            return {
                "answer": "Error processing results. Please try again.",
                "error": str(e),
                "query": original_query,
                "graph_results": graph_results
            }

# Main function to run the system
def run_graph_rag(neo4j_uri, neo4j_user, neo4j_password, gemini_api_key, ecommerce_path, chat_logs_path, queries,
                  query_delay=2.0):
    """Run the Graph RAG system end to end.

    Builds a `GraphRAGSystem`, loads both datasets into the knowledge graph
    and answers every query in turn.

    Args:
        neo4j_uri: Bolt URI of the Neo4j server.
        neo4j_user: Neo4j user name.
        neo4j_password: Neo4j password.
        gemini_api_key: API key for the Gemini client.
        ecommerce_path: Path of the e-commerce CSV file.
        chat_logs_path: Path of the chat-log CSV file.
        queries: Iterable of natural-language questions.
        query_delay: Seconds to wait between two consecutive queries.

    Returns:
        A dict mapping each query to its result dict, or
        `{"error": "<message>"}` when setup or loading fails.
    """
    rag_system = None
    try:
        # Initialize the system
        rag_system = GraphRAGSystem(
            neo4j_uri=neo4j_uri,
            neo4j_user=neo4j_user,
            neo4j_password=neo4j_password,
            gemini_api_key=gemini_api_key
        )

        # Load datasets and convert to knowledge graph
        rag_system.load_datasets(ecommerce_path, chat_logs_path)

        # Process queries
        results = {}

        for index, query in enumerate(queries):
            # Pause between queries only; there is nothing to wait for after
            # the last one.
            if index and query_delay > 0:
                time.sleep(query_delay)

            logger.info(f"Processing query: {query}")
            query_result = rag_system.process_query(query)
            results[query] = query_result
            logger.info(f"Query result: {query_result.get('answer', 'No answer generated')}")

        return results

    except Exception as e:
        logger.error(f"Error running Graph RAG system: {str(e)}")
        return {"error": str(e)}

    finally:
        # Always release the Neo4j driver, on success and on failure alike.
        driver = getattr(rag_system, "driver", None)
        if driver is not None:
            driver.close()

# Example usage
if __name__ == "__main__":
    import os

    # Configuration. Credentials are read from the environment so that no
    # secret lives in the source file.
    # NOTE: the password and API key that used to be hard-coded here should be
    # treated as compromised and rotated.
    NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
    NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
    if not NEO4J_PASSWORD or not GEMINI_API_KEY:
        raise SystemExit("Set the NEO4J_PASSWORD and GEMINI_API_KEY environment variables before running.")

    # File paths
    ECOMMERCE_DATA_PATH = "Amazon Sale Report.csv"
    CHAT_LOGS_PATH = "pulseun.csv"

    # Example queries
    QUERIES = [
        "Give 5 names of products which sold last by date present in dataset?"
    ]

    # Run the system
    results = run_graph_rag(
        NEO4J_URI,
        NEO4J_USER,
        NEO4J_PASSWORD,
        GEMINI_API_KEY,
        ECOMMERCE_DATA_PATH,
        CHAT_LOGS_PATH,
        QUERIES
    )

    print("\n=== RESULTS ===")
    for query, result in results.items():
        if not isinstance(result, dict):
            # run_graph_rag reports a fatal failure as {"error": "<message>"},
            # in which case the value is a plain string, not a result dict.
            print(f"\n{query}: {result}")
            continue
        print(f"\nQuery: {query}")
        print(f"Answer: {result.get('answer', 'No answer generated')}")
        print(f"Insights: {result.get('insights', 'No insights generated')}")
        print("="*50)

2025-03-01 02:11:55,617 - INFO - ✅ Connected to Neo4j database
2025-03-01 02:11:55,643 - INFO - ✅ Initialized Gemini LLM
  ecommerce_df = pd.read_csv(ecommerce_path)
2025-03-01 02:11:57,968 - INFO - Loaded e-commerce data: (128975, 24)
2025-03-01 02:11:57,982 - INFO - Loaded chat logs: (1057, 1)
2025-03-01 02:12:02,137 - INFO - Cleared existing database
"CREATE INDEX entity_id_index IF NOT EXISTS FOR (n) ON (n.id)"
                                                  ^}
"CREATE INDEX entity_name_index IF NOT EXISTS FOR (n) ON (n.name)"
                                                    ^}
"CREATE INDEX entity_type_index IF NOT EXISTS FOR (n) ON (n.entity_type)"
                                                    ^}
"CREATE INDEX relationship_type_index IF NOT EXISTS FOR ()-[r]-() ON TYPE(r)"
                                                             ^}
2025-03-01 02:12:02,433 - INFO - Created basic indices
2025-03-01 02:12:11,051 - ERROR - Failed to parse LLM schema analysis as JSON
20


=== RESULTS ===

Query: Give 5 names of products which sold last by date present in dataset?
Answer: Based on the knowledge graph, here's what I found: [{'p.id': 'J0062-DR-M', 'p.Date': '06-29-22'}, {'p.id': 'SET414-KR-NP-XXXL', 'p.Date': '06-29-22'}, {'p.id': 'JNE3486-KR-XXL', 'p.Date': '06-29-22'}, {'p.id': 'SET444-KR-SH-M', 'p.Date': '06-29-22'}, {'p.id': 'SET444-KR-SH-L', 'p.Date': '06-29-22'}]
Insights: The knowledge graph provided the above information based on your query.


I'll explain step by step what's happening in this GraphRAGSystem code:

Class Initialization:

The system connects to a Neo4j graph database and sets up Gemini LLM (Google's large language model)
It establishes connections with proper authentication and configures API parameters


Data Loading:

Loads two datasets: e-commerce data and chat logs from CSV files
Prepares this data for conversion into a knowledge graph


Knowledge Graph Creation:

Clears existing database and sets up indices for performance
Uses LLM to discover entities (nodes) and relationships in the data
Processes e-commerce data to create entity nodes
Processes chat data to extract entities from messages
Creates relationships between entities based on patterns discovered by the LLM


Knowledge Graph Enrichment:

Analyzes the graph to discover additional insights
Uses LLM to suggest new relationships and patterns
Executes these suggestions to enhance the knowledge graph
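Note that the fallback schema (basic Product, Category and CustomerIssue nodes) is not part of enrichment: in the code it is used by the entity-discovery step when the LLM-based discovery fails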


Query Processing:

Converts natural language queries to Cypher (Neo4j's query language) using LLM
Executes these Cypher queries on the knowledge graph
Gets results from the graph database
Uses LLM again to analyze these results and enhance them with context


Result Generation:

Returns a Python dictionary (parsed from the LLM's JSON reply) containing the answer, insights, and a confidence level
Includes original query and graph results for reference


Main Execution:

Sets up configuration parameters (database URI, credentials, API keys)
Specifies file paths for datasets
Defines example queries to process
Runs the system and prints results



This is effectively a RAG (Retrieval-Augmented Generation) system that uses a graph database instead of vector embeddings, allowing for more complex relationship-based queries and insights.