In [1]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
from typing import List, Dict, Any
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains.llm import LLMChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# SECURITY: credentials must never be stored in the notebook source. Any key that was
# ever committed in this cell has to be treated as leaked and revoked/rotated.
# The key is taken from the environment; when it is absent we fall back to an
# interactive prompt that does not echo the input or store it in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [5]:
# Directory that holds the source PDFs. Set the COMPANY_MAPPER_DOCS_DIR environment
# variable to run the notebook on another machine; the default is the original location.
DOCUMENTS_DIR = os.environ.get(
    "COMPANY_MAPPER_DOCS_DIR",
    "/Users/sreekargudipati/Coding Projects/CompanyMapper/documents",
)

# File names of the reports that feed the knowledge graph, in load order.
DOCUMENT_NAMES = [
    "apple_10K_2024_annualreport.pdf",
    "Apple_Environmental_Progress_Report_2024.pdf",
    "Apple-Supply-Chain-2025-Progress-Report.pdf",
]

file_paths = [os.path.join(DOCUMENTS_DIR, name) for name in DOCUMENT_NAMES]

In [7]:
# Load the source PDFs through the project-local `knowledge_graph` helper module.
# NOTE(review): the loader's implementation is not visible in this notebook; the next
# cell treats `documents` as a sequence whose items expose `.page_content`.
from knowledge_graph import load_documents
documents = load_documents(file_paths)

incorrect startxref pointer(1)
parsing for Object Streams
found 0 objects within Object(14266,0) whereas 100 expected
found 0 objects within Object(14267,0) whereas 100 expected
found 0 objects within Object(14268,0) whereas 100 expected
found 0 objects within Object(14269,0) whereas 100 expected
found 0 objects within Object(14270,0) whereas 100 expected
found 0 objects within Object(14271,0) whereas 100 expected
found 0 objects within Object(14272,0) whereas 100 expected
found 0 objects within Object(14273,0) whereas 100 expected
found 0 objects within Object(14274,0) whereas 100 expected
found 0 objects within Object(14275,0) whereas 100 expected
found 0 objects within Object(14276,0) whereas 100 expected
found 0 objects within Object(14277,0) whereas 100 expected
found 0 objects within Object(14278,0) whereas 100 expected
found 0 objects within Object(14279,0) whereas 100 expected
found 0 objects within Object(14280,0) whereas 100 expected
found 0 objects within Object(14281,0) whe

In [11]:
# Show how many documents were loaded and preview one of them. The preview is labelled
# with the index that is actually displayed so it is not mistaken for documents[0],
# and the index is bounds-checked so a short load does not raise IndexError.
PREVIEW_INDEX = 5
print(f"Loaded {len(documents)} documents.")
if len(documents) > PREVIEW_INDEX:
    print(f"Document {PREVIEW_INDEX}: {documents[PREVIEW_INDEX].page_content[:100]}...")
else:
    print(f"No document at index {PREVIEW_INDEX} to preview.")

Loaded 291 documents.
First document: The Company’s ability to compete successfully depends heavily on ensuring the continuing and timely ...


In [14]:
# Split the loaded documents into smaller chunks; each chunk is later sent to the LLM
# on its own. The splitting strategy lives in the project-local `knowledge_graph`
# module and is not visible in this notebook.
from knowledge_graph import split_documents

chunks = split_documents(documents)
# Sanity check: chunk count and the first 100 characters of the first chunk.
print(f"Split into {len(chunks)} chunks.")
print(f"First chunk: {chunks[0].page_content[:100]}...")


Split into 550 chunks.
First chunk: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☒    AN...


In [None]:
def _strip_code_fences(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if one is present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line; it may carry a language tag (```json).
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def _validate_records(records, required_fields, label):
    """Keep the dict records carrying every required field; default ``properties`` to ``{}``."""
    if not isinstance(records, list):
        print(f"Warning: Expected a list of {label} records, got {type(records)}")
        return []

    valid = []
    for record in records:
        if not isinstance(record, dict):
            print(f"Warning: Expected {label} to be a dict, got {type(record)}")
            continue

        if any(field not in record for field in required_fields):
            print(f"Warning: {label.capitalize()} missing required fields: {record}")
            continue

        # A missing or malformed "properties" value is normalised to an empty dict.
        if not isinstance(record.get("properties"), dict):
            record["properties"] = {}

        valid.append(record)
    return valid


def extract_entities_and_relationships(chunk: "Document"):
    """Ask the LLM to extract entities and relationships from one document chunk.

    Args:
        chunk: Object exposing ``page_content`` (the text to analyse) and ``metadata``
            (a dict; its ``"source"`` entry, when present, is passed to the prompt).

    Returns:
        A dict ``{"entities": [...], "relationships": [...]}``. Entities are kept only
        when they are dicts with ``id``, ``type`` and ``name``; relationships only when
        they are dicts with ``source``, ``target`` and ``type``. Every kept record has a
        dict under ``"properties"``. Both lists are empty when the model output is not
        valid JSON, is not a JSON object, or lacks either top-level key.

    Raises:
        KeyError: if the ``OPENAI_API_KEY`` environment variable is not set.
    """
    print(f"Processing chunk with metadata: {chunk.metadata}")
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=os.environ["OPENAI_API_KEY"])
    print("LLM initialized")
    prompt_template = """
    You are an expert in extracting structured information about Apple Inc. from documents.
    
    Extract entities and their relationships from the following text. Focus on:
    1. Apple and its business units
    2. Suppliers and manufacturing partners
    3. Products and components
    4. Geographic locations of operations
    5. Materials used in products
    6. Environmental initiatives
    7. Regulations and compliance
    
    TEXT:
    {text}
    
    SOURCE:
    {source}
    
    OUTPUT FORMAT:
    Return a JSON object with these arrays:
    1. "entities": List of entities found with their types and properties
    2. "relationships": List of relationships between entities with their types and properties
    
    Example:
    {{
        "entities": [
            {{"id": "apple", "type": "Company", "name": "Apple Inc.", "properties": {{"description": "Technology company"}}}},
            {{"id": "foxconn", "type": "Supplier", "name": "Foxconn", "properties": {{"description": "Manufacturing partner", "risk_level": "Medium"}}}}
        ],
        "relationships": [
            {{"source": "foxconn", "target": "apple", "type": "Supplies", "properties": {{"details": "Assembles iPhones", "dependency_level": "Critical"}}}}
        ]
    }}
    
    Return ONLY valid JSON:
    """

    prompt = ChatPromptTemplate.from_template(prompt_template)
    chain = prompt | llm

    print("Chain initialized")

    # Not every chunk is guaranteed to carry a "source" entry; do not fail on its absence.
    source = (chunk.metadata or {}).get("source", "unknown")
    result = chain.invoke({"text": chunk.page_content, "source": source}).content
    print(f"LLM output: {result}")

    try:
        # Chat models frequently wrap JSON in a ```json fence despite the instruction;
        # strip it so an otherwise valid answer is not thrown away.
        extracted_data = json.loads(_strip_code_fences(result))
    except json.JSONDecodeError:
        print(f"Error parsing LLM output as JSON: {result}")
        return {"entities": [], "relationships": []}

    if not isinstance(extracted_data, dict):
        print(f"Error: Expected a dictionary, got {type(extracted_data)}")
        return {"entities": [], "relationships": []}

    if "entities" not in extracted_data or "relationships" not in extracted_data:
        print("Error: Missing 'entities' or 'relationships' keys in the output")
        return {"entities": [], "relationships": []}

    return {
        "entities": _validate_records(
            extracted_data["entities"], ("id", "type", "name"), "entity"
        ),
        "relationships": _validate_records(
            extracted_data["relationships"], ("source", "target", "type"), "relationship"
        ),
    }

    
def process_all_chunks(chunks: "List[Document]", max_chunks: "int | None" = 30):
    """Run extraction over the chunks and merge the results into one knowledge graph dict.

    Args:
        chunks: The document chunks to process, in order.
        max_chunks: Upper bound on how many chunks are sent to the extractor
            (default 30). Pass ``None`` to process every chunk.

    Returns:
        A dict ``{"entities": [...], "relationships": [...]}``. Entities are
        de-duplicated by ``id``: the first occurrence is kept and the ``properties``
        of later occurrences are merged into it (later values win). Relationships
        are concatenated in the order they were found.
    """
    total = len(chunks)
    print(f"Processing {total} chunks")

    # Exactly `limit` chunks are processed, so the stop message matches the work done.
    limit = total if max_chunks is None else max(0, min(max_chunks, total))

    all_entities = {}
    all_relationships = []

    for i, chunk in enumerate(chunks[:limit]):
        print(f"Processing chunk {i+1}/{total}")
        extracted_data = extract_entities_and_relationships(chunk)
        entities = extracted_data.get("entities", [])
        relationships = extracted_data.get("relationships", [])

        print(f"Found {len(entities)} entities and {len(relationships)} relationships")

        for entity in entities:
            if not isinstance(entity, dict) or "id" not in entity:
                print(f"Warning: Skipping invalid entity: {entity}")
                continue

            known = all_entities.setdefault(entity["id"], entity)
            if known is entity:
                continue  # first time this id is seen; nothing to merge

            # Merge the new occurrence's properties into the stored entity.
            if not isinstance(known.get("properties"), dict):
                known["properties"] = {}
            new_properties = entity.get("properties")
            if isinstance(new_properties, dict):
                known["properties"].update(new_properties)

        for rel in relationships:
            if not isinstance(rel, dict) or "source" not in rel or "target" not in rel:
                print(f"Warning: Skipping invalid relationship: {rel}")
                continue

            all_relationships.append(rel)

    if limit < total:
        print(f"Processed {limit} chunks, stopping for now.")

    return {
        "entities": list(all_entities.values()),
        "relationships": all_relationships
    }
        


In [None]:

# Run the extraction and keep the merged {"entities", "relationships"} dict.
# NOTE(review): this variable shares its name with the `knowledge_graph` module that
# earlier cells import helpers from; consider a distinct name to avoid confusion.
knowledge_graph = process_all_chunks(chunks)

Processing 550 chunks
Processing chunk 1/550
Processing chunk with metadata: {'producer': 'Wdesk Fidelity Content Translations Version 010.004.252', 'creator': 'Workiva', 'creationdate': '2024-10-31T19:17:25+00:00', 'author': 'anonymous', 'moddate': '2024-10-31T14:07:33-07:00', 'title': '10-K 2024, 09.28.2024-2024-10-31-12-16', 'source': '/Users/sreekargudipati/Coding Projects/CompanyMapper/documents/apple_10K_2024_annualreport.pdf', 'total_pages': 121, 'page': 0, 'page_label': '1'}
LLM initialized
Chain initialized
LLM output: {
    "entities": [
        {
            "id": "apple",
            "type": "Company",
            "name": "Apple Inc.",
            "properties": {
                "description": "Technology company",
                "fiscal_year_ended": "September 28, 2024",
                "headquarters": "One Apple Park Way, Cupertino, California 95014"
            }
        },
        {
            "id": "california",
            "type": "GeographicLocation",
            "

In [36]:
# Spot-check one extracted entity (index 8 appears to be an arbitrary sample).
print(knowledge_graph['entities'][8])

{'id': 'notes_due_2029_2', 'type': 'FinancialInstrument', 'name': '3.050% Notes due 2029', 'properties': {}}


In [None]:
def _prefixed_properties(properties):
    """Return ``properties`` with every key prefixed by ``prop_``; non-dicts give ``{}``."""
    if not isinstance(properties, dict):
        return {}
    return {f"prop_{key}": value for key, value in properties.items()}


def create_networkx_graph(knowledge_graph: Dict[str, Any]) -> "nx.DiGraph":
    """Build a directed graph from the extracted entities and relationships.

    Args:
        knowledge_graph: Dict with an ``"entities"`` list (each item has ``id``,
            ``type``, ``name`` and optionally ``properties`` / ``source``) and a
            ``"relationships"`` list (each item has ``source``, ``target``, ``type``
            and optionally ``properties``).

    Returns:
        A ``DiGraph`` with one node per entity id and one edge per related node pair.
        Node and edge attributes hold ``type`` (nodes also ``name`` and, when present,
        ``source``) plus every property under a ``prop_``-prefixed key. Relationships
        whose source or target is not a known entity are skipped. Because a
        ``DiGraph`` keeps a single edge per ordered pair, a later relationship between
        the same two nodes updates the attributes of the earlier edge.
    """
    G = nx.DiGraph()

    for entity in knowledge_graph["entities"]:
        attributes = {
            "type": entity["type"],
            "name": entity["name"],
        }
        # The prefix keeps extracted properties from clashing with "type"/"name".
        attributes.update(_prefixed_properties(entity.get("properties")))

        if "source" in entity:
            attributes["source"] = entity["source"]

        G.add_node(entity["id"], **attributes)

    skipped = 0
    for rel in knowledge_graph["relationships"]:
        # Only connect entities that were actually extracted as nodes.
        if rel["source"] not in G or rel["target"] not in G:
            skipped += 1
            continue

        attributes = {"type": rel["type"]}
        attributes.update(_prefixed_properties(rel.get("properties")))

        G.add_edge(rel["source"], rel["target"], **attributes)

    if skipped:
        print(f"Skipped {skipped} relationships whose source or target is not a known entity")

    return G


In [None]:
# Build the directed graph from the extracted entities and relationships.
G = create_networkx_graph(knowledge_graph)

# Report the size of the resulting graph.
print(f"Knowledge graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Prints every node together with its full attribute dict, so the output grows with the graph.
print(G.nodes(data=True))


Knowledge graph created with 125 nodes and 134 edges
[('apple', {'type': 'Company', 'name': 'Apple Inc.', 'prop_description': 'Technology company', 'prop_fiscal_year_ended': 'September 28, 2024', 'prop_headquarters': 'One Apple Park Way, Cupertino, California 95014', 'prop_fiscal_quarter_revenue': '$2,628,553,000,000', 'prop_shares_outstanding': '15,115,823,000', 'prop_fiscal_year_end': 'September 28, 2024', 'prop_business_units': ['Consumer Electronics', 'Software', 'Services'], 'prop_employees': 164000, 'prop_operations': 'International', 'prop_global_business': True, 'prop_focus': 'Consumer electronics, software, and services', 'prop_market_share': 'minority in global smartphone, personal computer, and tablet markets'}), ('california', {'type': 'GeographicLocation', 'name': 'California', 'prop_description': 'State of incorporation'}), ('nasdaq', {'type': 'StockExchange', 'name': 'The Nasdaq Stock Market LLC', 'prop_description': 'Stock exchange where Apple is listed'}), ('common_sto