# Disease Network Analysis Pipeline

## Overview

This notebook demonstrates a complete disease network analysis pipeline: ingest disease data from multiple sources (medical literature, research databases, clinical trials), extract disease relationships, build a disease ontology, analyze the resulting networks, and predict outcomes.

### Modules Used (20+)

- **Ingestion**: FileIngestor, WebIngestor, DBIngestor, FeedIngestor
- **Parsing**: DocumentParser, PDFParser, StructuredDataParser, JSONParser
- **Extraction**: NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
- **KG**: GraphBuilder, GraphAnalyzer, CentralityCalculator, CommunityDetector
- **Analytics**: ConnectivityAnalyzer, TemporalGraphQuery, TemporalPatternDetector
- **Ontology**: OntologyGenerator, ClassInferrer, PropertyGenerator, OntologyValidator
- **Reasoning**: InferenceEngine, RuleManager, ExplanationGenerator
- **Quality**: KGQualityAssessor, ConflictDetector
- **Export**: JSONExporter, RDFExporter, OWLExporter, ReportGenerator
- **Visualization**: KGVisualizer, OntologyVisualizer, AnalyticsVisualizer

### Pipeline

**Disease Data Sources → Parse → Extract Disease Relationships → Build Disease Ontology → Analyze Networks → Predict Outcomes → Generate Reports → Visualize**

---

## Step 1: Ingest Disease Data from Multiple Sources

Ingest disease data from medical literature, research databases, and clinical trials.


In [None]:
from semantica.ingest import FileIngestor, WebIngestor, DBIngestor, FeedIngestor
from semantica.parse import DocumentParser, PDFParser, StructuredDataParser, JSONParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
from semantica.kg import GraphBuilder, GraphAnalyzer, CentralityCalculator, CommunityDetector
from semantica.kg import ConnectivityAnalyzer, TemporalGraphQuery, TemporalPatternDetector
from semantica.ontology import OntologyGenerator, ClassInferrer, PropertyGenerator, OntologyValidator
from semantica.reasoning import InferenceEngine, RuleManager, ExplanationGenerator
from semantica.kg_qa import KGQualityAssessor
from semantica.conflicts import ConflictDetector
from semantica.export import JSONExporter, RDFExporter, OWLExporter, ReportGenerator
from semantica.visualization import KGVisualizer, OntologyVisualizer, AnalyticsVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

# Ingestion components; the steps below call file_ingestor and web_ingestor.
file_ingestor, web_ingestor = FileIngestor(), WebIngestor()
db_ingestor, feed_ingestor = DBIngestor(), FeedIngestor()

# Parsing components; structured_parser is the one used on the sample JSON.
document_parser, pdf_parser = DocumentParser(), PDFParser()
structured_parser, json_parser = StructuredDataParser(), JSONParser()

# Remote disease data sources (only the first entry is fetched further down).
disease_apis = [
    "https://api.logicahealth.org/fhir/R4/Condition",  # FHIR Conditions
    "https://hapi.fhir.org/baseR4/Condition",  # HAPI FHIR Conditions
    "https://www.ncbi.nlm.nih.gov/books/NBK5197/",  # NCBI Medical Literature
]

medical_feeds = [
    "https://www.cdc.gov/rss.xml",  # CDC Health Alerts
    "https://www.who.int/rss-feeds/news-english.xml",  # WHO News
]

# Database connection for disease data.
# Credentials should not live in the notebook: set DISEASE_DB_URL in the
# environment to point at a real database. The placeholder below is used
# only when that variable is unset or empty.
db_connection_string = (
    os.environ.get("DISEASE_DB_URL")
    or "postgresql://user:password@localhost:5432/disease_db"
)

# Diseases updated within the last year, ordered by name.
db_query = (
    "SELECT disease_name, icd10_code, related_diseases, symptoms, treatments "
    "FROM diseases "
    "WHERE last_updated > CURRENT_DATE - INTERVAL '1 year' "
    "ORDER BY disease_name"
)

# Scratch directory shared by this cell and the export step at the end.
temp_dir = tempfile.mkdtemp()

# Sample disease records, written to disk so they can be ingested like any
# other file-based source.
type2_diabetes_record = {
    "disease_name": "Type 2 Diabetes",
    "icd10_code": "E11",
    "related_diseases": ["Hypertension", "Cardiovascular Disease", "Obesity"],
    "symptoms": ["Increased thirst", "Frequent urination", "Fatigue"],
    "treatments": ["Metformin", "Insulin", "Lifestyle changes"],
    "prevalence": "High",
}
hypertension_record = {
    "disease_name": "Hypertension",
    "icd10_code": "I10",
    "related_diseases": ["Type 2 Diabetes", "Cardiovascular Disease", "Kidney Disease"],
    "symptoms": ["High blood pressure", "Headaches", "Dizziness"],
    "treatments": ["ACE inhibitors", "Beta blockers", "Lifestyle changes"],
    "prevalence": "Very High",
}
disease_data = {"diseases": [type2_diabetes_record, hypertension_record]}

disease_file = os.path.join(temp_dir, "disease_data.json")
with open(disease_file, 'w') as f:
    f.write(json.dumps(disease_data, indent=2))

# Ingest the sample file and parse it into structured data.
file_objects = file_ingestor.ingest_file(disease_file, read_content=True)
parsed_data = structured_parser.parse_json(disease_file)

# Ingest from disease APIs. Only the first configured endpoint is fetched
# (slice [:1]). A failure is reported and skipped so the notebook can carry
# on with the local sample data.
disease_api_list = []
for api_url in disease_apis[:1]:
    try:
        api_content = web_ingestor.ingest_url(api_url)
    except Exception as e:
        # Best-effort: truncate the message so one bad endpoint cannot flood the output.
        print(f"⚠ Disease API ingestion for {api_url}: {str(e)[:100]}")
    else:
        if api_content:
            disease_api_list.append(api_content)
            print(f"✓ Ingested disease API: {getattr(api_content, 'url', api_url)}")

print("\n📊 Ingestion Summary:")
# A single sample file is ingested above, so this is 1 on success and 0 otherwise.
print(f"  Disease data files: {1 if file_objects else 0}")
print(f"  Disease API sources: {len(disease_api_list)}")
# Counts the configured database source; no query is executed in this cell.
print("  Database sources: 1")


## Step 2: Extract Disease Relationships

Extract disease entities and relationships from disease data.


In [None]:
# Extraction components. They are instantiated here but not called in this
# cell: the structured sample records are mapped to entities directly.
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
triple_extractor = TripleExtractor()
semantic_analyzer = SemanticAnalyzer()

disease_relationships = []

# Entities are indexed by id so that a node mentioned several times (for
# example "Hypertension" as a primary record and as a related disease) is
# emitted exactly once instead of once per mention.
_entity_index = {}

# (record field, entity type of the listed names, relationship type)
_LINKED_FIELDS = (
    ("related_diseases", "Disease", "related_to"),
    ("symptoms", "Symptom", "has_symptom"),
    ("treatments", "Treatment", "treated_with"),
)


def _register_entity(entity_id, entity_type, properties=None):
    """Record an entity once, keyed by id.

    The first mention fixes the entity's type. Later mentions only fill in
    property values that are still missing or empty, so a bare reference
    never erases details supplied by a full record.
    """
    properties = properties or {}
    existing = _entity_index.get(entity_id)
    if existing is None:
        _entity_index[entity_id] = {
            "id": entity_id,
            "type": entity_type,
            "name": entity_id,
            "properties": dict(properties),
        }
        return
    for key, value in properties.items():
        if not existing["properties"].get(key):
            existing["properties"][key] = value


# Accept either {"diseases": [...]} or a bare list of records.
raw_records = parsed_data.data if parsed_data else None
if isinstance(raw_records, dict):
    diseases = raw_records.get("diseases") or []
elif isinstance(raw_records, list):
    diseases = raw_records
else:
    diseases = []

for disease in diseases:
    if not isinstance(disease, dict):
        continue
    disease_name = disease.get("disease_name", "")
    if not disease_name:
        # A record without a name would become a node with an empty id.
        continue

    _register_entity(
        disease_name,
        "Disease",
        {
            "icd10_code": disease.get("icd10_code", ""),
            "prevalence": disease.get("prevalence", ""),
        },
    )

    for field, entity_type, relation_type in _LINKED_FIELDS:
        # "or []" tolerates a field that is present but null.
        for linked_name in disease.get(field) or []:
            _register_entity(linked_name, entity_type)
            disease_relationships.append({
                "source": disease_name,
                "target": linked_name,
                "type": relation_type,
                "properties": {}
            })

# Unique entities in first-seen order.
disease_entities = list(_entity_index.values())

print(f"Extracted {len(disease_entities)} disease entities")
print(f"Extracted {len(disease_relationships)} disease relationships")


## Step 3: Build Disease Ontology

Build disease ontology from extracted entities and relationships.


In [None]:
# Components for graph construction and ontology generation.
builder = GraphBuilder()
ontology_generator = OntologyGenerator()
class_inferrer = ClassInferrer()
property_generator = PropertyGenerator()
ontology_validator = OntologyValidator()

# The knowledge graph and the ontology are both derived from the same
# entity and relationship lists.
disease_kg = builder.build(disease_entities, disease_relationships)
disease_ontology = ontology_generator.generate(disease_entities, disease_relationships)

# Class and property inference results are kept for inspection; the summary
# below reads its counts from the generated ontology instead.
classes = class_inferrer.infer_classes(disease_entities)
properties = property_generator.infer_properties(disease_entities, disease_relationships, classes)

validation_result = ontology_validator.validate_ontology(disease_ontology)

# Summary
print("Built disease knowledge graph")
kg_entity_count = len(disease_kg.get('entities', []))
print(f"  Entities: {kg_entity_count}")
kg_relationship_count = len(disease_kg.get('relationships', []))
print(f"  Relationships: {kg_relationship_count}")
print("Generated disease ontology")
ontology_class_count = len(disease_ontology.get('classes', []))
print(f"  Classes: {ontology_class_count}")
ontology_property_count = len(disease_ontology.get('properties', []))
print(f"  Properties: {ontology_property_count}")
ontology_is_valid = validation_result.valid
print(f"  Ontology valid: {ontology_is_valid}")


## Step 4: Analyze Disease Networks

Analyze disease networks using graph analytics.


In [None]:
# Graph analytics components. temporal_query is instantiated but not used
# in this cell.
graph_analyzer = GraphAnalyzer()
centrality_calculator = CentralityCalculator()
community_detector = CommunityDetector()
connectivity_analyzer = ConnectivityAnalyzer()
temporal_query = TemporalGraphQuery()
temporal_pattern_detector = TemporalPatternDetector()

# Run each analysis on the disease knowledge graph.
metrics = graph_analyzer.compute_metrics(disease_kg)
centrality_scores = centrality_calculator.calculate_centrality(disease_kg, measure="degree")
communities = community_detector.detect_communities(disease_kg)
connectivity = connectivity_analyzer.analyze_connectivity(disease_kg)
temporal_patterns = temporal_pattern_detector.detect_temporal_patterns(
    disease_kg, pattern_type="sequence", min_frequency=1
)

# Summary
print("Network analysis complete")
density = metrics.get('density', 0)
print(f"  Graph density: {density:.3f}")
community_count = len(communities)
print(f"  Communities: {community_count}")
# Count entries with a strictly positive centrality score.
central_count = sum(1 for _node, score in centrality_scores.items() if score > 0)
print(f"  Central diseases: {central_count}")
component_count = len(connectivity.get('components', []))
print(f"  Connected components: {component_count}")
pattern_count = len(temporal_patterns)
print(f"  Temporal patterns: {pattern_count}")


## Step 5: Predict Disease Outcomes

Predict disease progression and outcomes using inference.


In [None]:
# Reasoning components. rule_manager and explanation_generator are
# instantiated but not used in this cell.
inference_engine = InferenceEngine()
rule_manager = RuleManager()
explanation_generator = ExplanationGenerator()

# Disease outcome prediction rules, registered in this order.
outcome_rules = (
    "IF disease related_to Hypertension AND disease related_to Diabetes THEN high_comorbidity_risk",
    "IF disease has_symptom Fatigue AND disease prevalence is High THEN common_condition",
    "IF disease treated_with Insulin AND disease is Type 2 Diabetes THEN advanced_stage",
)
for rule_text in outcome_rules:
    inference_engine.add_rule(rule_text)

# Facts: one per Disease entity (name and prevalence) ...
# NOTE(review): the facts are plain dicts, while the rules above are text
# that refers to related_to / has_symptom / treated_with. Whether the engine
# can match one against the other depends on InferenceEngine's rule syntax;
# confirm against its documentation.
for disease in disease_entities:
    if disease.get("type") != "Disease":
        continue
    disease_properties = disease.get("properties", {})
    inference_engine.add_fact({
        "disease": disease.get("name", ""),
        "prevalence": disease_properties.get("prevalence", "")
    })

# ... and one per related_to relationship (source/target pair).
for relationship in disease_relationships:
    if relationship.get("type") != "related_to":
        continue
    inference_engine.add_fact({
        "disease1": relationship.get("source"),
        "disease2": relationship.get("target")
    })

outcome_predictions = inference_engine.forward_chain()

prediction_count = len(outcome_predictions)
print(f"Generated {prediction_count} disease outcome predictions")


## Step 6: Generate Reports and Visualize

Generate disease network analysis reports and visualize results.


In [None]:
# Quality assessment, export and reporting components.
quality_assessor = KGQualityAssessor()
json_exporter = JSONExporter()
rdf_exporter = RDFExporter()
owl_exporter = OWLExporter()
report_generator = ReportGenerator()

quality_score = quality_assessor.assess_overall_quality(disease_kg)

# Export the graph (JSON, RDF) and the ontology (OWL) into the scratch directory.
kg_json_path = os.path.join(temp_dir, "disease_kg.json")
json_exporter.export_knowledge_graph(disease_kg, kg_json_path)
kg_rdf_path = os.path.join(temp_dir, "disease_kg.rdf")
rdf_exporter.export_knowledge_graph(disease_kg, kg_rdf_path)
ontology_owl_path = os.path.join(temp_dir, "disease_ontology.owl")
owl_exporter.export(disease_ontology, ontology_owl_path)

# Figures that feed the report.
entity_total = len(disease_entities)
prediction_total = len(outcome_predictions)
disease_total = sum(1 for e in disease_entities if e.get("type") == "Disease")

report_data = {
    "summary": f"Disease network analysis identified {entity_total} entities and {prediction_total} predictions",
    "diseases_analyzed": disease_total,
    "relationships": len(disease_relationships),
    "predictions": prediction_total,
    "quality_score": quality_score.get('overall_score', 0)
}

report = report_generator.generate_report(report_data, format="markdown")

# Visualizations of the graph, the ontology hierarchy and the analytics.
kg_visualizer = KGVisualizer()
ontology_visualizer = OntologyVisualizer()
analytics_visualizer = AnalyticsVisualizer()

kg_viz = kg_visualizer.visualize_network(disease_kg, output="interactive")
ontology_viz = ontology_visualizer.visualize_hierarchy(disease_ontology, output="interactive")
analytics_viz = analytics_visualizer.visualize_analytics(disease_kg, output="interactive")

# Final status lines. The stage separators are real arrow characters; the
# previous text contained the mis-decoded byte sequence instead.
print("Generated disease network analysis report and visualizations")
print("Total modules used: 20+")
print("Pipeline complete: Disease Data → Parse → Extract → Build Ontology → Analyze Networks → Predict Outcomes → Reports → Visualize")
