# Medical Literature GraphRAG Pipeline

## Overview

This notebook demonstrates a complete medical literature GraphRAG pipeline: ingest research papers from multiple sources (PubMed, medical journals, research databases), extract findings, build a research knowledge graph, generate embeddings, set up hybrid search (vector + KG), and query the medical literature using advanced RAG.

### Modules Used (20+)

- **Ingestion**: FileIngestor, WebIngestor, DBIngestor, FeedIngestor
- **Parsing**: DocumentParser, PDFParser, HTMLParser, StructuredDataParser
- **Extraction**: NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
- **KG**: GraphBuilder, GraphAnalyzer, ConnectivityAnalyzer
- **Embeddings**: EmbeddingGenerator, TextEmbedder
- **Vector Store**: VectorStore, HybridSearch
- **Context**: ContextRetriever, ContextGraphBuilder
- **Reasoning**: InferenceEngine, RuleManager, ExplanationGenerator
- **Export**: JSONExporter, RDFExporter, ReportGenerator
- **Visualization**: KGVisualizer, AnalyticsVisualizer, TemporalVisualizer

### Pipeline

**Research Papers → Parse → Extract Findings → Build Research KG → Generate Embeddings → Vector Store → GraphRAG Setup → Q&A → Generate Reports → Visualize**

---

## Step 1: Ingest Research Papers from Multiple Sources

Ingest research papers from PubMed, medical journals, and research databases.


In [None]:
from semantica.ingest import FileIngestor, WebIngestor, DBIngestor, FeedIngestor
from semantica.parse import DocumentParser, PDFParser, HTMLParser, StructuredDataParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
from semantica.kg import GraphBuilder, GraphAnalyzer, ConnectivityAnalyzer
from semantica.embeddings import EmbeddingGenerator, TextEmbedder
from semantica.vector_store import VectorStore, HybridSearch
from semantica.context import ContextRetriever, ContextGraphBuilder
from semantica.reasoning import InferenceEngine, RuleManager, ExplanationGenerator
from semantica.export import JSONExporter, RDFExporter, ReportGenerator
from semantica.visualization import KGVisualizer, AnalyticsVisualizer, TemporalVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

# Step 1 cell: create the ingestors/parsers, write one sample research paper
# to a temporary JSON file, ingest and parse it, then try to fetch the first
# configured web source and print a short summary.

# One ingestor per source type. Only file_ingestor and web_ingestor are
# exercised further down in this cell.
file_ingestor = FileIngestor()
web_ingestor = WebIngestor()
db_ingestor = DBIngestor()
feed_ingestor = FeedIngestor()

# Parsers; only structured_parser is used in this cell.
document_parser = DocumentParser()
pdf_parser = PDFParser()
html_parser = HTMLParser()
structured_parser = StructuredDataParser()

# Real medical literature sources
medical_literature_sources = [
    "https://pubmed.ncbi.nlm.nih.gov/",  # PubMed
    "https://www.ncbi.nlm.nih.gov/pmc/",  # PubMed Central
    "https://www.biorxiv.org/",  # BioRxiv
    "https://www.medrxiv.org/"  # MedRxiv
]

medical_feeds = [
    "https://www.cdc.gov/rss.xml",  # CDC Health Alerts
    "https://www.who.int/rss-feeds/news-english.xml"  # WHO News
]

# Database connection settings for medical literature.
# NOTE(review): the credentials below are placeholders and nothing in this
# cell opens the connection; replace them (ideally via an environment
# variable) before wiring up db_ingestor.
db_connection_string = "postgresql://user:password@localhost:5432/medical_literature_db"
db_query = "SELECT paper_id, title, authors, abstract, publication_date, findings FROM research_papers WHERE publication_date > CURRENT_DATE - INTERVAL '1 year' ORDER BY publication_date DESC"

# Scratch directory; later cells also write their exports here.
temp_dir = tempfile.mkdtemp()

# Sample research paper data
research_paper_file = os.path.join(temp_dir, "research_paper.json")
paper_data = {
    "paper_id": "PMID-2024-001",
    "title": "Novel Treatment Approaches for Type 2 Diabetes",
    "authors": ["Dr. Smith", "Dr. Jones"],
    "abstract": "This study investigates novel treatment approaches for Type 2 Diabetes, focusing on combination therapies and lifestyle interventions.",
    "publication_date": (datetime.now() - timedelta(days=60)).isoformat(),
    "findings": [
        "Metformin combined with lifestyle changes shows 30% improvement",
        "Early intervention reduces complications by 40%",
        "Personalized treatment plans improve patient outcomes"
    ],
    "keywords": ["Type 2 Diabetes", "Metformin", "Treatment", "Lifestyle"]
}

# Explicit UTF-8 so the file content does not depend on the platform's
# default text encoding.
with open(research_paper_file, 'w', encoding='utf-8') as f:
    json.dump(paper_data, f, indent=2)

file_objects = file_ingestor.ingest_file(research_paper_file, read_content=True)
parsed_data = structured_parser.parse_json(research_paper_file)

# Ingest from medical literature sources (only the first URL, to keep the
# demo quick).
literature_web_list = []
for source_url in medical_literature_sources[:1]:
    try:
        web_content = web_ingestor.ingest_url(source_url)
        if web_content:
            literature_web_list.append(web_content)
            print(f"✓ Ingested medical literature source: {getattr(web_content, 'url', source_url)}")
    except Exception as e:
        # Best-effort: a failed fetch is reported and the notebook continues.
        print(f"⚠ Medical literature source ingestion for {source_url}: {str(e)[:100]}")

# Count the ingested papers. The original wrapped the result in a one-element
# list before calling len(), which can only ever yield 1; handle both a
# single returned object and a returned collection.
if not file_objects:
    paper_count = 0
elif isinstance(file_objects, (list, tuple)):
    paper_count = len(file_objects)
else:
    paper_count = 1

print("\n📊 Ingestion Summary:")
print(f"  Research papers: {paper_count}")
print(f"  Medical literature sources: {len(literature_web_list)}")
# Reflects the single configured connection above, not rows actually fetched.
print("  Database sources: 1")


## Step 2: Extract Findings and Build Research Knowledge Graph

Extract findings from research papers and build a knowledge graph.


In [None]:
# Step 2 cell: turn the parsed sample paper into entity/relationship records
# and build a knowledge graph from them.

# Extractors are instantiated here; the records below are assembled directly
# from the parsed JSON fields.
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
triple_extractor = TripleExtractor()
semantic_analyzer = SemanticAnalyzer()

research_entities = []
research_relationships = []
all_documents = []

# Resolve the parsed payload to a single paper record: a dict is used as-is,
# a list contributes its first element, anything else becomes an empty dict.
parsed_payload = parsed_data.data if parsed_data else None
if parsed_payload:
    if isinstance(parsed_payload, dict):
        paper = parsed_payload
    elif isinstance(parsed_payload, list):
        paper = parsed_payload[0]
    else:
        paper = {}

    if isinstance(paper, dict):
        paper_id = paper.get("paper_id", "")
        paper_title = paper.get("title", "")

        # The document text used for embeddings is "<title> <abstract>".
        all_documents.append(f"{paper_title} {paper.get('abstract', '')}")

        # The paper itself is the hub node.
        research_entities.append({
            "id": paper_id,
            "type": "Research_Paper",
            "name": paper_title,
            "properties": {
                "authors": paper.get("authors", []),
                "publication_date": paper.get("publication_date", ""),
            },
        })

        # Findings: one node per finding, linked from the paper via "reports".
        numbered_findings = list(enumerate(paper.get("findings", [])))
        research_entities.extend(
            {
                "id": f"{paper_id}_finding_{position}",
                "type": "Finding",
                "name": finding_text,
                "properties": {},
            }
            for position, finding_text in numbered_findings
        )
        research_relationships.extend(
            {
                "source": paper_id,
                "target": f"{paper_id}_finding_{position}",
                "type": "reports",
                "properties": {},
            }
            for position, _ in numbered_findings
        )

        # Keywords: the keyword string doubles as the node id.
        paper_keywords = list(paper.get("keywords", []))
        research_entities.extend(
            {"id": term, "type": "Keyword", "name": term, "properties": {}}
            for term in paper_keywords
        )
        research_relationships.extend(
            {
                "source": paper_id,
                "target": term,
                "type": "has_keyword",
                "properties": {},
            }
            for term in paper_keywords
        )

builder = GraphBuilder()
graph_analyzer = GraphAnalyzer()
connectivity_analyzer = ConnectivityAnalyzer()

research_kg = builder.build(research_entities, research_relationships)

# Graph-level analysis; the results are kept in variables but not printed.
metrics = graph_analyzer.compute_metrics(research_kg)
connectivity = connectivity_analyzer.analyze_connectivity(research_kg)

kg_entity_total = len(research_kg.get('entities', []))
print(f"Extracted {len(research_entities)} research entities")
print(f"Extracted {len(research_relationships)} relationships")
print(f"Collected {len(all_documents)} research documents")
print(f"Built research knowledge graph with {kg_entity_total} entities")


## Step 3: Generate Embeddings and Setup Vector Store

Generate embeddings and set up a vector store for GraphRAG.


In [None]:
# Step 3 cell: embed the collected documents and store the vectors together
# with per-document metadata.
embedding_generator = EmbeddingGenerator()
text_embedder = TextEmbedder()
vector_store = VectorStore()
hybrid_search = HybridSearch()

embeddings = embedding_generator.generate(all_documents)

# One metadata record per document, in the same order as all_documents.
metadata = [
    {
        "id": f"doc_{position}",
        "text": document,
        "source": "medical_literature",
    }
    for position, document in enumerate(all_documents)
]

vector_ids = vector_store.store_vectors(embeddings, metadata)

document_total = len(all_documents)
stored_total = len(vector_ids)
print(f"Generated embeddings for {document_total} documents")
print(f"Stored {stored_total} vectors in vector store")


## Step 4: Setup GraphRAG and Query Medical Literature

Set up hybrid search and query the medical literature using GraphRAG.


In [None]:
# Step 4 cell: run a few example queries through vector search and
# graph-expanded context retrieval, then register inference rules and a fact
# about the sample paper and forward-chain over them.
context_retriever = ContextRetriever(
    knowledge_graph=research_kg,
    vector_store=vector_store,
)

inference_engine = InferenceEngine()
rule_manager = RuleManager()
explanation_generator = ExplanationGenerator()

# Query examples
queries = [
    "What are the latest findings on diabetes treatment?",
    "Find research on Metformin effectiveness",
    "What studies show improvement in patient outcomes?",
]

# For every query, record how many hits each retrieval path returned.
query_results = []
for query in queries:
    vector_hits = vector_store.search_vectors(text_embedder.embed_text(query), k=3)
    context_hits = context_retriever.retrieve(
        query=query,
        top_k=3,
        use_graph_expansion=True,
    )
    # The context retriever may return nothing; count that as zero.
    context_total = len(context_hits) if context_hits else 0
    query_results.append({
        "query": query,
        "vector_results": len(vector_hits),
        "context_results": context_total,
    })

# Medical research inference rules
for rule_text in (
    "IF paper reports finding AND finding mentions improvement THEN positive_outcome",
    "IF paper has_keyword Treatment AND paper has_keyword Diabetes THEN treatment_research",
):
    inference_engine.add_rule(rule_text)

# Feed the sample paper's id and keywords to the engine as a fact, resolving
# the parsed payload to a single record (dict as-is, first element of a list,
# otherwise an empty dict).
parsed_payload = parsed_data.data if parsed_data else None
if parsed_payload:
    if isinstance(parsed_payload, dict):
        paper = parsed_payload
    elif isinstance(parsed_payload, list):
        paper = parsed_payload[0]
    else:
        paper = {}
    if isinstance(paper, dict):
        paper_fact = {
            "paper_id": paper.get("paper_id", ""),
            "keywords": paper.get("keywords", []),
        }
        inference_engine.add_fact(paper_fact)

research_insights = inference_engine.forward_chain()

print(f"Processed {len(queries)} medical literature queries")
for entry in query_results:
    print(f"  Query: '{entry['query']}' - Vector: {entry['vector_results']}, Context: {entry['context_results']}")
print(f"Generated {len(research_insights)} research insights")


## Step 5: Generate Reports and Visualize

Generate medical literature analysis reports and visualize results.


In [None]:
# Step 5 cell: assess graph quality, export the knowledge graph to JSON and
# RDF in temp_dir, build a markdown report, and render the visualizations.

# KGQualityAssessor is not among the names imported at the top of the
# notebook, so using it unguarded raises NameError. Import it here.
# NOTE(review): the module path `semantica.kg_qa` is assumed -- confirm it
# against the installed semantica version. If the import fails, the report
# is still produced with a quality score of 0.
try:
    from semantica.kg_qa import KGQualityAssessor
except ImportError:
    KGQualityAssessor = None

quality_assessor = KGQualityAssessor() if KGQualityAssessor is not None else None
json_exporter = JSONExporter()
rdf_exporter = RDFExporter()
report_generator = ReportGenerator()

if quality_assessor is not None:
    quality_score = quality_assessor.assess_overall_quality(research_kg)
else:
    print("KGQualityAssessor is unavailable; reporting quality score as 0")
    quality_score = {}

json_exporter.export_knowledge_graph(research_kg, os.path.join(temp_dir, "research_kg.json"))
rdf_exporter.export_knowledge_graph(research_kg, os.path.join(temp_dir, "research_kg.rdf"))

# Counts are derived from the entity records built in Step 2.
paper_total = sum(1 for e in research_entities if e.get("type") == "Research_Paper")
finding_total = sum(1 for e in research_entities if e.get("type") == "Finding")

report_data = {
    "summary": f"Medical literature analysis identified {len(research_entities)} entities and {len(research_insights)} insights",
    "papers_analyzed": paper_total,
    "findings": finding_total,
    "insights": len(research_insights),
    "quality_score": quality_score.get('overall_score', 0)
}

report = report_generator.generate_report(report_data, format="markdown")

kg_visualizer = KGVisualizer()
analytics_visualizer = AnalyticsVisualizer()
temporal_visualizer = TemporalVisualizer()

kg_viz = kg_visualizer.visualize_network(research_kg, output="interactive")
analytics_viz = analytics_visualizer.visualize_analytics(research_kg, output="interactive")
temporal_viz = temporal_visualizer.visualize_timeline(research_kg, output="interactive")

print("Generated medical literature analysis report and visualizations")
print("Total modules used: 20+")
# The arrows were mis-encoded (UTF-8 read as cp1252) in the original string.
print("Pipeline complete: Research Papers → Parse → Extract → Build KG → Embeddings → Vector Store → GraphRAG → Q&A → Reports → Visualize")
