# Drug Discovery Pipeline

## Overview

This notebook demonstrates a complete drug discovery pipeline: ingest drug and protein data from multiple sources (APIs, databases, feeds, files), extract compound and target entities, build a drug-target knowledge graph, generate embeddings, perform similarity search, predict drug-target interactions, and identify druggable targets.

### Modules Used (20+)

- **Ingestion**: WebIngestor, DBIngestor, FeedIngestor, FileIngestor
- **Parsing**: JSONParser, StructuredDataParser, DocumentParser
- **Extraction**: NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
- **KG**: GraphBuilder, GraphAnalyzer, CentralityCalculator, CommunityDetector
- **Analytics**: ConnectivityAnalyzer, TemporalGraphQuery
- **Embeddings**: EmbeddingGenerator, TextEmbedder
- **Vector Store**: VectorStore, HybridSearch
- **Reasoning**: InferenceEngine, RuleManager, ExplanationGenerator
- **Ontology**: OntologyGenerator, OntologyValidator
- **Export**: JSONExporter, RDFExporter, OWLExporter, ReportGenerator
- **Visualization**: KGVisualizer, OntologyVisualizer, AnalyticsVisualizer

### Pipeline

**Drug/Protein Data Sources → Parse → Extract Entities (compounds, targets, interactions) → Build Drug-Target KG → Generate Embeddings → Similarity Search → Predict Interactions → Target Identification → Generate Reports → Visualize**

---

## Step 1: Ingest Drug and Protein Data from Multiple Sources

Ingest drug compound and protein target data from APIs, databases, and feeds.


In [None]:
from semantica.ingest import WebIngestor, DBIngestor, FeedIngestor, FileIngestor
from semantica.parse import JSONParser, StructuredDataParser, DocumentParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, TripleExtractor, SemanticAnalyzer
from semantica.kg import GraphBuilder, GraphAnalyzer, CentralityCalculator, CommunityDetector
from semantica.kg import ConnectivityAnalyzer, TemporalGraphQuery
from semantica.embeddings import EmbeddingGenerator, TextEmbedder
from semantica.vector_store import VectorStore, HybridSearch
from semantica.reasoning import InferenceEngine, RuleManager, ExplanationGenerator
from semantica.ontology import OntologyGenerator, OntologyValidator
from semantica.export import JSONExporter, RDFExporter, OWLExporter, ReportGenerator
from semantica.visualization import KGVisualizer, OntologyVisualizer, AnalyticsVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

# Ingestors, constructed with their default configuration.
web_ingestor, db_ingestor = WebIngestor(), DBIngestor()
feed_ingestor, file_ingestor = FeedIngestor(), FileIngestor()

# Parsers, constructed with their default configuration.
json_parser, structured_parser, document_parser = (
    JSONParser(),
    StructuredDataParser(),
    DocumentParser(),
)

# Drug data sources.
drug_apis = [
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/2244/JSON",  # PubChem PUG REST (CID 2244)
    "https://www.ebi.ac.uk/chembl/api/data/molecule/CHEMBL25.json",  # ChEMBL web services (CHEMBL25)
    "https://go.drugbank.com/releases/latest",  # DrugBank releases page (not a JSON API endpoint)
]

# Protein data sources. These use the current REST hosts; the legacy
# www.uniprot.org/uniprot/ and www.rcsb.org/pdb/json/ paths have been superseded.
protein_apis = [
    "https://rest.uniprot.org/uniprotkb/P04637.json",  # UniProt REST API (accession P04637)
    "https://data.rcsb.org/rest/v1/core/entry/1A2B",  # RCSB PDB Data API (entry 1A2B)
]

# Names of interaction databases (listed for reference).
interaction_databases = [
    "STRING",
    "BioGRID",
]

# Database connection for compound libraries. Set DRUG_DISCOVERY_DB_URL to
# supply real credentials without editing the notebook; the default is the
# placeholder connection string.
db_connection_string = os.environ.get(
    "DRUG_DISCOVERY_DB_URL",
    "postgresql://user:password@localhost:5432/drug_discovery_db",
)
db_query = (
    "SELECT compound_id, target_protein, interaction_type, binding_affinity, mechanism "
    "FROM drug_target_interactions "
    "WHERE binding_affinity < 100 "
    "LIMIT 1000"
)

# Scratch directory holding the sample input file written below.
temp_dir = tempfile.mkdtemp()

# Sample drug-target data for local ingestion.
drug_data_file = os.path.join(temp_dir, "drug_targets.json")

# Single reference instant so every record's timestamp is an exact whole-day
# offset from the same moment.
reference_time = datetime.now()

# compound_id values are PubChem CIDs. binding_affinity values are
# illustrative sample numbers; their unit is not specified in this notebook.
drug_data = [
    {
        "compound_id": "CID2244",
        "compound_name": "Aspirin",
        "target_protein": "PTGS1",
        "target_name": "Prostaglandin G/H synthase 1",
        "interaction_type": "inhibitor",
        "binding_affinity": 5.2,
        "mechanism": "Irreversible inhibition",
        "pathway": "Arachidonic acid metabolism",
        "timestamp": (reference_time - timedelta(days=1)).isoformat(),
    },
    {
        "compound_id": "CID3672",
        "compound_name": "Ibuprofen",
        "target_protein": "PTGS2",
        "target_name": "Prostaglandin G/H synthase 2",
        "interaction_type": "inhibitor",
        "binding_affinity": 8.5,
        "mechanism": "Reversible inhibition",
        "pathway": "Arachidonic acid metabolism",
        "timestamp": (reference_time - timedelta(days=2)).isoformat(),
    },
    {
        "compound_id": "CID3672",
        "compound_name": "Ibuprofen",
        "target_protein": "PTGS1",
        "target_name": "Prostaglandin G/H synthase 1",
        "interaction_type": "inhibitor",
        "binding_affinity": 12.3,
        "mechanism": "Reversible inhibition",
        "pathway": "Arachidonic acid metabolism",
        "timestamp": (reference_time - timedelta(days=2)).isoformat(),
    },
    {
        "compound_id": "CID60823",
        "compound_name": "Atorvastatin",
        "target_protein": "HMGCR",
        "target_name": "3-hydroxy-3-methylglutaryl-coenzyme A reductase",
        "interaction_type": "inhibitor",
        "binding_affinity": 0.8,
        "mechanism": "Competitive inhibition",
        "pathway": "Cholesterol biosynthesis",
        "timestamp": (reference_time - timedelta(days=3)).isoformat(),
    },
    {
        "compound_id": "CID4091",
        "compound_name": "Metformin",
        "target_protein": "PRKAA1",
        "target_name": "5'-AMP-activated protein kinase catalytic subunit alpha-1",
        "interaction_type": "activator",
        "binding_affinity": 15.0,
        "mechanism": "Allosteric activation",
        "pathway": "AMPK signaling",
        "timestamp": (reference_time - timedelta(days=4)).isoformat(),
    },
]

# Explicit encoding so the file is written the same way on every platform.
with open(drug_data_file, "w", encoding="utf-8") as f:
    json.dump(drug_data, f, indent=2)

# Ingest the sample file from disk, and parse the same records from their
# JSON text.
file_data = file_ingestor.ingest_file(drug_data_file)
parsed_drug = structured_parser.parse_json(json.dumps(drug_data))

# Best-effort fetch of the first drug API (PubChem): a failure is reported
# and the notebook carries on.
try:
    web_content = web_ingestor.ingest_url(drug_apis[0])  # PubChem API
except Exception as exc:
    web_content = None
    print(f"⚠ Web ingestion (example): {str(exc)[:100]}")
else:
    if web_content:
        print(f"✓ Ingested web content: {getattr(web_content, 'url', 'N/A')}")

# Only the part after the credentials ("host:port/database") is ever printed,
# so the password in the connection string never reaches the notebook output.
db_target = db_connection_string.rsplit("@", 1)[-1]

# Database ingestion pattern: without a reachable database, fall back to the
# in-memory sample records.
try:
    db_data = db_ingestor.export_table(
        connection_string=db_connection_string,
        table_name="drug_target_interactions",
        limit=1000,
    )
except Exception as exc:
    # Report the exception type only; the message could echo the connection string.
    print(f"⚠ Database connection (example pattern): Configure with real credentials ({type(exc).__name__})")
    db_data = {"data": drug_data}
else:
    print(f"✓ Database ingestion configured for: {db_target}")
    print(f"  Query pattern: {db_query}")

# export_table's return type is not visible here: accept the {"data": [...]}
# shape used by the fallback, or any other sized result, and count 0 otherwise.
db_records = db_data.get("data", []) if isinstance(db_data, dict) else (db_data or [])
db_record_count = len(db_records) if hasattr(db_records, "__len__") else 0

print(f"\n📊 Ingestion Summary:")
print(f"  Local drug-target interactions: {len(drug_data)}")
print(f"  Database records: {db_record_count}")
print(f"  Drug APIs: {len(drug_apis)}")
print(f"  Protein APIs: {len(protein_apis)}")


## Step 2: Extract Drug and Target Entities

Extract compounds, targets, and interactions from the ingested data.


In [None]:
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
triple_extractor = TripleExtractor()
semantic_analyzer = SemanticAnalyzer()

# Pick the interaction records to process. The parser's return type is not
# visible here, so anything other than a dict (including an empty result)
# falls back to the in-memory sample records instead of silently producing an
# empty pipeline.
if isinstance(parsed_drug, dict):
    interactions = parsed_drug.get("data", drug_data)
else:
    interactions = drug_data

all_interactions = list(interactions)

# One descriptive sentence per interaction, used as input for the extractors.
all_drug_texts = [
    f"Compound {interaction.get('compound_name', '')} {interaction.get('interaction_type', '')} target {interaction.get('target_name', '')} with binding affinity {interaction.get('binding_affinity', 0)}"
    for interaction in all_interactions
]

# Extract entities, relationships, and triples from each sentence.
all_entities = []
all_relationships = []
all_triples = []

for text in all_drug_texts:
    entities = ner_extractor.extract(text)
    all_entities.extend(entities)

    # Relation extraction is given the entities found in the same sentence.
    relationships = relation_extractor.extract(text, entities)
    all_relationships.extend(relationships)

    triples = triple_extractor.extract(text)
    all_triples.extend(triples)

# Structured entities derived from the raw interaction records.
# unique_* index entities by id for de-duplication; the *_entities lists hold
# the same dicts in first-seen order.
unique_compounds = {}
unique_targets = {}
compound_entities = []
target_entities = []
interaction_entities = []

for record in all_interactions:
    compound_id = record.get("compound_id", "")
    target_id = record.get("target_protein", "")

    # The first record mentioning a compound defines its entity.
    if compound_id and compound_id not in unique_compounds:
        unique_compounds[compound_id] = {
            "id": compound_id,
            "type": "Compound",
            "properties": {
                "compound_id": compound_id,
                "name": record.get("compound_name", ""),
                "pathway": record.get("pathway", ""),
            },
        }
        compound_entities.append(unique_compounds[compound_id])

    # Likewise for targets.
    if target_id and target_id not in unique_targets:
        unique_targets[target_id] = {
            "id": target_id,
            "type": "Target",
            "properties": {
                "protein_id": target_id,
                "name": record.get("target_name", ""),
                "pathway": record.get("pathway", ""),
            },
        }
        target_entities.append(unique_targets[target_id])

    # Every record yields one interaction entity, keyed "<compound>_<target>".
    interaction_entities.append({
        "id": f"{compound_id}_{target_id}",
        "type": "Interaction",
        "properties": {
            "compound": compound_id,
            "target": target_id,
            "interaction_type": record.get("interaction_type", ""),
            "binding_affinity": record.get("binding_affinity", 0),
            "mechanism": record.get("mechanism", ""),
            "timestamp": record.get("timestamp", ""),
        },
    })

for extracted_count, label in (
    (len(compound_entities), "unique compounds"),
    (len(target_entities), "unique targets"),
    (len(interaction_entities), "interactions"),
    (len(all_relationships), "relationships"),
    (len(all_triples), "triples"),
):
    print(f"Extracted {extracted_count} {label}")


## Step 3: Build Drug-Target Knowledge Graph

Build a knowledge graph from extracted drug-target entities and relationships.


In [None]:
builder = GraphBuilder()

# Register every node: compounds first, then targets, then interactions.
for node in compound_entities + target_entities + interaction_entities:
    builder.add_entity(
        entity_id=node["id"],
        entity_type=node["type"],
        properties=node.get("properties", {}),
    )

# Each interaction contributes three edges:
#   compound -> target                  (typed by the interaction type)
#   compound -> interaction -> target   (through the interaction node)
relationships = []
for node in interaction_entities:
    props = node["properties"]
    interaction_id = node["id"]
    compound_id = props.get("compound", "")
    target_id = props.get("target", "")
    interaction_type = props.get("interaction_type", "")
    binding_affinity = props.get("binding_affinity", 0)

    edge_specs = (
        (
            compound_id,
            target_id,
            interaction_type,
            {
                "binding_affinity": binding_affinity,
                "mechanism": props.get("mechanism", ""),
            },
        ),
        (compound_id, interaction_id, "has_interaction", {}),
        (interaction_id, target_id, "targets", {}),
    )
    for edge_source, edge_target, edge_type, edge_props in edge_specs:
        builder.add_relationship(
            source_id=edge_source,
            target_id=edge_target,
            relationship_type=edge_type,
            properties=edge_props,
        )

    # Plain-dict record of the direct compound -> target edge.
    relationships.append({
        "source": compound_id,
        "target": target_id,
        "type": interaction_type,
        "binding_affinity": binding_affinity,
    })

knowledge_graph = builder.build()

print(f"Built knowledge graph with {len(knowledge_graph.nodes)} nodes")
print(f"Built knowledge graph with {len(knowledge_graph.edges)} edges")
print(f"Added {len(relationships)} drug-target relationships")


## Step 4: Generate Embeddings and Setup Vector Store

Generate embeddings from compound and target descriptions and setup vector store for similarity search.


In [None]:
embedding_generator = EmbeddingGenerator()
text_embedder = TextEmbedder()
vector_store = VectorStore()
hybrid_search = HybridSearch(vector_store, knowledge_graph)

# Text to embed ("<name> pathway <pathway>") and matching metadata for compounds.
compound_texts = [
    f"{entity['properties'].get('name', '')} pathway {entity['properties'].get('pathway', '')}"
    for entity in compound_entities
]
compound_metadata = [
    {
        "id": entity["id"],
        "type": "compound",
        "name": entity["properties"].get("name", ""),
    }
    for entity in compound_entities
]

# Same for targets.
target_texts = [
    f"{entity['properties'].get('name', '')} pathway {entity['properties'].get('pathway', '')}"
    for entity in target_entities
]
target_metadata = [
    {
        "id": entity["id"],
        "type": "target",
        "name": entity["properties"].get("name", ""),
    }
    for entity in target_entities
]

# Compounds come first, then targets; texts and metadata stay index-aligned.
all_texts = compound_texts + target_texts
all_metadata = compound_metadata + target_metadata

embeddings = [text_embedder.embed(text) for text in all_texts]

# Store the vectors together with their metadata.
vector_store.store_vectors(embeddings, all_metadata)

print(f"Generated {len(embeddings)} embeddings")
print(f"Stored {len(compound_texts)} compound embeddings")
print(f"Stored {len(target_texts)} target embeddings")
print(f"Vector store ready for similarity search")


## Step 5: Predict Drug-Target Interactions

Use hybrid search and inference to predict drug-target interactions.


In [None]:
graph_analyzer = GraphAnalyzer(knowledge_graph)
centrality_calculator = CentralityCalculator(knowledge_graph)
community_detector = CommunityDetector(knowledge_graph)
connectivity_analyzer = ConnectivityAnalyzer(knowledge_graph)
temporal_query = TemporalGraphQuery(knowledge_graph)
inference_engine = InferenceEngine()
rule_manager = RuleManager()

# Compute graph metrics
graph_metrics = graph_analyzer.compute_metrics()

# Calculate centrality.
# NOTE(review): assumes calculate_centrality returns a mapping of node id ->
# numeric score keyed by the entity ids used when building the graph — confirm.
centrality_scores = centrality_calculator.calculate_centrality(centrality_type="betweenness")

# The graph also contains compound and interaction nodes; keep only target
# nodes so the ranking really is a ranking of targets.
target_node_ids = {target["id"] for target in target_entities}
top_central_targets = sorted(
    (
        (node_id, score)
        for node_id, score in centrality_scores.items()
        if node_id in target_node_ids
    ),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

# Detect communities; the count is the number of distinct values in the result.
communities = community_detector.detect_communities()
community_count = len(set(communities.values())) if communities else 0

# Analyze connectivity
connectivity_results = connectivity_analyzer.analyze_connectivity()

# Interaction prediction rules as (name, condition, action) triples.
rule_specs = (
    (
        "high_affinity_interaction",
        "binding_affinity < 10",
        "predict_strong_interaction",
    ),
    (
        "inhibitor_interaction",
        "interaction_type == 'inhibitor' AND binding_affinity < 5",
        "predict_potent_inhibitor",
    ),
    (
        "target_identification",
        "multiple_compounds_target_same_protein",
        "identify_druggable_target",
    ),
)

# Dict form of the same rules.
prediction_rules = [
    {"name": spec_name, "condition": spec_condition, "action": spec_action}
    for spec_name, spec_condition, spec_action in rule_specs
]

# Register every rule with the rule manager, in definition order.
for rule in prediction_rules:
    rule_manager.add_rule(rule["name"], rule["condition"], rule["action"])

# Similarity query: embed the compound name, then run a hybrid search with it.
query_compound = "Aspirin"
query_embedding = text_embedder.embed(query_compound)

search_results = hybrid_search.search(
    query_embedding=query_embedding,
    query_text=query_compound,
    k=5,
    use_graph_expansion=True,
)

def _affinity_points(binding_affinity):
    """Score one interaction: 3 below 10, 2 below 50, otherwise 1."""
    if binding_affinity < 10:
        return 3
    if binding_affinity < 50:
        return 2
    return 1


# Single pass over the relationships: group them by source compound, collect
# the distinct compounds per target, and count interaction rows per target.
interactions_by_compound = {}
compounds_by_target = {}
target_interaction_counts = {}
for rel in relationships:
    interactions_by_compound.setdefault(rel["source"], []).append(rel)
    compounds_by_target.setdefault(rel["target"], set()).add(rel["source"])
    target_interaction_counts[rel["target"]] = target_interaction_counts.get(rel["target"], 0) + 1

# Predict interactions: one summary per compound, scored from the binding
# affinity of each of its interactions (a missing affinity counts as 100).
predicted_interactions = []
for compound in compound_entities:
    compound_id = compound["id"]
    compound_name = compound["properties"].get("name", "")
    compound_interactions = interactions_by_compound.get(compound_id, [])

    predicted_interactions.append({
        "compound": compound_name,
        "compound_id": compound_id,
        "interaction_count": len(compound_interactions),
        "interaction_score": sum(
            _affinity_points(r.get("binding_affinity", 100)) for r in compound_interactions
        ),
        "targets": [r["target"] for r in compound_interactions],
    })

# Display name per target id; the first entity with a given id wins.
target_names = {}
for target in target_entities:
    target_names.setdefault(target["id"], target["properties"].get("name", target["id"]))

# Identify druggable targets: targets hit by at least two DISTINCT compounds.
# Counting distinct compounds (rather than interaction rows) keeps repeated
# records of the same compound-target pair from inflating the count.
druggable_targets = []
for target_id, source_compounds in compounds_by_target.items():
    count = len(source_compounds)
    if count >= 2:
        target_name = target_names.get(target_id, target_id)
        druggable_targets.append({
            "target": target_name,
            "target_id": target_id,
            "compound_count": count,
            "description": f"Target {target_name} is targeted by {count} compounds, indicating druggability",
        })

def _target_label(node_id):
    """Return the name of the first target entity with this id, else the id itself."""
    for candidate in target_entities:
        if candidate["id"] == node_id:
            return candidate["properties"].get("name", node_id)
    return node_id


# Headline counts.
print(f"Analyzed {len(compound_entities)} compounds")
print(f"Found {community_count} target communities")
print(f"Identified {len(druggable_targets)} druggable targets")

# Highest-centrality entries, shown by target name where one is known.
print("\nTop 5 Central Targets:")
for rank, (node_id, score) in enumerate(top_central_targets[:5], start=1):
    print(f"  {rank}. {_target_label(node_id)} (centrality: {score:.3f})")

# Compounds ordered by descending interaction score.
print("\nPredicted Interactions:")
ranked_predictions = sorted(
    predicted_interactions,
    key=lambda entry: entry["interaction_score"],
    reverse=True,
)
for pred in ranked_predictions[:5]:
    print(f"  - {pred['compound']}: {pred['interaction_count']} interactions, Score: {pred['interaction_score']}")

print("\nDruggable Targets:")
for druggable in druggable_targets[:5]:
    print(f"  - {druggable['target']}: {druggable['compound_count']} compounds")


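## Step 6: Generate Ontology, Export Results, and Create Report

Generate a drug discovery ontology from the knowledge graph, validate it, export the graph and ontology, and write a summary report.


In [None]: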
ontology_generator, ontology_validator = OntologyGenerator(), OntologyValidator()
json_exporter, rdf_exporter, owl_exporter = JSONExporter(), RDFExporter(), OWLExporter()
report_generator = ReportGenerator()

# Derive an ontology for the "DrugDiscovery" domain from the graph, then validate it.
drug_ontology = ontology_generator.generate_ontology(
    knowledge_graph=knowledge_graph,
    domain="DrugDiscovery",
)
validation_result = ontology_validator.validate_ontology(drug_ontology)

# All artifacts are written into the notebook's temp directory.
kg_json_path = os.path.join(temp_dir, "drug_target_kg.json")
kg_rdf_path = os.path.join(temp_dir, "drug_target_kg.rdf")
ontology_owl_path = os.path.join(temp_dir, "drug_ontology.owl")

# Knowledge graph as JSON and RDF.
kg_json = json_exporter.export(knowledge_graph, output_path=kg_json_path)
kg_rdf = rdf_exporter.export(knowledge_graph, output_path=kg_rdf_path)

# Ontology as OWL.
ontology_owl = owl_exporter.export(drug_ontology, output_path=ontology_owl_path)

# Generate the Markdown report. Sections are collected in a list and joined
# once; the resulting text is the same as appending section by section.
high_affinity_count = sum(
    1 for r in relationships if r.get("binding_affinity", 100) < 10
)

report_sections = [f"""
# Drug Discovery Pipeline Report

## Executive Summary
- Total Compounds Analyzed: {len(compound_entities)}
- Total Targets: {len(target_entities)}
- Total Interactions: {len(interaction_entities)}
- Druggable Targets Identified: {len(druggable_targets)}
- High-Affinity Interactions: {high_affinity_count}

## Top Druggable Targets
"""]

# Up to ten druggable targets, numbered from 1.
for i, target in enumerate(druggable_targets[:10], 1):
    report_sections.append(f"""
{i}. {target['target']}
   - Compound Count: {target['compound_count']}
   - Description: {target['description']}
""")

report_sections.append("""
## Predicted Interactions
""")

# Up to ten compounds by descending interaction score, each listing at most
# five of its targets.
for pred in sorted(predicted_interactions, key=lambda x: x["interaction_score"], reverse=True)[:10]:
    report_sections.append(f"""
### {pred['compound']}
- Interaction Count: {pred['interaction_count']}
- Interaction Score: {pred['interaction_score']}
- Targets: {', '.join(str(t) for t in pred['targets'][:5])}
""")

report_content = "".join(report_sections)

# Explicit UTF-8 so names containing non-ASCII characters cannot fail on
# platforms whose default encoding is narrower.
report_path = os.path.join(temp_dir, "drug_discovery_report.md")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_content)

print(f"Generated drug discovery ontology with {len(drug_ontology.classes)} classes")
print(f"Ontology validation: {'Valid' if validation_result.valid else 'Invalid'}")
print(f"  Errors: {len(validation_result.errors)}")
print(f"  Warnings: {len(validation_result.warnings)}")
print(f"Exported knowledge graph to JSON and RDF")
print(f"Exported ontology to OWL")
print(f"Generated discovery report: {report_path}")


## Step 7: Visualize Drug-Target Network

Visualize the drug-target knowledge graph, ontology, and analytics.


In [None]:
kg_visualizer = KGVisualizer()
ontology_visualizer = OntologyVisualizer()
analytics_visualizer = AnalyticsVisualizer()

# Visualize knowledge graph, highlighting up to five druggable targets.
# Entries of druggable_targets store the node id under "target_id" (they have
# no "id" key), so that is the key read here.
kg_viz = kg_visualizer.visualize(
    knowledge_graph,
    layout="force_directed",
    highlight_nodes=[t["target_id"] for t in druggable_targets[:5]],
    node_size_by="centrality",
)

# Visualize ontology
ontology_viz = ontology_visualizer.visualize(
    drug_ontology,
    layout="hierarchical",
)

# Visualize analytics: the metrics computed earlier in the notebook, with
# interaction scores keyed by compound id.
analytics_viz = analytics_visualizer.visualize(
    knowledge_graph,
    metrics={
        "centrality": dict(top_central_targets[:10]),
        "communities": communities,
        "connectivity": connectivity_results,
        "interaction_scores": {p["compound_id"]: p["interaction_score"] for p in predicted_interactions},
    },
)

print("Generated visualizations:")
print("  - Knowledge Graph: Drug-target network with highlighted druggable targets")
print("  - Ontology Visualization: Drug discovery ontology hierarchy")
print("  - Analytics Visualization: Centrality, communities, connectivity, and interaction scores")
