# Threat Intelligence Hybrid RAG Pipeline

## Overview

This notebook demonstrates a complete hybrid RAG pipeline for threat intelligence: ingest threat intelligence from multiple sources (files, web, feeds), extract threat entities, build a knowledge graph, generate embeddings, set up hybrid search (vector + temporal KG), and query threats using advanced RAG.

### Modules Used (20+)

- **Ingestion**: FileIngestor, WebIngestor, FeedIngestor, DBIngestor
- **Parsing**: JSONParser, XMLParser, HTMLParser, DocumentParser
- **Extraction**: NERExtractor, RelationExtractor, EventDetector, TripleExtractor
- **KG**: GraphBuilder, TemporalGraphQuery, GraphAnalyzer, ConnectivityAnalyzer
- **Embeddings**: EmbeddingGenerator, TextEmbedder
- **Vector Store**: VectorStore, HybridSearch
- **Context**: ContextRetriever, ContextGraphBuilder
- **Reasoning**: InferenceEngine, RuleManager, ExplanationGenerator
- **Export**: JSONExporter, RDFExporter, ReportGenerator
- **Visualization**: KGVisualizer, TemporalVisualizer, AnalyticsVisualizer

### Pipeline

**Multi-Source Threat Intel → Parse → Extract Entities → Build KG → Generate Embeddings → Vector Store → Hybrid RAG Setup → Query Threats → Generate Reports → Visualize**

---

## Step 1: Multi-Source Threat Intelligence Ingestion

Ingest threat intelligence from files, web sources, and feeds.


In [None]:
from semantica.ingest import FileIngestor, WebIngestor, FeedIngestor, DBIngestor
from semantica.parse import JSONParser, XMLParser, HTMLParser, DocumentParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, EventDetector, TripleExtractor
from semantica.kg import GraphBuilder, TemporalGraphQuery, GraphAnalyzer, ConnectivityAnalyzer
from semantica.embeddings import EmbeddingGenerator, TextEmbedder
from semantica.vector_store import VectorStore, HybridSearch
from semantica.context import ContextRetriever, ContextGraphBuilder
from semantica.reasoning import InferenceEngine, RuleManager, ExplanationGenerator
from semantica.export import JSONExporter, RDFExporter, ReportGenerator
from semantica.visualization import KGVisualizer, TemporalVisualizer, AnalyticsVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

# Ingestors for the file, web, feed and database sources used in this notebook.
file_ingestor = FileIngestor()
web_ingestor = WebIngestor()
feed_ingestor = FeedIngestor()
db_ingestor = DBIngestor()

# Parsers for the formats the ingested content arrives in.
json_parser = JSONParser()
xml_parser = XMLParser()
html_parser = HTMLParser()
document_parser = DocumentParser()

# Scratch directory holding the sample input files (later cells also write
# their exports here).
# NOTE: mkdtemp() never removes the directory; delete it manually
# (e.g. shutil.rmtree(temp_dir)) once the notebook run is finished.
temp_dir = tempfile.mkdtemp()

# Synthetic sample threat records (illustrative data, not a live feed).
# A single reference time is captured so every relative timestamp below is
# measured from the same instant.
reference_time = datetime.now()

threat_intel_json = os.path.join(temp_dir, "threat_intel.json")
threat_data = [
    {
        "threat_id": "APT-001",
        "name": "Advanced Persistent Threat Group 1",
        "description": "State-sponsored APT group targeting financial institutions",
        "iocs": ["192.168.1.100", "malicious-domain.com", "hash_abc123"],
        "tactics": ["initial_access", "persistence", "exfiltration"],
        "timestamp": (reference_time - timedelta(days=7)).isoformat(),
        "severity": "high"
    },
    {
        "threat_id": "APT-002",
        "name": "Ransomware Campaign",
        "description": "Large-scale ransomware campaign targeting healthcare sector",
        "iocs": ["198.51.100.50", "ransomware-domain.net", "hash_def456"],
        "tactics": ["initial_access", "execution", "impact"],
        "timestamp": (reference_time - timedelta(days=3)).isoformat(),
        "severity": "critical"
    },
    {
        "threat_id": "APT-003",
        "name": "Phishing Campaign",
        "description": "Sophisticated phishing campaign using social engineering",
        "iocs": ["203.0.113.75", "phishing-site.org", "hash_ghi789"],
        "tactics": ["initial_access", "collection"],
        "timestamp": (reference_time - timedelta(days=1)).isoformat(),
        "severity": "medium"
    }
]

# Explicit encoding keeps the file identical across platforms regardless of
# the locale's default text encoding.
with open(threat_intel_json, 'w', encoding='utf-8') as f:
    json.dump(threat_data, f, indent=2)

# Sample XML indicator file.
# NOTE: this is a simplified ad-hoc schema used for demonstration purposes;
# it is not STIX.
threat_intel_xml = os.path.join(temp_dir, "threat_intel.xml")
xml_content = """<?xml version="1.0"?>
<threat_intelligence>
    <indicator>
        <id>IOC-001</id>
        <type>IP</type>
        <value>172.16.0.50</value>
        <threat_type>malware</threat_type>
        <timestamp>2024-01-15T10:00:00</timestamp>
    </indicator>
    <indicator>
        <id>IOC-002</id>
        <type>Domain</type>
        <value>suspicious-domain.com</value>
        <threat_type>phishing</threat_type>
        <timestamp>2024-01-15T11:00:00</timestamp>
    </indicator>
</threat_intelligence>"""

# The XML declaration carries no encoding attribute, which means UTF-8 by
# default, so write the file as UTF-8 rather than the platform default.
with open(threat_intel_xml, 'w', encoding='utf-8') as f:
    f.write(xml_content)

# Ingest both sample files (JSON first, then XML) with their content loaded.
file_objects_json, file_objects_xml = (
    file_ingestor.ingest_file(sample_path, read_content=True)
    for sample_path in (threat_intel_json, threat_intel_xml)
)

# Parse each sample file with the parser matching its format.
parsed_json = json_parser.parse(threat_intel_json)
parsed_xml = xml_parser.parse(threat_intel_xml)

# Public threat intelligence feed URLs to pull from.
threat_intel_feeds = [
    "https://www.cisa.gov/news.xml",  # CISA Security Advisories
    "https://www.us-cert.gov/ncas/alerts.xml",  # US-CERT Alerts
    "https://feeds.feedburner.com/SecurityWeek",  # Security Week
    "https://www.darkreading.com/rss.xml",  # Dark Reading
    "https://krebsonsecurity.com/feed/"  # Krebs on Security
]

# Best-effort ingestion: a failing feed is reported and skipped so the
# remaining feeds are still processed.
threat_feed_list = []
for feed_url in threat_intel_feeds:
    try:
        threat_feed = feed_ingestor.ingest_feed(feed_url)
        if not threat_feed:
            continue
        threat_feed_list.append(threat_feed)
        # Fall back to the URL when the feed object exposes no title.
        feed_label = getattr(threat_feed, 'title', feed_url)
        print(f"✓ Ingested threat feed: {feed_label}")
        item_total = len(getattr(threat_feed, 'items', ()))
        print(f"  Items: {item_total}")
    except Exception as err:
        print(f"⚠ Feed ingestion for {feed_url}: {str(err)[:100]}")

# Public web sources carrying threat intelligence.
threat_web_sources = [
    "https://api.github.com/repos/mitre/cti",  # MITRE ATT&CK Framework
    "https://www.cisa.gov/known-exploited-vulnerabilities-catalog",  # CISA KEV Catalog
    "https://nvd.nist.gov/vuln/search"  # NIST NVD
]

# Only the first source is fetched; failures are reported and skipped.
web_content_list = []
for web_url in threat_web_sources[:1]:
    try:
        web_content = web_ingestor.ingest_url(web_url)
        if not web_content:
            continue
        web_content_list.append(web_content)
        # Fall back to the requested URL when the result exposes no `url`.
        fetched_url = getattr(web_content, 'url', web_url)
        print(f"✓ Ingested web content: {fetched_url}")
    except Exception as err:
        print(f"⚠ Web ingestion for {web_url}: {str(err)[:100]}")

# Ingestion summary. Each ingest_file() call above handles exactly one file,
# so the file count is 1 when the call returned something truthy, else 0.
print(f"Ingested {1 if file_objects_json else 0} JSON threat intelligence files")
print(f"Ingested {1 if file_objects_xml else 0} XML threat intelligence files")
print(f"Parsed {len(parsed_json.data) if parsed_json and parsed_json.data else 0} JSON threat entries")
# Guard `elements` the same way `data` is guarded so a missing/empty element
# list reports 0 instead of raising.
print(f"Parsed {len(parsed_xml.elements) if parsed_xml and parsed_xml.elements else 0} XML indicator elements")


## Step 2: Extract Threat Intelligence Entities

Extract threat entities, IOCs, and relationships from threat intelligence data.


In [None]:
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
event_detector = EventDetector()
triple_extractor = TripleExtractor()

threat_entities = []
threat_relationships = []
all_documents = []

# Tactic entities carry no per-threat properties, so the same tactic shared by
# several threats must only be added once; the per-threat "employs"
# relationships are still recorded for every threat.
seen_tactic_ids = set()

# Extract from JSON threat intelligence
threat_records = parsed_json.data if parsed_json and parsed_json.data else []
for threat in threat_records:
    if not isinstance(threat, dict):
        # Skip malformed entries rather than failing the whole extraction.
        continue

    threat_id = threat.get("threat_id", "")
    threat_timestamp = threat.get("timestamp", "")

    # One text document per threat, used later for embedding generation.
    threat_text = f"{threat.get('name', '')}: {threat.get('description', '')}"
    all_documents.append(threat_text)

    threat_entities.append({
        "id": threat_id,
        "type": "Threat_Actor",
        "name": threat.get("name", ""),
        "properties": {
            "description": threat.get("description", ""),
            "severity": threat.get("severity", ""),
            "timestamp": threat_timestamp
        }
    })

    # Each indicator of compromise becomes an entity linked to its threat.
    for ioc in threat.get("iocs", []):
        threat_entities.append({
            "id": ioc,
            "type": "IOC",
            "name": ioc,
            "properties": {
                "threat_id": threat_id,
                "timestamp": threat_timestamp
            }
        })
        threat_relationships.append({
            "source": threat_id,
            "target": ioc,
            "type": "uses",
            "properties": {"timestamp": threat_timestamp}
        })

    for tactic in threat.get("tactics", []):
        if tactic not in seen_tactic_ids:
            seen_tactic_ids.add(tactic)
            threat_entities.append({
                "id": tactic,
                "type": "Tactic",
                "name": tactic,
                "properties": {}
            })
        threat_relationships.append({
            "source": threat_id,
            "target": tactic,
            "type": "employs",
            "properties": {}
        })

# Run NER over the text of every parsed XML element that has any.
xml_elements = parsed_xml.elements if parsed_xml and parsed_xml.elements else []
for elem in xml_elements:
    elem_text = getattr(elem, 'text', None)
    if not elem_text:
        continue
    threat_entities.extend(ner_extractor.extract(elem_text))

# Extraction summary.
entity_total = len(threat_entities)
relationship_total = len(threat_relationships)
document_total = len(all_documents)
print(f"Extracted {entity_total} threat intelligence entities")
print(f"Extracted {relationship_total} threat relationships")
print(f"Collected {document_total} threat intelligence documents")


## Step 3: Build Threat Intelligence Knowledge Graph

Build a knowledge graph from the extracted threat entities and relationships.


In [None]:
# Graph construction and analysis helpers.
builder = GraphBuilder()
temporal_query = TemporalGraphQuery()
graph_analyzer = GraphAnalyzer()
connectivity_analyzer = ConnectivityAnalyzer()

# Assemble the knowledge graph from the extracted entities and relationships.
threat_kg = builder.build(threat_entities, threat_relationships)

# Structural metrics and connectivity of the resulting graph.
metrics = graph_analyzer.compute_metrics(threat_kg)
connectivity = connectivity_analyzer.analyze_connectivity(threat_kg)

print("Built threat intelligence knowledge graph")
kg_entities = threat_kg.get('entities', [])
print(f"  Entities: {len(kg_entities)}")
kg_relationships = threat_kg.get('relationships', [])
print(f"  Relationships: {len(kg_relationships)}")
graph_density = metrics.get('density', 0)
print(f"  Graph density: {graph_density:.3f}")
graph_components = connectivity.get('components', [])
print(f"  Connected components: {len(graph_components)}")


## Step 4: Generate Embeddings and Setup Vector Store

Generate embeddings from the threat intelligence documents and store them in the vector store.


In [None]:
embedding_generator = EmbeddingGenerator()
text_embedder = TextEmbedder()
vector_store = VectorStore()
hybrid_search = HybridSearch()

# Generate embeddings for threat intelligence documents
embeddings = embedding_generator.generate(all_documents)

# One metadata record per document, in the same order as the embeddings.
metadata = [
    {
        "id": f"doc_{i}",
        "text": doc,
        "source": "threat_intelligence"
    }
    for i, doc in enumerate(all_documents)
]

# Store vectors
vector_ids = vector_store.store_vectors(embeddings, metadata)

# Use an explicit None/length check instead of truthiness: if `embeddings` is
# a NumPy array (assumption — confirm against EmbeddingGenerator.generate),
# `if embeddings` raises "truth value of an array is ambiguous". The explicit
# check behaves the same for lists.
has_embeddings = embeddings is not None and len(embeddings) > 0
embedding_dim = len(embeddings[0]) if has_embeddings else 0

print(f"Generated embeddings for {len(all_documents)} documents")
print(f"Embedding dimension: {embedding_dim}")
print(f"Stored {len(vector_ids)} vectors in vector store")


## Step 5: Setup Hybrid RAG (Vector + Temporal KG)

Set up hybrid search combining vector similarity and temporal knowledge graph queries.


In [None]:
context_graph_builder = ContextGraphBuilder()

# Set up the context retriever with the knowledge graph and the vector store.
# It is constructed once, directly with its backing stores; the previous
# argument-less instance was discarded immediately and served no purpose.
context_retriever = ContextRetriever(
    knowledge_graph=threat_kg,
    vector_store=vector_store
)

print("Hybrid RAG setup complete")
print(f"  Knowledge graph: {len(threat_kg.get('entities', []))} entities")
print(f"  Vector store: {len(vector_ids)} vectors")
print("  Context retriever initialized")


## Step 6: Query Threats Using Hybrid RAG

Query threat intelligence using hybrid search (vector + temporal KG).


In [None]:
# Query examples
queries = [
    "What are the latest APT threats?",
    "Find threats targeting financial institutions",
    "What IOCs are associated with ransomware?"
]

query_results = []

# One fixed 30-day look-back window shared by every query, so all temporal
# results are measured against the same time range.
window_end = datetime.now()
start_time = (window_end - timedelta(days=30)).isoformat()
end_time = window_end.isoformat()

for query in queries:
    # Vector search over the stored document embeddings.
    query_embedding = text_embedder.embed_text(query)
    vector_results = vector_store.search_vectors(query_embedding, k=3)

    # Temporal KG query restricted to the shared window.
    temporal_results = temporal_query.query_time_range(
        graph=threat_kg,
        query=query,
        start_time=start_time,
        end_time=end_time
    )

    # Hybrid search using context retriever
    context_results = context_retriever.retrieve(
        query=query,
        top_k=3,
        use_graph_expansion=True
    )

    # Treat a missing/empty result from any backend as zero hits so one empty
    # backend does not abort the whole loop.
    temporal_entities = temporal_results.get('entities', []) if temporal_results else []

    query_results.append({
        "query": query,
        "vector_results": len(vector_results) if vector_results is not None else 0,
        "temporal_results": len(temporal_entities),
        "context_results": len(context_results) if context_results else 0
    })

# Rule-based inference over the parsed threat records.
inference_engine = InferenceEngine()
rule_manager = RuleManager()
explanation_generator = ExplanationGenerator()

# Register the rules in order.
for rule_text in (
    "IF severity is critical AND tactics includes exfiltration THEN high_priority_threat",
    "IF threat targets financial AND uses initial_access THEN financial_apt",
):
    inference_engine.add_rule(rule_text)

# Feed one fact per well-formed threat record into the engine.
fact_sources = parsed_json.data if parsed_json and parsed_json.data else []
for threat in fact_sources:
    if not isinstance(threat, dict):
        continue
    fact = {
        "threat_id": threat.get("threat_id", ""),
        "severity": threat.get("severity", ""),
        "tactics": threat.get("tactics", [])
    }
    inference_engine.add_fact(fact)

threat_insights = inference_engine.forward_chain()

# Per-query hit counts followed by the number of inferred insights.
print(f"Processed {len(queries)} threat intelligence queries")
for result in query_results:
    print(f"  Query: '{result['query']}' - Vector: {result['vector_results']}, Temporal: {result['temporal_results']}, Context: {result['context_results']}")
print(f"Generated {len(threat_insights)} threat insights from inference")


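## Step 7: Generate Reports and Export

Assess knowledge graph quality, export the graph to JSON and RDF, and generate a threat intelligence report.


In [None]: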
# KGQualityAssessor is not among the notebook's top-level imports, so without
# this import the cell fails with NameError.
# NOTE(review): module path assumed to be `semantica.kg_qa` — confirm against
# the installed semantica version.
from semantica.kg_qa import KGQualityAssessor

quality_assessor = KGQualityAssessor()
json_exporter = JSONExporter()
rdf_exporter = RDFExporter()
report_generator = ReportGenerator()

quality_score = quality_assessor.assess_overall_quality(threat_kg)
overall_score = quality_score.get('overall_score', 0)

# Export the knowledge graph next to the sample input files.
json_exporter.export_knowledge_graph(threat_kg, os.path.join(temp_dir, "threat_kg.json"))
rdf_exporter.export_knowledge_graph(threat_kg, os.path.join(temp_dir, "threat_kg.rdf"))

parsed_threats = parsed_json.data if parsed_json and parsed_json.data else []

report_data = {
    "summary": f"Threat intelligence analysis identified {len(threat_entities)} entities and {len(threat_insights)} insights",
    "threats_analyzed": len(parsed_threats),
    # threat_entities may also hold NER results that are not dicts, so only
    # dict entities are inspected for their type.
    "iocs": sum(1 for e in threat_entities if isinstance(e, dict) and e.get("type") == "IOC"),
    "insights": len(threat_insights),
    "quality_score": overall_score,
    "critical_threats": sum(
        1 for t in parsed_threats
        if isinstance(t, dict) and t.get("severity") == "critical"
    )
}

report = report_generator.generate_report(report_data, format="markdown")

print("Generated threat intelligence report")
print(f"Report length: {len(report)} characters")
print(f"Graph quality score: {overall_score:.3f}")


## Step 8: Visualize Threat Intelligence

Visualize threat intelligence knowledge graph and relationships.


In [None]:
# One visualizer per view of the knowledge graph.
kg_visualizer, temporal_visualizer, analytics_visualizer = (
    KGVisualizer(),
    TemporalVisualizer(),
    AnalyticsVisualizer(),
)

# All three views are rendered with the same output mode.
render_options = {"output": "interactive"}
kg_viz = kg_visualizer.visualize_network(threat_kg, **render_options)
temporal_viz = temporal_visualizer.visualize_timeline(threat_kg, **render_options)
analytics_viz = analytics_visualizer.visualize_analytics(threat_kg, **render_options)

# Closing summary of the notebook run.
print("Generated visualizations for threat intelligence knowledge graph, temporal patterns, and analytics")
print("Total modules used: 20+")
print("Pipeline complete: Multi-Source Threat Intel → Parse → Extract → Build KG → Embeddings → Vector Store → Hybrid RAG → Query → Reports → Visualize")
