# Threat Correlation Pipeline

## Overview

This notebook demonstrates a complete threat correlation pipeline for cybersecurity: ingest threat feeds from multiple sources, extract IOCs, build temporal knowledge graph, correlate threats, detect campaigns, and generate reports.

### Modules Used (20+)

- **Ingestion**: FileIngestor, FeedIngestor, DBIngestor
- **Parsing**: XMLParser, StructuredDataParser
- **Extraction**: NERExtractor, RelationExtractor, EventDetector
- **KG**: GraphBuilder, TemporalGraphQuery, TemporalPatternDetector, GraphAnalyzer, ConnectivityAnalyzer
- **Reasoning**: InferenceEngine, ExplanationGenerator
- **Quality**: KGQualityAssessor, ProvenanceTracker, ConflictDetector
- **Export**: RDFExporter, ReportGenerator
- **Visualization**: AnalyticsVisualizer, TemporalVisualizer

### Pipeline

**Multiple Threat Feeds â†’ Parse â†’ Extract IOCs â†’ Build Temporal KG â†’ Correlate Threats â†’ Detect Campaigns â†’ Generate Reports â†’ Visualize**

---

## Step 1: Ingest Threat Feeds

Ingest threat intelligence from multiple sources.


In [None]:
from semantica.ingest import FileIngestor, FeedIngestor, DBIngestor, WebIngestor
from semantica.parse import XMLParser, StructuredDataParser, JSONParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, EventDetector
from semantica.kg import GraphBuilder, TemporalGraphQuery, TemporalPatternDetector, GraphAnalyzer, ConnectivityAnalyzer
from semantica.reasoning import InferenceEngine, ExplanationGenerator
from semantica.kg_qa import KGQualityAssessor
from semantica.kg import ProvenanceTracker, ConflictDetector
from semantica.export import RDFExporter, ReportGenerator
from semantica.visualization import AnalyticsVisualizer, TemporalVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

file_ingestor = FileIngestor()
feed_ingestor = FeedIngestor()
db_ingestor = DBIngestor()
web_ingestor = WebIngestor()
xml_parser = XMLParser()
structured_parser = StructuredDataParser()
json_parser = JSONParser()

temp_dir = tempfile.mkdtemp()

# Real threat intelligence feed URLs
threat_feeds = [
    "https://www.cisa.gov/news.xml",  # CISA Security Advisories
    "https://www.us-cert.gov/ncas/alerts.xml",  # US-CERT Alerts
    "https://feeds.feedburner.com/SecurityWeek",  # Security Week
    "https://www.darkreading.com/rss.xml"  # Dark Reading
]

# Real database connection pattern (PostgreSQL example)
db_connection_string = "postgresql://user:password@localhost:5432/threat_intel_db"
db_query = "SELECT ioc, ioc_type, timestamp, severity, source FROM threat_indicators WHERE timestamp > NOW() - INTERVAL '7 days'"

# Real web API endpoints for threat intelligence
threat_apis = [
    "https://api.github.com/repos/mitre/cti/contents/enterprise-attack/attack-pattern",  # MITRE ATT&CK
    "https://www.virustotal.com/vtapi/v2/domain/report",  # VirusTotal API (requires API key)
    "https://api.shodan.io/shodan/host/search"  # Shodan API (requires API key)
]

# Ingest from real RSS feeds
feed_data_list = []
for feed_url in threat_feeds:
    try:
        feed_data = feed_ingestor.ingest_feed(feed_url)
        if feed_data:
            feed_data_list.append(feed_data)
            print(f"âœ“ Ingested feed: {feed_data.title if hasattr(feed_data, 'title') else feed_url}")
            print(f"  Items: {len(feed_data.items) if hasattr(feed_data, 'items') else 0}")
    except Exception as e:
        print(f"âš  Feed ingestion failed for {feed_url}: {str(e)[:100]}")

# Ingest from web APIs (example with public API)
try:
    web_content = web_ingestor.ingest_url("https://api.github.com/repos/mitre/cti")
    if web_content:
        print(f"âœ“ Ingested web content: {web_content.url if hasattr(web_content, 'url') else 'N/A'}")
except Exception as e:
    print(f"âš  Web ingestion (example): {str(e)[:100]}")

# Database ingestion pattern (would connect to real database)
try:
    # Example: Export from threat intelligence database
    db_data = db_ingestor.export_table(
        connection_string=db_connection_string,
        table_name="threat_indicators",
        limit=1000
    )
    print(f"âœ“ Database ingestion configured for: {db_connection_string}")
    print(f"  Query pattern: {db_query}")
except Exception as e:
    print(f"âš  Database connection (example pattern): Configure with real credentials")
    # Simulate database structure for demonstration
    db_data = {
        "data": [
            {"ioc": "192.168.1.100", "ioc_type": "IP", "timestamp": datetime.now().isoformat(), "severity": "high", "source": "threat_feed"},
            {"ioc": "malicious-domain.com", "ioc_type": "Domain", "timestamp": datetime.now().isoformat(), "severity": "medium", "source": "threat_feed"}
        ]
    }

# Parse feed data
parsed_feeds = []
for feed_data in feed_data_list:
    if hasattr(feed_data, 'items'):
        for item in feed_data.items[:10]:  # Process first 10 items
            parsed_feeds.append({
                "title": item.title if hasattr(item, 'title') else "",
                "description": item.description if hasattr(item, 'description') else "",
                "published": item.published if hasattr(item, 'published') else "",
                "link": item.link if hasattr(item, 'link') else ""
            })

parsed_db = structured_parser.parse_json(json.dumps(db_data)) if db_data else None

print(f"\nðŸ“Š Ingestion Summary:")
print(f"  Feeds ingested: {len(feed_data_list)}")
print(f"  Feed items processed: {len(parsed_feeds)}")
print(f"  Database records: {len(db_data.get('data', [])) if db_data else 0}")
print(f"  Web sources: 1")


## Step 2: Extract IOCs

Extract Indicators of Compromise (IOCs) from threat feeds.


In [None]:
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()
event_detector = EventDetector()

all_threat_texts = []
if parsed_xml and parsed_xml.elements:
    for elem in parsed_xml.elements:
        if hasattr(elem, 'text') and elem.text:
            all_threat_texts.append(elem.text)

for db_record in parsed_db.get("data", threat_db_data):
    threat_text = f"IOC: {db_record.get('ioc', '')} Type: {db_record.get('type', '')} Severity: {db_record.get('severity', '')}"
    all_threat_texts.append(threat_text)

all_entities = []
all_relationships = []
all_events = []

for text in all_threat_texts:
    entities = ner_extractor.extract(text)
    all_entities.extend(entities)
    
    relationships = relation_extractor.extract(text, entities)
    all_relationships.extend(relationships)
    
    events = event_detector.detect_events(text)
    all_events.extend(events)

print(f"Extracted {len(all_entities)} IOCs")
print(f"Extracted {len(all_relationships)} relationships")
print(f"Detected {len(all_events)} events")


## Step 3: Build Temporal Knowledge Graph

Build a temporal knowledge graph from extracted IOCs and relationships.


In [None]:
builder = GraphBuilder()

threat_entities = []
for i, entity in enumerate(all_entities[:10], 1):
    threat_entities.append({
        "id": f"ioc_{i}",
        "type": entity.get("type", "IOC"),
        "name": entity.get("text", entity.get("entity", "")),
        "properties": {"timestamp": datetime.now().isoformat()}
    })

threat_relationships = []
for i, rel in enumerate(all_relationships[:5], 1):
    threat_relationships.append({
        "source": f"ioc_{i}",
        "target": f"ioc_{i+1}",
        "type": rel.get("type", "related_to"),
        "properties": {"timestamp": datetime.now().isoformat()}
    })

threat_kg = builder.build(threat_entities, threat_relationships)

print(f"Built temporal knowledge graph")
print(f"  Entities: {len(threat_kg.get('entities', []))}")
print(f"  Relationships: {len(threat_kg.get('relationships', []))}")


## Step 4: Correlate Threats

Correlate threats using temporal queries and inference.


In [None]:
temporal_query = TemporalGraphQuery()
pattern_detector = TemporalPatternDetector()
graph_analyzer = GraphAnalyzer()
connectivity_analyzer = ConnectivityAnalyzer()
inference_engine = InferenceEngine()
explanation_generator = ExplanationGenerator()

start_time = (datetime.now() - timedelta(days=7)).isoformat()
end_time = datetime.now().isoformat()

temporal_results = temporal_query.query_time_range(
    graph=threat_kg,
    query="Find threats in the last 7 days",
    start_time=start_time,
    end_time=end_time
)

patterns = pattern_detector.detect_temporal_patterns(
    threat_kg,
    pattern_type="sequence",
    min_frequency=1
)

connectivity = connectivity_analyzer.analyze_connectivity(threat_kg)

inference_engine.add_rule("IF IOC has high severity AND IOC is related to another IOC THEN potential_campaign")
for entity in threat_entities[:3]:
    if entity.get("properties", {}).get("severity") == "high":
        inference_engine.add_fact({"ioc": entity.get("id"), "severity": "high"})

correlations = inference_engine.forward_chain()

print(f"Temporal query returned {len(temporal_results.get('entities', []))} entities")
print(f"Detected {len(patterns)} temporal patterns")
print(f"Connectivity: {connectivity.get('is_connected', False)}")
print(f"Inferred {len(correlations)} correlations")


## Step 5: Detect Campaigns

Detect threat campaigns using graph analysis and inference.


In [None]:
campaigns = []

if len(patterns) > 0:
    campaigns.append({
        "campaign_id": "campaign_1",
        "description": "Detected threat campaign based on temporal patterns",
        "iocs": [e.get("id") for e in threat_entities[:3]],
        "severity": "high",
        "patterns": len(patterns)
    })

if correlations:
    campaigns.append({
        "campaign_id": "campaign_2",
        "description": "Detected campaign from inference correlations",
        "iocs": [e.get("id") for e in threat_entities[:2]],
        "severity": "medium",
        "correlations": len(correlations)
    })

print(f"Detected {len(campaigns)} threat campaigns")
for campaign in campaigns:
    print(f"  Campaign: {campaign['campaign_id']} - Severity: {campaign['severity']}")


## Step 6: Quality Assessment and Provenance

Assess graph quality and track provenance.


In [None]:
quality_assessor = KGQualityAssessor()
provenance_tracker = ProvenanceTracker()
conflict_detector = ConflictDetector()

quality_score = quality_assessor.assess_overall_quality(threat_kg)

for entity in threat_entities:
    provenance_tracker.track_entity(entity.get("id"), "threat_feed", entity)

conflicts = conflict_detector.detect_value_conflicts(threat_entities, "name")

print(f"Graph quality score: {quality_score.get('overall_score', 0):.3f}")
print(f"Tracked provenance for {len(threat_entities)} entities")
print(f"Detected {len(conflicts)} conflicts")


## Step 7: Generate Reports

Generate threat intelligence reports.


In [None]:
rdf_exporter = RDFExporter()
report_generator = ReportGenerator()

rdf_exporter.export_knowledge_graph(threat_kg, os.path.join(temp_dir, "threats.rdf"))

report_data = {
    "summary": f"Threat correlation analysis detected {len(campaigns)} campaigns",
    "iocs": len(threat_entities),
    "campaigns": campaigns,
    "quality_score": quality_score.get('overall_score', 0)
}

report = report_generator.generate_report(report_data, format="markdown")

print("Generated threat intelligence report")
print(f"Report length: {len(report)} characters")


## Step 8: Visualize Results

Visualize threat correlation results.


In [None]:
analytics_visualizer = AnalyticsVisualizer()
temporal_visualizer = TemporalVisualizer()

analytics_viz = analytics_visualizer.visualize_analytics(threat_kg, output="interactive")
temporal_viz = temporal_visualizer.visualize_timeline(threat_kg, output="interactive")

print("Generated analytics and temporal visualizations")
print(f"Total modules used: 20+")
print(f"Pipeline complete: Multi-source ingestion â†’ Extraction â†’ Temporal KG â†’ Correlation â†’ Campaign Detection â†’ Quality â†’ Reports â†’ Visualization")
