# Fraud Detection Pipeline

## Overview

This notebook demonstrates a complete fraud detection pipeline for finance: ingest transaction streams, build a temporal knowledge graph, detect fraud patterns, perform anomaly detection, and generate alerts.

### Modules Used (20+)

- **Ingestion**: FileIngestor, StreamIngestor, DBIngestor
- **Parsing**: StructuredDataParser, DocumentParser
- **Extraction**: NERExtractor, RelationExtractor, EventDetector
- **KG**: GraphBuilder, TemporalPatternDetector, GraphAnalyzer
- **Reasoning**: InferenceEngine, RuleManager, ExplanationGenerator
- **Quality**: KGQualityAssessor, AutomatedFixer
- **Export**: JSONExporter, CSVExporter, ReportGenerator
- **Visualization**: KGVisualizer, TemporalVisualizer, AnalyticsVisualizer

### Pipeline

**Transaction Stream → Parse → Extract → Build Temporal KG → Detect Patterns → Anomaly Detection → Generate Alerts → Visualize**

---

## Step 1: Process Transactions

Ingest and parse transaction data from multiple sources.


In [None]:
from semantica.ingest import FileIngestor, StreamIngestor, DBIngestor
from semantica.parse import StructuredDataParser, DocumentParser
from semantica.semantic_extract import NERExtractor, RelationExtractor, EventDetector
from semantica.kg import GraphBuilder, TemporalPatternDetector, GraphAnalyzer
from semantica.reasoning import InferenceEngine, RuleManager, ExplanationGenerator
from semantica.kg_qa import KGQualityAssessor, AutomatedFixer
from semantica.export import JSONExporter, CSVExporter, ReportGenerator
from semantica.visualization import KGVisualizer, TemporalVisualizer, AnalyticsVisualizer
import tempfile
import os
import json
from datetime import datetime, timedelta

# Ingestion components (file, stream, database), created in that order.
file_ingestor, stream_ingestor, db_ingestor = (
    FileIngestor(),
    StreamIngestor(),
    DBIngestor(),
)

# Parsing components for structured data and documents.
structured_parser, document_parser = StructuredDataParser(), DocumentParser()

# Streaming endpoints for transaction monitoring. These are plain connection
# descriptors; nothing in this cell opens a connection.
kafka_source = {
    "type": "kafka",
    "topic": "transactions",
    "bootstrap_servers": ["localhost:9092"],
    "consumer_config": {"group_id": "fraud_detection"},
}
rabbitmq_source = {
    "type": "rabbitmq",
    "queue": "payment_events",
    "connection_url": "amqp://user:password@localhost:5672/",
}
stream_sources = [kafka_source, rabbitmq_source]

# Database connection string and the query selecting the most recent 24 hours
# of transactions (newest first, capped at 10000 rows).
# NOTE(review): the credentials embedded in the URLs look like placeholders --
# confirm they are replaced before pointing this at a real system.
db_connection_string = "postgresql://user:password@localhost:5432/transactions_db"
db_query = (
    "SELECT transaction_id, user_id, amount, merchant, location, timestamp, device "
    "FROM transactions "
    "WHERE timestamp > NOW() - INTERVAL '24 hours' "
    "ORDER BY timestamp DESC "
    "LIMIT 10000"
)

# Scratch directory for the sample input file; the export step writes here too.
temp_dir = tempfile.mkdtemp()

transactions_file = os.path.join(temp_dir, "transactions.json")

# Anchor every sample timestamp to a single reference instant. Calling
# datetime.now() once per record would make the gaps between transactions
# drift by a few microseconds from run to run.
reference_time = datetime.now()
transactions_data = [
    {
        "transaction_id": "txn_001",
        "user_id": "user_123",
        "amount": 150.00,
        "merchant": "Online Store",
        "location": "New York",
        "timestamp": (reference_time - timedelta(hours=1)).isoformat(),
        "device": "mobile",
    },
    {
        "transaction_id": "txn_002",
        "user_id": "user_123",
        "amount": 2500.00,
        "merchant": "Luxury Store",
        "location": "Paris",
        "timestamp": (reference_time - timedelta(minutes=30)).isoformat(),
        "device": "web",
    },
    {
        "transaction_id": "txn_003",
        "user_id": "user_456",
        "amount": 50.00,
        "merchant": "Grocery Store",
        "location": "San Francisco",
        "timestamp": (reference_time - timedelta(minutes=15)).isoformat(),
        "device": "mobile",
    },
    {
        "transaction_id": "txn_004",
        "user_id": "user_123",
        "amount": 5000.00,
        "merchant": "Electronics Store",
        "location": "Tokyo",
        "timestamp": (reference_time - timedelta(minutes=5)).isoformat(),
        "device": "mobile",
    },
]

# Explicit encoding so the file's bytes do not depend on the platform default.
with open(transactions_file, "w", encoding="utf-8") as f:
    json.dump(transactions_data, f)

# Run the sample file through the ingestor and the structured-data parser.
file_objects = file_ingestor.ingest_file(transactions_file, read_content=True)
parsed_data = structured_parser.parse_json(transactions_file)

# Build the working stream from the parsed records (falling back to the
# in-memory sample when the parser result has no "data" entry), turning ISO
# timestamp strings into datetime objects on a copy of each record.
transaction_stream = []
for record in parsed_data.get("data", transactions_data):
    if not isinstance(record, dict):
        continue  # only dict-shaped records are kept
    normalized = record.copy()
    raw_timestamp = normalized.get("timestamp")
    if isinstance(raw_timestamp, str):
        normalized["timestamp"] = datetime.fromisoformat(raw_timestamp)
    transaction_stream.append(normalized)

print(f"Ingested {len(file_objects)} transaction files")
print(f"Parsed {len(transaction_stream)} transactions")


## Step 2: Build Temporal Transaction Knowledge Graph

Build a temporal knowledge graph from transaction data.


In [None]:
builder = GraphBuilder()
ner_extractor = NERExtractor()
relation_extractor = RelationExtractor()

transaction_entities = []
relationships = []

# (type, id) pairs of the shared entities already emitted. Users, merchants
# and locations recur across transactions; without this guard each one would
# be appended once per transaction, producing duplicate entity records and an
# inflated entity count.
seen_entity_keys = set()


def _add_shared_entity(entity_id, entity_type):
    """Append a property-less entity unless the same (type, id) was added."""
    key = (entity_type, entity_id)
    if key in seen_entity_keys:
        return
    seen_entity_keys.add(key)
    transaction_entities.append({
        "id": entity_id,
        "type": entity_type,
        "name": entity_id,
        "properties": {},
    })


for txn in transaction_stream:
    txn_id = txn["transaction_id"]
    user_id = txn["user_id"]
    merchant = txn["merchant"]
    location = txn["location"]

    # Timestamps may be datetime objects or already-serialized strings;
    # normalize once and reuse for both the entity and the relationship.
    raw_timestamp = txn["timestamp"]
    timestamp_iso = (
        raw_timestamp.isoformat()
        if isinstance(raw_timestamp, datetime)
        else raw_timestamp
    )

    # Every transaction record yields its own Transaction entity.
    transaction_entities.append({
        "id": txn_id,
        "type": "Transaction",
        "name": txn_id,
        "properties": {
            "amount": txn["amount"],
            "timestamp": timestamp_iso,
            "device": txn["device"],
        },
    })

    _add_shared_entity(user_id, "User")
    _add_shared_entity(merchant, "Merchant")
    _add_shared_entity(location, "Location")

    # user -> transaction carries the time; the other two edges are untimed.
    relationships.append({
        "source": user_id,
        "target": txn_id,
        "type": "performed",
        "properties": {"timestamp": timestamp_iso},
    })
    relationships.append({
        "source": txn_id,
        "target": merchant,
        "type": "at_merchant",
        "properties": {},
    })
    relationships.append({
        "source": txn_id,
        "target": location,
        "type": "in_location",
        "properties": {},
    })

transaction_kg = builder.build(transaction_entities, relationships)

print(f"Built temporal knowledge graph with {len(transaction_entities)} entities and {len(relationships)} relationships")


## Step 3: Detect Fraud Patterns

Detect fraud patterns using temporal analysis and inference.


In [None]:
pattern_detector = TemporalPatternDetector()
graph_analyzer = GraphAnalyzer()
inference_engine = InferenceEngine()
rule_manager = RuleManager()
explanation_generator = ExplanationGenerator()

temporal_patterns = pattern_detector.detect_temporal_patterns(
    transaction_kg,
    pattern_type="sequence",
    min_frequency=2
)

connectivity_analysis = graph_analyzer.analyze_connectivity(transaction_kg)

# Group the stream by user so the heuristics below can look at each user's
# transactions together.
fraud_patterns = []
user_transactions = {}
for txn in transaction_stream:
    user_transactions.setdefault(txn["user_id"], []).append(txn)

for user_id, txns in user_transactions.items():
    # High-value check applies to every user, including those with a single
    # transaction: one large payment is just as noteworthy on its own.
    highest_amount = max(t["amount"] for t in txns)
    if highest_amount > 1000:
        fraud_patterns.append({
            "type": "high_value_transaction",
            "user_id": user_id,
            "amount": highest_amount,
            "severity": "medium"
        })

    # Rapid location change: more than two distinct locations inside one hour.
    # This needs at least three transactions, so no separate length guard is
    # required.
    distinct_locations = set(t["location"] for t in txns)
    if len(distinct_locations) > 2:
        timestamps = [
            t["timestamp"] if isinstance(t["timestamp"], datetime) else datetime.fromisoformat(t["timestamp"])
            for t in txns
        ]
        time_span = max(timestamps) - min(timestamps)
        if time_span.total_seconds() < 3600:
            fraud_patterns.append({
                "type": "rapid_location_change",
                "user_id": user_id,
                "locations": list(distinct_locations),
                "severity": "high"
            })

# Register the rule text, then add a fact for each transaction that meets the
# same condition before running forward chaining.
inference_engine.add_rule("IF transaction amount > 2000 AND device is mobile THEN high_risk")
for txn in transaction_stream:
    if txn["amount"] > 2000 and txn["device"] == "mobile":
        inference_engine.add_fact({"transaction_id": txn["transaction_id"], "risk": "high"})

inferred_risks = inference_engine.forward_chain()

print(f"Detected {len(fraud_patterns)} fraud patterns")
print(f"Temporal patterns: {len(temporal_patterns)}")
print(f"Inferred {len(inferred_risks)} risk assessments")


## Step 4: Anomaly Detection

Detect anomalous transactions using pattern analysis.


In [None]:
anomaly_patterns = pattern_detector.detect_temporal_patterns(
    transaction_kg,
    pattern_type="anomaly",
    min_frequency=1
)


def _as_datetime(value):
    """Return value unchanged if it is a datetime, else parse it as ISO text."""
    return value if isinstance(value, datetime) else datetime.fromisoformat(value)


# The location-hopping signal depends only on the user, not on the individual
# transaction, so compute it once per user instead of re-filtering and
# re-sorting the whole stream for every transaction.
txns_by_user = {}
for txn in transaction_stream:
    txns_by_user.setdefault(txn["user_id"], []).append(txn)

rapid_location_users = set()
for user_id, user_txns in txns_by_user.items():
    if len(user_txns) <= 1:
        continue
    # Look at the user's three most recent transactions only.
    recent_txns = sorted(
        user_txns,
        key=lambda t: _as_datetime(t["timestamp"]),
        reverse=True,
    )[:3]
    if len({t["location"] for t in recent_txns}) > 2:
        time_span = _as_datetime(recent_txns[0]["timestamp"]) - _as_datetime(recent_txns[-1]["timestamp"])
        if time_span.total_seconds() < 3600:
            rapid_location_users.add(user_id)

# Score each transaction; a total of 3 or more is reported as an anomaly and
# 5 or more is rated "high".
anomalies = []
for txn in transaction_stream:
    score = 0
    reasons = []

    if txn["amount"] > 2000:
        score += 3
        reasons.append("High transaction amount")

    if txn["amount"] > 1000 and txn["device"] == "mobile":
        score += 2
        reasons.append("High amount on mobile device")

    if txn["user_id"] in rapid_location_users:
        score += 4
        reasons.append("Rapid location changes")

    if score >= 3:
        anomalies.append({
            "transaction_id": txn["transaction_id"],
            "user_id": txn["user_id"],
            "severity": "high" if score >= 5 else "medium",
            "score": score,
            "reasons": reasons,
            "timestamp": txn["timestamp"].isoformat() if isinstance(txn["timestamp"], datetime) else txn["timestamp"]
        })

print(f"Detected {len(anomalies)} anomalies")
for anomaly in anomalies:
    print(f"  Transaction: {anomaly['transaction_id']} - Severity: {anomaly['severity']} - Score: {anomaly['score']}")


## Step 5: Generate Alerts and Reports

Generate fraud alerts and reports.


In [None]:
json_exporter = JSONExporter()
csv_exporter = CSVExporter()
report_generator = ReportGenerator()

# Write the graph export under its own file name. "transactions.json" in
# temp_dir is the raw input file created in Step 1; exporting to that same
# path would overwrite the source data.
kg_export_path = os.path.join(temp_dir, "transaction_kg.json")
entities_export_path = os.path.join(temp_dir, "entities.csv")

json_exporter.export_knowledge_graph(transaction_kg, kg_export_path)
csv_exporter.export_entities(transaction_entities, entities_export_path)

# Headline counts handed to the report generator.
report_data = {
    "summary": f"Fraud detection analysis identified {len(anomalies)} suspicious transactions",
    "fraud_patterns": len(fraud_patterns),
    "anomalies": len(anomalies),
    "transactions_analyzed": len(transaction_stream)
}

report = report_generator.generate_report(report_data, format="markdown")

print("Generated fraud detection report")
print(f"Report length: {len(report)} characters")


## Step 6: Quality Assessment and Visualization

Assess graph quality and visualize results.


In [None]:
# Quality components; the assessor scores the graph built earlier.
quality_assessor = KGQualityAssessor()
automated_fixer = AutomatedFixer()

quality_score = quality_assessor.assess_overall_quality(transaction_kg)

# One visualizer per view of the graph, all rendered in the same output mode.
kg_visualizer = KGVisualizer()
temporal_visualizer = TemporalVisualizer()
analytics_visualizer = AnalyticsVisualizer()

viz_output = "interactive"
kg_viz = kg_visualizer.visualize_network(transaction_kg, output=viz_output)
temporal_viz = temporal_visualizer.visualize_timeline(transaction_kg, output=viz_output)
analytics_viz = analytics_visualizer.visualize_analytics(transaction_kg, output=viz_output)

# Final summary; a missing "overall_score" entry is reported as 0.
overall_score = quality_score.get("overall_score", 0)
summary_lines = (
    f"Graph quality score: {overall_score:.3f}",
    "Generated visualizations for knowledge graph, temporal patterns, and analytics",
    "Total modules used: 20+",
    "Pipeline complete: Transaction Stream → Parse → Extract → Temporal KG → Pattern Detection → Anomaly Detection → Reports → Visualization",
)
for summary_line in summary_lines:
    print(summary_line)
