# GNN and Visualization Integration Pipeline

**No Neo4j required!** This notebook works directly with CSV files.

## What This Notebook Does:
1. **Load Data** → Build graph from CSV files (Drug, Protein, Pathway, PPI)
2. **Visualize** → Force-directed layout, community detection
3. **Train GNN** → DVGAE generates node embeddings
4. **Predict Links** → Find novel drug-gene-disease associations
5. **Interpret** → Trace paths to explain predictions


In [None]:
# Setup - Run this first!
# Standard-library imports come first so that warnings are silenced before
# the heavier third-party packages are imported.
import sys
import warnings
from pathlib import Path

# Suppress all warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Add src to path. The membership check keeps re-runs of this cell from
# stacking duplicate entries onto sys.path.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Import our modules (resolved through the src entry added above)
from visualization.graph_visualizer import GraphVisualizer
from pipeline.integrated_pipeline import CardioKGPipeline, PipelineConfig

print("✅ Setup complete!")


In [None]:
# Step 1: Initialize Pipeline and Load Graph (NO NEO4J NEEDED!)
# Paths are relative to the notebook's working directory.
config = PipelineConfig(
    raw_data_dir="../data/raw",
    output_dir="../output",
    embedding_dim=32,  # Smaller for faster training
    hidden_dim=64,
    epochs=50,
    encoder_type='gcn'
)

pipeline = CardioKGPipeline(config)

# Load graph from CSV files, pointing the loader at the BioGRID PPI export
G = pipeline.load_graph_from_csv(
    ppi_path="../data/raw/biogrid_ppi_human.csv"
)

# Show statistics reported by the pipeline's visualizer
stats = pipeline.visualizer.get_graph_statistics()
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n📊 Knowledge Graph Built!")
print(f"   Nodes: {stats['num_nodes']:,}")
print(f"   Edges: {stats['num_edges']:,}")
print(f"\n   Node types: {stats['node_type_distribution']}")
print(f"   Edge types: {stats['edge_type_distribution']}")


In [None]:
# Step 2: Visualize Graph Structure
print("🔍 Computing force-directed layout...")
# `pos` is reused by the plotting cell that follows.
pos = pipeline.visualizer.compute_force_directed_layout(algorithm='spring', iterations=50)

# Detect communities
communities = pipeline.visualizer.detect_communities()
# The distinct values of the mapping are counted as the communities.
print(f"   Found {len(set(communities.values()))} communities")

# Find hub nodes (ten requested, only the first five are printed)
hubs = pipeline.visualizer.identify_hub_nodes(metric='degree', top_k=10)
print("\n🔗 Top Hub Nodes:")
for node, score in hubs[:5]:
    # Fall back to 'Unknown' when the pipeline has no recorded type for a node.
    node_type = pipeline.node_types.get(node, 'Unknown')
    print(f"   {node} ({node_type}): degree={score}")


In [None]:
# Step 3: Plot the Knowledge Graph
# Draws the graph using the layout positions held in `pos`; the options ask
# for colouring by node type, sizing by degree, no labels and faint edges.
pipeline.visualizer.plot_graph(
    pos=pos,
    title='Cardiotoxicity Knowledge Graph',
    figsize=(12, 9),
    color_by='node_type',
    node_size_by='degree',
    edge_alpha=0.1,
    show_labels=False,
)


In [None]:
# Step 4: Train GNN (DVGAE) for Node Embeddings
print("🧠 Training DVGAE... (this may take a few minutes)")
# NOTE(review): epochs=50 repeats the value given to PipelineConfig above;
# keep the two in sync when tuning.
embedding_results = pipeline.train_embeddings(epochs=50)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n✅ Training Complete!")
print(f"   Best Validation AUC: {embedding_results['best_val_auc']:.4f}")
print(f"   Test AUC: {embedding_results['test_auc']:.4f}")
print(f"   Test AP: {embedding_results['test_ap']:.4f}")


In [None]:
# Step 5: Train Link Predictor and Make Predictions
print("🎯 Training link predictor...")
link_results = pipeline.train_link_predictor()
print(f"   Link Prediction AUC: {link_results['auc']:.4f}")

# Predict novel Drug → Protein associations
predictions = pipeline.predict_links(
    source_type='Drug',
    target_type='Protein',
    top_k=10,
    min_probability=0.5
)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n💊 Top Predicted Drug → Protein Associations:")
# Only the first five of the requested predictions are printed, ranked from 1.
for i, pred in enumerate(predictions[:5], 1):
    print(f"   {i}. {pred.source} → {pred.target}")
    print(f"      Probability: {pred.probability:.3f} | Confidence: {pred.confidence}")


In [None]:
# Step 6: Explain a Prediction with Path Tracing
if predictions:
    # The first prediction is the one explained.
    top_pred = predictions[0]
    print(f"🔍 Explaining: {top_pred.source} → {top_pred.target}\n")
    print(f"Mechanism: {top_pred.mechanism}")

    if top_pred.paths:
        # map(str, ...) keeps the join from raising TypeError if any node
        # identifier in the path is not a string.
        print(f"\nPath: {' → '.join(map(str, top_pred.paths[0]))}")
        pipeline.visualizer.visualize_path(top_pred.paths[0])
else:
    print("No predictions above threshold found.")


# GNN and Visualization Integration Pipeline

## Cardiotoxicity Knowledge Graph Analysis

This notebook demonstrates the integrated workflow for analyzing cardiotoxicity using:

1. **Visualization Phase**: Force-directed layouts reveal graph structure
2. **GNN Phase**: DVGAE generates node embeddings via message passing
3. **Prediction Phase**: Link prediction for novel drug-gene-disease associations
4. **Interpretation Phase**: Path tracing explains predictions

Based on the CardioKG methodology from Imperial College London.


In [1]:
# Setup and imports
# Standard-library imports come first so that warnings are silenced before
# the heavier third-party packages are imported.
import sys
import warnings
from pathlib import Path

# Suppress all warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path. The membership check keeps re-runs of this cell from
# stacking duplicate entries onto sys.path.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Our modules (resolved through the src entry added above)
from visualization.graph_visualizer import GraphVisualizer
from models.dvgae import train_dvgae_on_graph, networkx_to_pyg
from pipeline.integrated_pipeline import CardioKGPipeline, PipelineConfig

# Set style
plt.style.use('dark_background')
sns.set_palette('husl')

print("✅ Imports successful!")


‚úÖ Imports successful!


## Step 1: Download Datasets

Download the protein-protein interaction (PPI) dataset from Kaggle using `kagglehub`.


In [2]:
# Download Kaggle PPI dataset
import os

# Stays None when kagglehub is unavailable, so the name is always defined.
path = None

# Only the import is guarded: an ImportError raised inside the download itself
# should surface as-is rather than be reported as "kagglehub not installed".
try:
    import kagglehub
except ImportError:
    print("⚠️ kagglehub not installed. Install with: pip install kagglehub")
else:
    # Download protein-protein interactions
    path = kagglehub.dataset_download("alexandervc/protein-protein-interactions")
    print(f"✅ Downloaded Kaggle PPI to: {path}")

    # List files (sorted, because os.listdir returns entries in arbitrary order)
    for filename in sorted(os.listdir(path)):
        print(f"  - {filename}")


‚úÖ Downloaded Kaggle PPI to: /Users/rishi/.cache/kagglehub/datasets/alexandervc/protein-protein-interactions/versions/4
  - Ecoli gene regulatory networks are inconsistent with gene expression data.pdf
  - Network biology.pdf
  - BIOGRID-ALL-4.3.195.tab3
  - BIOGRID-CHEMICALS-4.3.195.chemtab


## Step 2: Build Knowledge Graph and Run Full Pipeline


In [3]:
# Initialize pipeline with configuration
# Paths are relative to the notebook's working directory.
config = PipelineConfig(
    raw_data_dir="../data/raw",
    output_dir="../output",
    embedding_dim=64,
    hidden_dim=128,
    epochs=100,  # Reduce for faster testing
    encoder_type='gcn'
)

pipeline = CardioKGPipeline(config)

# Load graph from CSV files (no explicit paths are passed to the loader here)
G = pipeline.load_graph_from_csv()

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n📊 Graph Statistics:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")


INFO:pipeline.integrated_pipeline:Pipeline initialized (device: cpu)
INFO:pipeline.integrated_pipeline:Building graph from CSV files...
INFO:pipeline.integrated_pipeline:Loading 25670 drug-protein relationships...
INFO:pipeline.integrated_pipeline:Loading 615 protein-pathway relationships...
INFO:pipeline.integrated_pipeline:Built graph: 14492 nodes, 26257 edges



üìä Graph Statistics:
  Nodes: 14492
  Edges: 26257


## Phase 1: Visualization - Structural Analysis

Use force-directed layouts to understand graph structure.


In [None]:
# Run the pipeline's structural analysis; save_plots=False is passed so the
# call is not asked to save plots.
structure_results = pipeline.analyze_structure(save_plots=False)

# Compute a spring layout and keep the positions for plotting
pos = pipeline.visualizer.compute_force_directed_layout(
    algorithm='spring',
    iterations=100,
)

# Render the graph with those positions
pipeline.visualizer.plot_graph(
    pos=pos,
    title='Cardiotoxicity Knowledge Graph',
    figsize=(14, 10),
    color_by='node_type',
    node_size_by='degree',
    show_labels=False,
)


INFO:pipeline.integrated_pipeline:üîç Phase 2: Structural Analysis...
INFO:visualization.graph_visualizer:Computing spring layout...


## Phase 2: GNN Embedding with DVGAE

Train Deep Variational Graph Autoencoder for node embeddings.


In [None]:
# Train DVGAE embeddings
print("🧠 Training DVGAE for node embeddings...")
# NOTE(review): epochs=100 repeats the value given to PipelineConfig above;
# keep the two in sync when tuning.
embedding_results = pipeline.train_embeddings(epochs=100)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n✅ Training complete!")
print(f"  Best Validation AUC: {embedding_results['best_val_auc']:.4f}")
print(f"  Test AUC: {embedding_results['test_auc']:.4f}")


## Phase 3: Link Prediction

Train classifier and predict novel drug-gene-disease associations.


In [None]:
# Train link predictor
link_results = pipeline.train_link_predictor()
print(f"\n🎯 Link Prediction AUC: {link_results['auc']:.4f}")

# Predict novel associations
predictions = pipeline.predict_links(
    source_type='Drug',
    target_type='Protein',
    top_k=10,
    min_probability=0.5
)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\n💊 Top Predicted Drug → Protein Associations:")
# Only the first five of the requested predictions are printed, ranked from 1.
for i, pred in enumerate(predictions[:5], 1):
    print(f"{i}. {pred.source} → {pred.target} (prob: {pred.probability:.3f})")


## Phase 4: Interpretation - Compare Layouts and Trace Paths


In [None]:
# Compare force-directed layout with GNN embeddings
pipeline.visualizer.compare_layouts(embeddings=pipeline.embeddings)

# Explain top prediction with path tracing
if predictions:
    # The first prediction is the one explained.
    top_pred = predictions[0]
    print(f"\n🔍 Explaining: {top_pred.source} → {top_pred.target}")
    print(f"Mechanism: {top_pred.mechanism}")

    if top_pred.paths:
        pipeline.visualizer.visualize_path(top_pred.paths[0])
else:
    # Report the empty result explicitly instead of ending the cell silently.
    print("No predictions above threshold found.")
