In [12]:
# Classical Persistent Homology on Network Intrusion Data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gudhi as gd
import networkx as nx
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix
from tqdm import tqdm

# ------------------------------
# 1. Load Dataset
# ------------------------------

def load_and_preprocess_data(csv_file, features, timestamp_col='timestamp'):
    """Load and preprocess the CSV file."""
    # Read CSV using pandas directly for easier debugging and speed
    df = pd.read_csv(
        csv_file,
        usecols=features,
        dtype={col: 'float64' for col in features if col != timestamp_col}
    )
    
    # Parse timestamp
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df = df.dropna(subset=[timestamp_col])
    df = df.set_index(timestamp_col)
    
    return df

# ------------------------------
# 2. Feature Selection & Scaling
# ------------------------------

def extract_point_cloud(df, feature_cols):
    """Extract and scale features for point cloud."""
    point_data = df[feature_cols].fillna(0).values
    scaler = StandardScaler()
    point_data_scaled = scaler.fit_transform(point_data)
    return point_data_scaled

# ------------------------------
# 3. Build Rips Complex & Persistence
# ------------------------------

def compute_persistent_homology(point_data_scaled, max_dimension=2, max_edge_length=3.0):
    """Build Rips Complex and compute persistent homology."""
    rips_complex = gd.RipsComplex(points=point_data_scaled, max_edge_length=max_edge_length)
    simplex_tree = rips_complex.create_simplex_tree(max_dimension=max_dimension)
    simplex_tree.compute_persistence()
    return simplex_tree

def plot_persistence(simplex_tree):
    """Plot the persistence diagram and barcode."""
    gd.plot_persistence_diagram(simplex_tree.persistence())
    plt.title("Persistence Diagram")
    plt.show()
    
    gd.plot_persistence_barcode(simplex_tree.persistence())
    plt.title("Persistence Barcode")
    plt.show()

# ------------------------------
# 4. Betti Numbers Extraction
# ------------------------------

def extract_betti_numbers(simplex_tree):
    """Extract Betti numbers at max filtration scale."""
    betti_nums = simplex_tree.betti_numbers()
    return betti_nums

# ------------------------------
# 5. Graph Construction
# ------------------------------

def build_graph_from_point_cloud(point_data_scaled, threshold=1.5):
    """Build a graph from point cloud using distance threshold."""
    dist_mat = distance_matrix(point_data_scaled, point_data_scaled)
    G = nx.Graph()
    n_points = len(point_data_scaled)
    G.add_nodes_from(range(n_points))
    
    # Efficient vectorized thresholding
    rows, cols = np.where(np.triu(dist_mat, k=1) <= threshold)
    edges = [(int(i), int(j), {'weight': dist_mat[i, j]}) for i, j in zip(rows, cols)]
    G.add_edges_from(edges)
    
    return G

# ------------------------------
# 6. Visualization of Graph Metrics and Betti Numbers
# ------------------------------

def visualize_graph_metrics(betti_nums, G):
    """Visualize Betti numbers and graph metrics."""
    degrees = np.array([deg for _, deg in G.degree()])
    max_degree = degrees.max() if len(degrees) > 0 else 0
    num_edges = G.number_of_edges()
    
    # Store in DataFrame
    summary_df = pd.DataFrame({
        'beta_0': [betti_nums[0] if len(betti_nums) > 0 else 0],
        'beta_1': [betti_nums[1] if len(betti_nums) > 1 else 0],
        'beta_2': [betti_nums[2] if len(betti_nums) > 2 else 0],
        'max_degree': [max_degree],
        'num_edges': [num_edges]
    })
    
    # Normalize
    df_plot = summary_df.copy()
    for col in df_plot.columns:
        col_min, col_max = df_plot[col].min(), df_plot[col].max()
        if col_max != col_min:
            df_plot[col] = (df_plot[col] - col_min) / (col_max - col_min)
        else:
            df_plot[col] = 0.0
    
    df_plot.plot(figsize=(12, 6), title="Normalized Betti Numbers and Graph Metrics")
    plt.ylabel("Normalized Value")
    plt.xlabel("Sample")
    plt.grid(True)
    plt.show()
    
    return summary_df

# ------------------------------
# 7. Main Pipeline
# ------------------------------

def main():
    csv_file = "mergedBFXSS.csv"
    features = [
        'timestamp',
        'duration',
        'total_payload_bytes',
        'packets_count',
        'fwd_packets_count',
        'bwd_packets_count',
        'bytes_rate'
    ]
    
    # Load and preprocess data
    df = load_and_preprocess_data(csv_file, features)
    print(f"Data shape after preprocessing: {df.shape}")
    print(df.head())
    
    # Extract and scale point cloud
    point_data_scaled = extract_point_cloud(df, features[1:])
    print(f"Point cloud shape: {point_data_scaled.shape}")
    
    # Build Rips Complex and compute PH
    print("Building Rips Complex and computing persistence...")
    simplex_tree = compute_persistent_homology(point_data_scaled)
    print(f"Number of simplices: {simplex_tree.num_simplices()}")
    
    # Plot persistence diagrams
    plot_persistence(simplex_tree)
    
    # Extract Betti numbers
    betti_nums = extract_betti_numbers(simplex_tree)
    print(f"Betti numbers: {betti_nums}")
    
    # Build Graph
    print("Building Graph from point cloud...")
    G = build_graph_from_point_cloud(point_data_scaled, threshold=1.5)
    print(f"Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
    
    # Visualize metrics
    summary_df = visualize_graph_metrics(betti_nums, G)
    print("\nSummary:")
    print(summary_df)

if __name__ == "__main__":
    main()


Data shape after preprocessing: (4463753, 6)
                            duration  packets_count  fwd_packets_count  \
timestamp                                                                
2018-02-23 07:15:39.588247  0.457770           18.0               10.0   
2018-02-23 07:15:41.110103  0.160555           20.0               11.0   
2018-02-23 07:15:41.111346  0.159061           20.0               11.0   
2018-02-23 07:15:41.111651  0.158567           19.0               10.0   
2018-02-23 07:15:41.111177  0.159373           20.0               11.0   

                            bwd_packets_count  total_payload_bytes  \
timestamp                                                            
2018-02-23 07:15:39.588247                8.0               4364.0   
2018-02-23 07:15:41.110103                9.0               8099.0   
2018-02-23 07:15:41.111346                9.0               8099.0   
2018-02-23 07:15:41.111651                9.0               8099.0   
2018-02-23 07:15

: 

: 

: 

In [3]:
# Classical Persistent Homology on Network Intrusion Data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gudhi as gd
import networkx as nx
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix
from tqdm import tqdm
import dask.dataframe as dd 



In [11]:
# ------------------------------
# 1. Load Dataset
# ------------------------------

features = [
    'timestamp',
    'duration',
    'total_payload_bytes',
    'packets_count',
    'fwd_packets_count',
    'bwd_packets_count',
    'bytes_rate'
]

# Load with Dask
ddf = dd.read_csv(
    "mergedBFXSS.csv",
    usecols=features,
    assume_missing=True,
    dtype={
        'timestamp': 'object',
        'duration': 'float64',
        'total_payload_bytes': 'float64',
        'packets_count': 'float64',
        'fwd_packets_count': 'float64',
        'bwd_packets_count': 'float64',
        'bytes_rate': 'float64'
    }
)

# Convert Dask to Pandas **before** timestamp parsing
print("Converting to Pandas DataFrame...")
df = ddf.compute()

# Parse timestamp and drop NaT rows
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=['timestamp'])

# Set timestamp as index
df = df.set_index("timestamp")

print(df.head())

Converting to Pandas DataFrame...
                            duration  packets_count  fwd_packets_count  \
timestamp                                                                
2018-02-23 07:15:39.588247  0.457770           18.0               10.0   
2018-02-23 07:15:41.110103  0.160555           20.0               11.0   
2018-02-23 07:15:41.111346  0.159061           20.0               11.0   
2018-02-23 07:15:41.111651  0.158567           19.0               10.0   
2018-02-23 07:15:41.111177  0.159373           20.0               11.0   

                            bwd_packets_count  total_payload_bytes  \
timestamp                                                            
2018-02-23 07:15:39.588247                8.0               4364.0   
2018-02-23 07:15:41.110103                9.0               8099.0   
2018-02-23 07:15:41.111346                9.0               8099.0   
2018-02-23 07:15:41.111651                9.0               8099.0   
2018-02-23 07:15:41.111177 

In [10]:
# Parse timestamp and drop rows with NaT
ddf["timestamp"] = dd.to_datetime(ddf["timestamp"], errors="coerce")
ddf = ddf.dropna(subset=['timestamp'])
ddf = ddf.set_index("timestamp")

# Trigger computation
df = ddf.compute()

print(df.head())



KeyError: 'timestamp'

In [None]:
# ------------------------------
# 2. Feature Selection
# ------------------------------

# Extract only numeric features (skip 'timestamp' index)
point_data = df[features[1:]].fillna(0).values

# Normalize features
scaler = StandardScaler()
point_data_scaled = scaler.fit_transform(point_data)

print(f"Shape of point cloud: {point_data_scaled.shape}")

# ------------------------------
# 3. Build Rips Complex & Filtration
# ------------------------------

max_dimension = 2
max_edge_length = 3.0  # adjust as needed

print("Building Rips Complex...")
rips_complex = gd.RipsComplex(points=point_data_scaled, max_edge_length=max_edge_length)
simplex_tree = rips_complex.create_simplex_tree(max_dimension=max_dimension)

print(f"Number of simplices: {simplex_tree.num_simplices()}")

# ------------------------------
# 4. Compute Persistence Diagram
# ------------------------------

print("Computing Persistence...")
with tqdm(total=1) as pbar:
    persistence = simplex_tree.persistence()
    pbar.update(1)

gd.plot_persistence_diagram(persistence)
plt.title("Persistence Diagram")
plt.show()

gd.plot_persistence_barcode(persistence)
plt.title("Persistence Barcode")
plt.show()

# ------------------------------
# 5. Extract Betti Numbers
# ------------------------------

betti_nums = simplex_tree.betti_numbers()
print(f"Betti numbers at max filtration scale: {betti_nums}")

# ------------------------------
# 6. Build Graph (for Network Intrusion Detection)
# ------------------------------

print("Building Graph from point cloud...")
graph_threshold = 1.5  # adjust as needed
G = nx.Graph()
dist_mat = distance_matrix(point_data_scaled, point_data_scaled)
for i in tqdm(range(len(point_data_scaled)), desc="Adding edges"):
    for j in range(i+1, len(point_data_scaled)):
        dist = dist_mat[i, j]
        if dist <= graph_threshold:
            G.add_edge(i, j, weight=dist)

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# ------------------------------
# 7. Visualize Graph Metrics and Betti Numbers
# ------------------------------

degrees = np.array([deg for _, deg in G.degree()])
max_degree = degrees.max()
num_edges = G.number_of_edges()

summary_df = pd.DataFrame({
    'beta_0': [betti_nums[0] if len(betti_nums) > 0 else 0],
    'beta_1': [betti_nums[1] if len(betti_nums) > 1 else 0],
    'beta_2': [betti_nums[2] if len(betti_nums) > 2 else 0],
    'max_degree': [max_degree],
    'num_edges': [num_edges]
})

# Normalize for visualization
df_plot = summary_df.copy()
for col in ['beta_0', 'beta_1', 'beta_2', 'max_degree', 'num_edges']:
    df_plot[col] = (df_plot[col] - df_plot[col].min()) / (df_plot[col].max() - df_plot[col].min() + 1e-9)

df_plot.plot(figsize=(12, 6), title="Normalized Betti Numbers and Graph Metrics")
plt.ylabel("Normalized Value")
plt.xlabel("Sample")
plt.grid(True)
plt.show()

# ------------------------------
# 8. Summary
# ------------------------------

print("\nSummary:")
print(f"Betti_0 (Connected Components): {betti_nums[0] if len(betti_nums) > 0 else 'N/A'}")
print(f"Betti_1 (1-dimensional Holes): {betti_nums[1] if len(betti_nums) > 1 else 'N/A'}")
print(f"Betti_2 (2-dimensional Holes): {betti_nums[2] if len(betti_nums) > 2 else 'N/A'}")
print(f"Max Degree in Graph: {max_degree}")
print(f"Number of Edges in Graph: {num_edges}")