# Raw data exploration

This notebook previews files under `data/raw/` (opened as `./raw`, relative to the notebook's working directory):
- `edges.csv`: edge list with optional `weight`, `type`.
- `celegansneural.mtx`: sparse adjacency (Matrix Market).

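We will list the files, inspect the head and tail of the edge list, compute basic summaries, and sample nonzero entries from the matrix. We then cross-check the two files against each other and build a directed graph with 2D node positions, saved under `processed/V4/`.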


In [5]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy import io as spio
from scipy import sparse

# Raw inputs live in ./raw, resolved relative to the notebook's working directory.
raw_dir = Path('./raw')
# Show whether the directory exists plus at most 10 of its entries.
# Path.glob yields entries in no guaranteed order, so sort first to keep the
# listing (and which entries survive the [:10] cut) reproducible across runs.
raw_dir.exists(), sorted(raw_dir.glob('*'))[:10]


(True,
 [PosixPath('raw/celegansneural.mtx'),
  PosixPath('raw/celegansneural.zip'),
  PosixPath('raw/edges.csv')])

In [6]:
# Load the edge list (when it is on disk) and show a quick overview of it
edges_path = raw_dir / 'edges.csv'
if not edges_path.exists():
    print('edges.csv not found at', edges_path)
else:
    df_edges = pd.read_csv(edges_path)
    # First ten rows
    first_rows = df_edges.head(10)
    display(first_rows)
    # Last five rows
    last_rows = df_edges.tail(5)
    display(last_rows)
    # Summary statistics for every column, one column per output row
    per_column_summary = df_edges.describe(include='all').T
    display(per_column_summary)
    print('Columns:', df_edges.columns.tolist())
    print('Num rows:', df_edges.shape[0])


Unnamed: 0,source,target,weight,type
0,1,10,4.0,chemical
1,1,2,1.0,chemical
2,1,3,2.0,chemical
3,1,4,1.0,chemical
4,1,5,2.0,chemical
5,1,6,1.0,chemical
6,1,7,6.0,chemical
7,1,8,6.0,chemical
8,1,9,1.0,chemical
9,10,104,1.0,chemical


Unnamed: 0,source,target,weight,type
2340,99,208,1.0,chemical
2341,99,3,2.0,chemical
2342,99,4,4.0,chemical
2343,99,85,4.0,chemical
2344,99,87,1.0,chemical


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
source,2345.0,,,,129.718124,78.776953,1.0,64.0,126.0,199.0,297.0
target,2345.0,,,,114.156077,77.729822,1.0,45.0,112.0,178.0,284.0
weight,2345.0,,,,3.760768,5.484495,1.0,1.0,2.0,4.0,70.0
type,2345.0,1.0,chemical,2345.0,,,,,,,


Columns: ['source', 'target', 'weight', 'type']
Num rows: 2345


In [None]:
# Preview Matrix Market file if present:
# load it, report its type / shape / number of stored entries, and show a
# small random sample of (row, col, value) triples.
mtx_path = raw_dir / 'celegansneural.mtx'
if mtx_path.exists():
    M = spio.mmread(str(mtx_path))  # may be sparse
    print('Matrix type:', type(M))
    # COO format exposes aligned .row / .col / .data arrays, one slot per stored entry
    A = sparse.coo_matrix(M)
    print('Shape:', A.shape, 'nnz:', A.nnz)
    # Sample up to 10 stored entries without replacement. A fixed seed keeps
    # the preview identical across Restart & Run All.
    n_samples = min(10, A.nnz)
    rng = np.random.default_rng(42)
    if n_samples > 0:
        idx = rng.choice(A.nnz, size=n_samples, replace=False)
    else:
        # An empty integer index keeps the lookups below valid for an empty matrix
        idx = np.array([], dtype=int)
    rows = A.row[idx]
    cols = A.col[idx]
    vals = A.data[idx]
    # 'value' is the stored matrix entry at (row, col)
    df_nz = pd.DataFrame({'row': rows, 'col': cols, 'value': vals})
    display(df_nz)
else:
    print('Matrix file not found at', mtx_path)


Matrix type: <class 'scipy.sparse._coo.coo_matrix'>
Shape: (297, 297) nnz: 2345


Unnamed: 0,row,col,value
0,240,58,1
1,17,53,1
2,4,197,1
3,224,129,1
4,57,61,3
5,90,187,2
6,145,195,1
7,137,185,2
8,72,132,4
9,2,176,14


In [8]:
# Cross-check the edge list against the matrix, but only when both inputs are on disk
both_inputs_present = edges_path.exists() and mtx_path.exists()
if both_inputs_present:
    print('edges.csv rows:', len(df_edges))
    print('mtx shape:', A.shape, 'nnz:', A.nnz)
    # When the endpoint columns are available, peek at the first few distinct labels
    has_endpoint_columns = ('source' in df_edges) and ('target' in df_edges)
    if has_endpoint_columns:
        source_labels = df_edges['source'].astype(str)
        target_labels = df_edges['target'].astype(str)
        srcs = source_labels.unique()[:5]
        tars = target_labels.unique()[:5]
        print('Sample sources:', srcs)
        print('Sample targets:', tars)


edges.csv rows: 2345
mtx shape: (297, 297) nnz: 2345
Sample sources: ['1' '10' '100' '101' '102']
Sample targets: ['10' '2' '3' '4' '5']


In [10]:
# Build V4: directed, unweighted with positions compatible with e3_nav
#
# Steps: read edges.csv, keep chemical edges, build a DiGraph with string node
# ids and a constant weight of 1.0, attach a seeded 2D spring layout as the
# 'pos' node attribute, then pickle the graph under processed/V4/.
import pickle
from pathlib import Path

import networkx as nx

out_dir = Path('processed/V4'); out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'graph_V4_directed_unweighted.gpickle'

# Load directed edges (chemical), unweighted
if edges_path.exists():
    df = pd.read_csv(edges_path)
    if 'type' in df.columns:
        df = df[df['type'].astype(str).str.lower().eq('chemical')]
    # Convert endpoints to strings column-wise. Row-wise iteration (iterrows)
    # upcasts all-numeric rows to float, which would turn id 1 into '1.0'
    # whenever the optional 'type' column is missing.
    sources = df['source'].astype(str)
    targets = df['target'].astype(str)
    not_self_loop = sources != targets  # self-loops are dropped
    # Build DiGraph; every edge carries the same weight, i.e. it is unweighted
    G = nx.DiGraph()
    G.add_edges_from(zip(sources[not_self_loop], targets[not_self_loop]), weight=1.0)
    # Compute 2D layout (deterministic thanks to the fixed seed)
    if G.number_of_nodes() > 0:
        pos = nx.spring_layout(G.to_undirected(), seed=42)
        # attach as 'pos' tuples of plain Python floats
        for n, (x, y) in pos.items():
            G.nodes[n]['pos'] = (float(x), float(y))
    # Save as gpickle. networkx 3.0 removed write_gpickle, which was a thin
    # wrapper around pickle.dump, so call pickle directly. This works on every
    # networkx version and no longer hides real write errors behind a broad
    # `except Exception` fallback.
    with open(out_path, 'wb') as f:
        pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved V4 graph to', out_path)
    # Show sample node positions
    sample_nodes = list(G.nodes())[:5]
    sample_pos = {n: G.nodes[n].get('pos') for n in sample_nodes}
    print('Sample positions:', sample_pos)
else:
    print('edges.csv not found; cannot build V4.')


Saved V4 graph to processed/V4/graph_V4_directed_unweighted.gpickle
Sample positions: {'1': (-0.1489739613120049, -0.32148940601113635), '10': (-0.16825360047155574, -0.2548834961354343), '2': (-0.04666693105236676, -0.33582233988456733), '3': (0.11854226229609921, 0.03416426111511704), '4': (-0.09095418744126506, 0.022751669428587563)}
