In [2]:
import json, math
from collections import defaultdict
import numpy as np
import pandas as pd
import community as community_louvain
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Path to the whitespace-delimited input table; each row is expected to
# carry at least three numeric columns (i, j, v).
INPUT_PATH = "MAPQGE30/chr1_1mb.RAWobserved"

# ndmin=2 guarantees a 2-D result. Without it, a file holding a single row
# comes back as a 1-D array and the shape[1] check below raises IndexError
# instead of the intended ValueError.
data = np.loadtxt(INPUT_PATH, ndmin=2)
if data.shape[1] < 3:
    raise ValueError("Expected 3 columns (i, j, v).")

In [5]:
# Keep only the rows whose third column (v) is at or above the 90th
# percentile of that column.
col_v = data[:, 2]
thresh = np.percentile(col_v, 90)

# Row positions that pass the cut, then the corresponding rows.
filtered_indices = np.flatnonzero(col_v >= thresh)
filtered = data[filtered_indices]
len(filtered)

2657

In [7]:
# Split the filtered table into its columns: the first two become integer
# endpoint ids, the third stays a float value.
i, j = (filtered[:, col].astype(int) for col in (0, 1))
w = filtered[:, 2].astype(float)
len(i), len(j), len(w)

(2657, 2657, 2657)

In [8]:
# Adjacency list: node id -> list of (neighbour id, value) pairs.
nodes = defaultdict(list)
for a, b, wt in zip(i, j, w):
    if a != b:  # rows with identical endpoints (self edges) are skipped
        # Record the row under both endpoints so the structure is symmetric.
        for src, dst in ((a, b), (b, a)):
            nodes[src].append((dst, wt))
len(nodes)  # number of distinct nodes that have at least one neighbour

227

In [9]:
# Collapse the symmetric adjacency list into a set of undirected edges.
# Each edge is stored once as (smaller id, larger id); the value attached to
# the adjacency entry is not carried over.
edges = set()
for source, lst in nodes.items():
    for target, wt in lst:
        u, v = sorted((source, target))
        edges.add((u, v))
len(edges)

2429

In [10]:
# Sorted list of every node id that appears in at least one edge.
# NOTE: this rebinds `nodes` from the adjacency dict built earlier to a plain
# list, so the edge-building cell (which calls nodes.items()) cannot be
# re-run after this one without first rebuilding the adjacency dict.
nodes = sorted(set().union(*edges))
len(nodes)

227

In [11]:
# Undirected graph over the unique edges. Only the endpoint pairs are added,
# so no weight attribute is set on any edge.
G = nx.Graph()
G.add_edges_from(edges)

In [12]:
# Louvain visits nodes in a randomized order, so an unseeded call can return
# different communities on every run. Pinning random_state makes the result
# reproducible under Restart & Run All.
LOUVAIN_SEED = 42
partition = community_louvain.best_partition(G, random_state=LOUVAIN_SEED)
len(partition)  # dict mapping each node to its community id

227

In [13]:
# Node-link structure for export: one record per node (id + community id)
# and one record per undirected edge. int() converts the NumPy integer ids
# into built-in ints, which the json module can serialize. Nodes missing
# from `partition` fall back to community -1.
d3 = {
    "nodes": [{"id": int(n), "community": int(partition.get(n, -1))} for n in nodes],
    "links": [{"source": int(u), "target": int(v)} for (u, v) in sorted(edges)],
}

# Plain string literal: the previous f-prefix had no placeholders to fill.
json_out = "percentile.json"
with open(json_out, "w") as f:
    json.dump(d3, f)