In [86]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

## Step 1
Reshape the original dataset from wide to long format, keep only the non-zero counts, and then aggregate by `sampleID` so that each sample carries a comma-separated list of the microbes detected in it.

In [87]:
# Load the abundance table from south.csv (path is relative to the working directory).
data = pd.read_csv("south.csv")

In [88]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,sampleID,region,site,g_Methanobacterium,f_Rice_Cluster_II,c_Bathyarchaeia,g_RBG-16-58-14,c_Subgroup_17,g_Methanosaeta,g_Methanoregula,...,f_Caedibacteraceae,f_Halanaerobiaceae,g_Candidatus_Actinomarina,f_Helicobacteraceae,f_Francisellaceae,g_endosymbionts8,g_Soortia,g_Rhodopila,f_Family_XIV,g_Syntrophaceticus
0,MB_MCMST2_P6_S7,south,MCMS,32,0,13204,1801,3259,152,6,...,0,0,0,0,0,0,0,0,0,0
1,MB_MCMST2_P6_S9,south,MCMS,87,0,18061,5518,2623,548,67,...,0,0,0,0,0,0,0,0,0,0
2,MB_MCMST2_P6_S2,south,MCMS,14,0,1912,106,4618,9,0,...,0,0,0,0,0,0,0,0,0,0
3,MB_MCMST2_P6_S5,south,MCMS,90,0,18759,3022,5005,47,4,...,0,0,0,0,0,0,0,0,0,0
4,MB_MCMST1_P1_S1,south,MCMS,1432,0,35720,8110,6188,5700,15,...,0,0,0,0,0,0,0,0,0,0


In [89]:
# Drop the metadata columns so only sampleID and the per-microbe counts remain.
# errors="ignore" keeps this cell re-runnable: `data` is rebound here, so a
# second execution would otherwise raise KeyError for the already-removed columns.
data = data.drop(columns=['region', 'site'], errors="ignore")

In [90]:
# Wide -> long: one row per (sampleID, microbe) pair holding that microbe's count.
long_data = pd.melt(
    data,
    id_vars=["sampleID"],
    var_name="microbe",
    value_name="count",
)

In [91]:
# Keep only the rows whose count is strictly positive.
long_data = long_data.loc[long_data["count"].gt(0)]

In [92]:
# Preview the long-format table after the zero counts were removed.
long_data.head()

Unnamed: 0,sampleID,microbe,count
0,MB_MCMST2_P6_S7,g_Methanobacterium,32
1,MB_MCMST2_P6_S9,g_Methanobacterium,87
2,MB_MCMST2_P6_S2,g_Methanobacterium,14
3,MB_MCMST2_P6_S5,g_Methanobacterium,90
4,MB_MCMST1_P1_S1,g_Methanobacterium,1432


In [93]:
# Collapse to one row per sample: its microbes joined into a single ", "-separated string.
wide_data = (
    long_data
    .groupby("sampleID")["microbe"]
    .agg(", ".join)
    .reset_index()
)


In [94]:
# Label the two columns of the aggregated frame as (sampleID, Microbes).
wide_data = wide_data.set_axis(["sampleID", "Microbes"], axis=1)

In [95]:
# Preview the per-sample microbe lists.
wide_data.head()

Unnamed: 0,sampleID,Microbes
0,MB_MCHST3_P1_S1,"g_Methanobacterium, c_Bathyarchaeia, g_RBG-16-..."
1,MB_MCHST3_P1_S2,"g_Methanobacterium, c_Bathyarchaeia, g_RBG-16-..."
2,MB_MCHST3_P1_S3,"g_Methanobacterium, c_Bathyarchaeia, g_RBG-16-..."
3,MB_MCHST3_P1_S4,"g_Methanobacterium, c_Bathyarchaeia, g_RBG-16-..."
4,MB_MCHST3_P1_S5,"g_Methanobacterium, c_Bathyarchaeia, g_RBG-16-..."


## Step 2
Filter each sample to retain only the microbes that match the top five microbes identified through Principal Component Analysis. Samples that contain none of these microbes are dropped.

In [96]:
# The five microbes of interest; every sample's microbe list is filtered against this set.
target_microbes = {
    "g_Methanosaeta",
    "g_Methanoregula",
    "g_Sh765B-TzT-35",
    "c_BD2-11_terrestrial_group",
    "f_Methanomassiliicoccaceae",
}

In [97]:
# For every sample keep only the microbes that belong to `target_microbes`;
# samples that contain none of them are left out entirely.
filtered_rows = []
# Columns are read by position: 0 = sample id, 1 = ", "-joined microbe string.
for sample_id, microbe_str in zip(wide_data.iloc[:, 0], wide_data.iloc[:, 1]):
    # sorted() pins the order of the matches. Iterating the raw set intersection
    # yields a hash-dependent order that can change from one interpreter run to
    # the next, which made the stored strings irreproducible.
    matched_microbes = sorted(target_microbes.intersection(microbe_str.split(", ")))

    if matched_microbes:
        filtered_rows.append([sample_id, ", ".join(matched_microbes)])

In [98]:
# Assemble the retained samples into a two-column frame.
target_data = pd.DataFrame(
    data=filtered_rows,
    columns=["sampleID", "Microbes"],
)

## Step 3
Build the co-occurrence network: the nodes are the unique microbes, and the weight of each edge is the number of samples in which that pair of microbes co-occurred.

In [99]:
# Turn each ", "-joined string into a list of microbe names.
# The isinstance guard keeps this cell re-runnable: the column is overwritten
# in place, so on a second execution the values are already lists and calling
# .split() on them would raise AttributeError.
target_data["Microbes"] = target_data["Microbes"].apply(
    lambda x: x.split(", ") if isinstance(x, str) else list(x)
)

In [100]:
# Network nodes: every distinct microbe that appears in any sample's list.
unique_microbes = {
    microbe
    for sublist in target_data["Microbes"]
    for microbe in sublist
}

In [101]:
# Node table: Id and Label both hold the microbe name.
# sorted() makes the row order of 2.csv reproducible; listing the set directly
# gives a hash-dependent order that can differ between interpreter runs.
node_names = sorted(unique_microbes)
nodes_df = pd.DataFrame({"Id": node_names, "Label": node_names})
nodes_df.to_csv("2.csv", index=False)

In [102]:
# Co-occurrence counter keyed by (microbe1, microbe2) pairs; unseen pairs start at 0.
edges_dict = defaultdict(int)

In [103]:
# Start from an empty counter so that re-running this cell does not add the
# same co-occurrences on top of the totals left by a previous run.
edges_dict.clear()
for microbes in target_data["Microbes"]:
    # Sorting first gives every unordered pair one canonical key, so (a, b)
    # and (b, a) are counted as the same edge.
    for microbe1, microbe2 in combinations(sorted(microbes), 2):
        edges_dict[(microbe1, microbe2)] += 1

In [104]:
# Flatten the pair counts into an edge list (Source, Target, Weight) and write it to 3.csv.
edge_rows = [
    (source, target, weight)
    for (source, target), weight in edges_dict.items()
]
edges_df = pd.DataFrame(edge_rows, columns=["Source", "Target", "Weight"])
edges_df.to_csv("3.csv", index=False)