In [1]:
import os
import random as rn

import ndex2
import numpy as np
import pandas as pd
from netcoloc import netprop_zscore, netprop

In [46]:
def sampling_from_pascal(pascal_file_list, trait_list, outpath, num_samps=100, num_reps=1000,
                         max_genes=500):
    """Run the seed-gene sampling analysis for a list of PASCAL gene-score files.

    PCNet is downloaded from NDEx and its heat matrix is computed once, then
    reused for every trait.

    Args:
        pascal_file_list: Paths of tab-separated PASCAL gene-score files.
        trait_list: Trait names, one per file and in the same order. Used in
            log messages, in output file names and as keys of the result.
        outpath: Passed to ``sample_seed_genes`` as ``outdir``.
        num_samps: Number of seed-gene samples drawn per trait.
        num_reps: Forwarded as ``num_reps`` to ``sample_seed_genes``.
        max_genes: Forwarded as ``max_genes`` to ``sample_seed_genes``
            (previously hard-coded to 500).

    Returns:
        dict mapping each trait name to the DataFrame returned by
        ``get_consensus_z_scores`` for that trait.

    Raises:
        ValueError: If ``pascal_file_list`` and ``trait_list`` differ in length.
    """
    pascal_file_list = list(pascal_file_list)
    trait_list = list(trait_list)
    # Validate before the expensive download / heat-matrix computation, so a
    # mismatch does not surface as an IndexError (or get silently ignored)
    # only after the slow setup has already run.
    if len(pascal_file_list) != len(trait_list):
        raise ValueError(
            "pascal_file_list and trait_list must have the same length "
            "(got {} files and {} traits)".format(len(pascal_file_list), len(trait_list))
        )

    # import PCNET
    print("Loading PCNet")
    interactome_uuid = '4de852d9-9908-11e9-bcaf-0ac135e8bacf'
    ndex_server = 'public.ndexbio.org'
    ndex_user = None
    ndex_password = None
    G_PC = ndex2.create_nice_cx_from_server(
        ndex_server,
        username=ndex_user,
        password=ndex_password,
        uuid=interactome_uuid
    ).to_networkx()
    nodes = list(G_PC.nodes)

    # calculate heat matrix
    print("Calculating w_prime")
    w_prime = netprop.get_normalized_adjacency_matrix(G_PC, conserve_heat=True, weighted=False)
    print("Calculating heat matrix")
    indiv_heats = netprop.get_individual_heats_matrix(w_prime, alpha=0.5)

    # perform each sampling analysis
    consensus_z_scores = {}
    for pascal_file, trait in zip(pascal_file_list, trait_list):
        data = pd.read_csv(pascal_file, sep="\t")
        print("Sampling", trait)
        res = sample_seed_genes(data, nodes, indiv_heats, G_PC, trait=trait, max_genes=max_genes,
                                num_samps=num_samps, outdir=outpath, num_reps=num_reps)
        print("Finished sampling", trait)
        consensus_z_scores[trait] = get_consensus_z_scores(res)
    return consensus_z_scores
        
def sample_seed_genes(data, nodes, indiv_heats, G_PC, trait="BMI", max_genes=500, num_samps=100,
                      sig=0.05, num_reps=1000, parallel=False, outdir=None):
    """Repeatedly sample seed genes for one trait and propagate each sample.

    Genes are restricted to those present in ``nodes``, filtered with a
    Bonferroni-corrected threshold (``sig`` divided by the number of remaining
    genes), and then handed to ``_do_propagation`` ``num_samps`` times with
    ``-log10(pvalue)`` as the sampling weight.

    Args:
        data: DataFrame with at least ``gene_symbol`` and ``pvalue`` columns.
        nodes: Gene names present in the network.
        indiv_heats: Heat matrix forwarded to ``_do_propagation``.
        G_PC: Network forwarded to ``_do_propagation``.
        trait: Trait name, used in the output file name.
        max_genes: Forwarded to ``_do_propagation``; also part of the file name.
        num_samps: Number of samples to draw (must be at least 1).
        sig: Significance level before Bonferroni correction.
        num_reps: Forwarded to ``_do_propagation``.
        parallel: Must be False; a parallel implementation does not exist.
        outdir: Prefix prepended to the output file name by plain string
            concatenation, so a directory needs its trailing separator.
            ``None`` (the default) means the notebook-level ``DATADIR``,
            looked up when the function is called.

    Returns:
        DataFrame with one column per sample (``z0``, ``z1``, ...). The same
        table is written to ``<outdir><trait>sampling_<max_genes>_<num_samps>.tsv``
        after every sample, so partial results survive an interrupted run.

    Raises:
        NotImplementedError: If ``parallel`` is true.
        ValueError: If ``num_samps`` < 1, if no gene of ``data`` is in
            ``nodes``, or if no gene passes the significance threshold.
    """
    if parallel:
        # The former parallel branch referenced an undefined `mp` module and
        # produced no results; fail clearly instead.
        raise NotImplementedError("parallel sampling is not implemented; call with parallel=False")
    if num_samps < 1:
        raise ValueError("num_samps must be at least 1 (got {})".format(num_samps))
    if outdir is None:
        # Resolved at call time so this function can be defined before the
        # cell that assigns DATADIR has run.
        outdir = DATADIR
    outfile = outdir + trait + "sampling_" + str(max_genes) + "_" + str(num_samps) + ".tsv"

    data = data.loc[data.gene_symbol.isin(nodes)]  # subset to genes in PCNET
    if data.empty:
        raise ValueError("none of the genes in `data` are present in `nodes`")
    all_seeds = data.loc[data.pvalue <= sig / len(data)]  # Bonferroni correction
    if all_seeds.empty:
        raise ValueError(
            "no genes pass the Bonferroni-corrected threshold for trait {!r}".format(trait)
        )
    # -log10(p) is used as the weight for weighted sampling
    all_seeds = all_seeds.assign(log10p=-1 * np.log10(all_seeds.pvalue))

    sampling_results = []
    combined = None
    for i in range(num_samps):
        sample_results = _do_propagation(all_seeds, G_PC, indiv_heats, num_reps, max_genes)
        sampling_results.append(pd.DataFrame(sample_results, columns=["z" + str(i)]))
        combined = pd.concat(sampling_results, axis=1)
        # Checkpoint from memory rather than re-reading and re-joining the TSV
        # written by the previous iteration.
        combined.to_csv(outfile, sep="\t")

    return combined


def _do_propagation(all_seeds, G_PC, indiv_heats, num_reps=1000, max_genes=500):
    """Draw one weighted seed-gene sample and return its propagation result.

    Args:
        all_seeds: DataFrame with ``gene_symbol`` and ``log10p`` columns.
        G_PC: Network; its ``nodes`` and ``degree`` are passed on.
        indiv_heats: Heat matrix passed to ``calculate_heat_zscores``.
        num_reps: Forwarded as ``num_reps``.
        max_genes: Number of draws made by ``rn.choices``.

    Returns:
        Element 0 of whatever ``netprop_zscore.calculate_heat_zscores`` returns
        (callers treat it as the per-gene z-scores).
    """
    candidate_genes = all_seeds.gene_symbol.values
    draw_weights = all_seeds.log10p.values
    # rn.choices samples WITH replacement, so the drawn list may contain
    # repeated genes and fewer than max_genes distinct ones.
    drawn_seeds = rn.choices(population=candidate_genes, weights=draw_weights, k=max_genes)

    # NOTE(review): alpha=0.05 here differs from the alpha=0.5 used to build the
    # heat matrix elsewhere in this notebook - confirm which is intended.
    propagation_kwargs = dict(
        nodes=list(G_PC.nodes),
        degrees=dict(G_PC.degree),
        seed_genes=drawn_seeds,
        num_reps=num_reps,
        alpha=0.05,
        minimum_bin_size=10,
        random_seed=1,
    )
    outputs = netprop_zscore.calculate_heat_zscores(indiv_heats, **propagation_kwargs)
    return outputs[0]


def get_consensus_z_scores(sampled_results, percentile=.75):
    """Return the consensus z-score of each gene across all samples.

    Args:
        sampled_results: Either a DataFrame with one row per gene and one
            column per sample, or the path (``str`` or ``os.PathLike``) of a
            tab-separated file holding such a table with the gene names in
            its first column.
        percentile: Quantile in [0, 1] taken across the sample columns of each
            row (default: the 75th percentile).

    Returns:
        DataFrame with a single column ``z`` and the same index as the input.
    """
    # isinstance also accepts str subclasses and pathlib.Path objects, which
    # the previous exact-type comparison treated as already-loaded frames.
    if isinstance(sampled_results, (str, os.PathLike)):
        results = pd.read_csv(sampled_results, sep="\t", index_col=0)
    else:
        results = sampled_results
    return pd.DataFrame({'z': results.quantile(q=percentile, axis=1)})

### Option 1 - run a list of PASCAL files

In [30]:
# Base directory for the input PASCAL files; also the default `outdir` of sample_seed_genes.
# Paths are built from it by plain string concatenation, so keep the trailing slash.
DATADIR = "~/Data/rat/data/"

In [47]:
# Example run for a single trait with small num_samps/num_reps
# (the function defaults are num_samps=100 and num_reps=1000).
file_list=[DATADIR+"GIANT_genomics/BMI/GIANT_BMI_pascal.sum.genescores.txt"] # list of tab-separated PASCAL files
trait_list=["BMI"] # trait names, one per file in the same order; used for naming output files
outpath="" # prefix for the result files; the empty string gives a bare file name with no directory part
sampling_from_pascal(file_list, trait_list, outpath, num_samps=4, num_reps=10)

Loading PCNet
Calculating w_prime
Calculating heat matrix
Sampling BMI


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Finished sampling BMI


{'BMI':                 z
 UBE2Q1  -0.050077
 RNF14    0.065487
 UBE2Q2   2.744207
 TMCO1    0.133241
 UBAC1    0.375765
 ...           ...
 CD300LD  1.243231
 SLC10A5  0.226971
 BAGE4   -0.482724
 FAM181B  1.738547
 TPT1P8   0.507057
 
 [18820 rows x 1 columns]}

### Option 2 - run one trait at a time, but precalculate the PCNet heat matrix

In [3]:
# Download the interactome from NDEx once and keep it in the kernel as G_PC,
# so that later cells can reuse it. The settings mirror those inside sampling_from_pascal.
import ndex2
interactome_uuid='4de852d9-9908-11e9-bcaf-0ac135e8bacf'  # NDEx UUID of the network to fetch
ndex_server='public.ndexbio.org'
ndex_user=None  # no credentials are supplied
ndex_password=None
G_PC = ndex2.create_nice_cx_from_server(
            ndex_server, 
            username=ndex_user, 
            password=ndex_password, 
            uuid=interactome_uuid
        ).to_networkx()
nodes = list(G_PC.nodes)  # node list, later passed to sample_seed_genes as `nodes`

# print out interactome num nodes and edges for diagnostic purposes
print('number of nodes:', len(G_PC.nodes))
print('number of edges:', len(G_PC.edges))

number of nodes: 18820
number of edges: 2693250


In [5]:
# Normalized adjacency matrix of G_PC (weighted=False, conserve_heat=True); input to the heat matrix below.
w_prime = netprop.get_normalized_adjacency_matrix(G_PC, conserve_heat=True, weighted=False)

In [6]:
# Heat matrix computed from w_prime with alpha=0.5; passed to sample_seed_genes as `indiv_heats`.
indiv_heats = netprop.get_individual_heats_matrix(w_prime, alpha=0.5)

In [None]:
# Tab-separated PASCAL gene scores for BMI; sample_seed_genes reads its `gene_symbol` and `pvalue` columns.
data = pd.read_csv(DATADIR+"GIANT_genomics/BMI/GIANT_BMI_pascal.sum.genescores.txt", sep="\t")

In [21]:
# Draw 4 seed-gene samples with num_reps=10. `outdir` is left at its default, so the
# per-sample TSV is written with the DATADIR prefix. `a` holds one z-score column per sample.
a = sample_seed_genes(data, nodes, indiv_heats, G_PC, num_reps=10, num_samps=4, trait="BMI")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [44]:
# Per-gene consensus: the default 0.75 quantile of z across the sample columns of `a`.
get_consensus_z_scores(a)

Unnamed: 0,z
UBE2Q1,0.046308
RNF14,-1.284102
UBE2Q2,0.304512
TMCO1,-0.603054
UBAC1,0.536572
...,...
CD300LD,1.944859
SLC10A5,10.460008
BAGE4,2.367835
FAM181B,1.499096
