In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import defaultdict
from tqdm import tqdm
import networkx as nx
from matplotlib_venn import venn2, venn3
from neteval import gene_mapper as gm
from neteval import query_ensembl as qe
from neteval import query_hgnc as qh
import obonet as obo
import random as rn
import glob
import re

In [2]:
# Project directories: inputs, NetColoc results, transfer output and figure output.
datadir, resdir, outdir, figdir = (
    '/cellar/users/snwright/Data/RareCommon/inputs/',
    '/cellar/users/snwright/Data/RareCommon/outputs/netcoloc/',
    '/cellar/users/snwright/Data/Transfer/RVC/',
    '/cellar/users/snwright/Data/Transfer/RVC/figures/RAVAR_GWASCat/',
)

In [3]:
# Global matplotlib styling, applied in one call: SVG text is written as text
# (fonttype 'none'), small font, thin lines, short ticks, frameless legends.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'font.size': 7,
    # line widths
    'axes.linewidth': 0.5,
    'hatch.linewidth': 0.5,
    'patch.linewidth': 0.25,
    # tick widths
    'xtick.major.width': 0.4,
    'ytick.major.width': 0.4,
    'xtick.minor.width': 0.3,
    'ytick.minor.width': 0.3,
    # tick lengths
    'xtick.major.size': 3,
    'ytick.major.size': 3,
    'xtick.minor.size': 2,
    'ytick.minor.size': 2,
    # padding between ticks / axes and their labels
    'xtick.major.pad': 1,
    'ytick.major.pad': 1,
    'axes.labelpad': 1,
    'legend.frameon': False,
})

## Load evaluation data

In [19]:
# Column labels applied to the header-less evaluation tables read below.
col_names = [
    'NDCG_RC',
    'NDCG_CR',
    'AUPRC_RC',
    'AUPRC_CR',
    'Baseline_AUPRC_RC',
    'Baseline_AUPRC_CR',
    'Method',
]

In [41]:
# (sub-directory of resdir, file name) of each evaluation table, keyed by the
# Group label attached to its rows.
evaluation_files = {
    'NG': ('testing', 'nearestGene_evaluation.tsv'),
    'NG8': ('testing', 'nearestGeneTH8_evaluation.tsv'),
    'MAG103': ('magma', 'magma_th103_evaluation.tsv'),
    'MAG256': ('magma', 'magma_th256_evaluation.tsv'),
}
# Every table is read without a header row and labelled with col_names.
evaluation_tables = {
    group: pd.read_csv(os.path.join(resdir, subdir, file_name), sep='\t',
                       header=None, names=col_names).assign(Group=group)
    for group, (subdir, file_name) in evaluation_files.items()
}
ng = evaluation_tables['NG']
ngth8 = evaluation_tables['NG8']
mag103 = evaluation_tables['MAG103']
mag256 = evaluation_tables['MAG256']
# Stack the four tables; rows keep the index they were read with.
res_df = pd.concat([ng, ngth8, mag103, mag256])

In [42]:
# Pairwise correlation between the numeric metric columns, computed separately per Method.
metric_corr_by_method = res_df.groupby('Method').corr(numeric_only=True)
metric_corr_by_method

Unnamed: 0_level_0,Unnamed: 1_level_0,NDCG_RC,NDCG_CR,AUPRC_RC,AUPRC_CR,Baseline_AUPRC_RC,Baseline_AUPRC_CR
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
binary,NDCG_RC,1.0,0.898802,0.536498,0.571061,0.722734,0.722248
binary,NDCG_CR,0.898802,1.0,0.536768,0.60221,0.695232,0.695136
binary,AUPRC_RC,0.536498,0.536768,1.0,0.910203,0.59409,0.593569
binary,AUPRC_CR,0.571061,0.60221,0.910203,1.0,0.653904,0.653359
binary,Baseline_AUPRC_RC,0.722734,0.695232,0.59409,0.653904,1.0,0.999988
binary,Baseline_AUPRC_CR,0.722248,0.695136,0.593569,0.653359,0.999988,1.0
q_None,NDCG_RC,1.0,0.895193,0.547217,0.571746,0.724905,0.7244
q_None,NDCG_CR,0.895193,1.0,0.559988,0.605788,0.699634,0.699528
q_None,AUPRC_RC,0.547217,0.559988,1.0,0.859995,0.607584,0.607173
q_None,AUPRC_CR,0.571746,0.605788,0.859995,1.0,0.660104,0.659535


## NDCG

Why do the results look SO consistent across methods? Are they actually using different Z-scores? Or does it really not matter?

In [105]:
# Show every evaluation row stored under one example trait identifier.
example_trait = 'magma_th103_GCST001791_EFO_0004531'
res_df.loc[example_trait]

Unnamed: 0,NDCG_RC,NDCG_CR,AUPRC_RC,AUPRC_CR,Baseline_AUPRC_RC,Baseline_AUPRC_CR,Method,Group
magma_th103_GCST001791_EFO_0004531,0.285368,0.275374,0.019029,0.016104,0.007318,0.007266,binary,MAG103
magma_th103_GCST001791_EFO_0004531,0.282078,0.276282,0.01989,0.018923,0.007318,0.007266,q_None,MAG103
magma_th103_GCST001791_EFO_0004531,0.282953,0.276024,0.019633,0.018691,0.007318,0.007266,q_max,MAG103
magma_th103_GCST001791_EFO_0004531,0.280894,0.276473,0.019979,0.018595,0.007318,0.007266,q_sum,MAG103


In [49]:
# Mean AUPRC_RC for each (Group, Method) combination.
res_df.groupby(['Group', 'Method'])['AUPRC_RC'].mean()

Group   Method
MAG103  binary    0.012610
        q_None    0.012605
        q_max     0.012737
        q_sum     0.012586
MAG256  binary    0.011647
        q_None    0.011335
        q_max     0.011388
        q_sum     0.011420
NG      binary    0.002413
        q_None    0.002744
        q_max     0.002411
        q_sum     0.002401
NG8     binary    0.006167
        q_None    0.006448
        q_max     0.006141
        q_sum     0.006226
Name: AUPRC_RC, dtype: float64

## Debug Metrics

1. Not getting the same number. Need to exclude the seed genes (did not solve). Why is it still wrong?
2. Does it make more sense to set a k value? It appears that the discounting factor isn't that strong, so very low-ranked genes are still contributing.

In [139]:
# Identifier of the trait examined in this debugging section.
# NOTE(review): the cells below spell this identifier out inside their file names
# rather than reading `trait`, so changing it here does not change what they load.
trait = 'magma_th103_GCST001791_EFO_0004531'

In [198]:
def _read_z_scores(file_name, value_col):
    """Read a header-less TSV from resdir/magma.

    The first column becomes the index and the value column is named `value_col`.
    """
    return pd.read_csv(os.path.join(resdir, 'magma', file_name), sep='\t',
                       header=None, names=[value_col], index_col=0)


# CV / RV z-scores from the "q_neglog10_max" files (column 'z') ...
zc = _read_z_scores('magma_th103_GCST001791_EFO_0004531_z_CV_q_neglog10_max.tsv', 'z')
zr = _read_z_scores('magma_th103_GCST001791_EFO_0004531_z_RV_q_neglog10_max.tsv', 'z')
# ... and from the "neglog10" files (column 'z2'), for comparison.
zc2 = _read_z_scores('magma_th103_GCST001791_EFO_0004531_z_CV_neglog10.tsv', 'z2')
zr2 = _read_z_scores('magma_th103_GCST001791_EFO_0004531_z_RV_neglog10.tsv', 'z2')

In [199]:
# Put the two CV z-score columns ('z' and 'z2') side by side, aligned on zc's index.
xc = zc.join(zc2, how='left')

In [201]:
# Pearson correlation between the two CV z-score columns.
xc.corr(method='pearson')

Unnamed: 0,z,z2
z,1.0,0.786838
z2,0.786838,1.0


In [184]:
# Order both z-score tables from the highest to the lowest z.
zc = zc.sort_values(by='z', ascending=False)
zr = zr.sort_values(by='z', ascending=False)

In [185]:
def _add_logp(gene_table):
    """Clear the index name, add logp = -log10(P-value + 1e-250), sort by logp descending."""
    gene_table.index.name = None
    # The 1e-250 offset keeps the logarithm finite when a P-value is exactly 0.
    neg_log_p = gene_table['P-value'].apply(lambda p: -1 * np.log10(p + 1e-250))
    return gene_table.assign(logp=neg_log_p).sort_values(by='logp', ascending=False)


c_genes = _add_logp(
    pd.read_csv(os.path.join(datadir, 'magma', 'magma_th103_GCST001791_EFO_0004531_CV.txt'),
                sep='\t', index_col=0)
)
# The RV table is first ordered by P-value, then re-ordered by logp like the CV table.
r_genes = _add_logp(
    pd.read_csv(os.path.join(datadir, 'magma', 'magma_th103_GCST001791_EFO_0004531_RV.txt'),
                sep='\t', index_col=0).sort_values(by='P-value')
)


In [165]:
# Number of rows in the CV z-score table.
zc.shape[0]

19267

In [186]:
# Remove the genes listed in the CV / RV input tables from the matching z-score table.
in_c_genes = zc.index.isin(c_genes.index.values)
in_r_genes = zr.index.isin(r_genes.index.values)
zc = zc[~in_c_genes]
zr = zr[~in_r_genes]

In [187]:
# Rank genes by the 'z' column only (rank 1 = highest z). Naming the column keeps
# this cell re-runnable: once 'rank' exists, ranking the whole frame would return
# two columns and could no longer be stored under a single column name.
# assign() returns a new frame, so the filtered slice is never written into.
zc = zc.assign(rank=zc['z'].rank(ascending=False))
zr = zr.assign(rank=zr['z'].rank(ascending=False))

In [188]:
# Keep only the genes present in both zc and r_genes. The inner join keeps the
# row order of zc, which the positional slice in the DCG cell below relies on.
cr = zc.join(r_genes, how='inner')

In [189]:
# Discounted gain per gene: logp / log2(rank + 1). Column arithmetic gives the
# same values as a row-wise apply and also works when the join is empty.
# 'rank' needs bracket access because cr.rank is the DataFrame method.
cr['dg'] = cr['logp'] / np.log2(cr['rank'] + 1)

In [190]:
idcg = np.sum([x / np.log2(i+2) for i, x in enumerate(r_genes.logp.values[1:100])])

In [191]:
dcg = cr['dg'][1:100].sum()

In [192]:
# Normalised score: DCG divided by the ideal DCG.
ndcg_value = dcg / idcg
ndcg_value

0.3516144996195619

In [135]:
# Scratch check of a single discounted-gain term of the form value / log2(rank + 1).
# NOTE(review): presumably a logp of 4.3 at rank 231 — confirm which gene this was.
4.3/np.log2(232)

0.5472143547644442

In [75]:
def ndcg(scores, weights, k=None):
    """Normalized discounted cumulative gain of a score ranking against graded relevance.

    Items are ranked by descending ``scores`` (rank 1 = highest score; ties are
    broken by input order). An item at rank ``r`` with relevance ``w`` contributes
    ``w / log2(r + 1)`` to the DCG. The DCG is divided by the ideal DCG, i.e. the
    DCG obtained when the relevance values themselves are sorted descending.

    Parameters
    ----------
    scores : pandas.Series or mapping
        Predicted score per item, keyed by item identifier; higher is better.
    weights : pandas.Series or mapping
        Non-negative relevance per item. Scored items missing from ``weights``
        get relevance 0; items absent from ``scores`` cannot be ranked and are
        ignored.
    k : int, optional
        Rank cutoff. Only items ranked within the top ``k`` contribute, and the
        ideal ordering is truncated to ``k`` items. ``None`` uses every item.

    Returns
    -------
    float
        The NDCG, or ``nan`` when no scored item has positive relevance.
    """
    scores = pd.Series(scores, dtype=float)
    # Align relevance to the scored items; unscored or unlabelled items count as 0.
    relevance = pd.Series(weights, dtype=float).reindex(scores.index).fillna(0.0)
    cutoff = len(scores) if k is None else k

    # Ordinal ranks guarantee that exactly `cutoff` items fall inside the cutoff.
    ranks = scores.rank(ascending=False, method='first')
    in_top_k = ranks <= cutoff
    dcg = (relevance[in_top_k] / np.log2(ranks[in_top_k] + 1)).sum()

    # Ideal ordering: largest relevance first, at positions 1..cutoff.
    ideal = np.sort(relevance.to_numpy())[::-1][:cutoff]
    idcg = (ideal / np.log2(np.arange(len(ideal)) + 2)).sum()

    if idcg <= 0:
        return float('nan')
    return float(dcg / idcg)

Unnamed: 0,P-value,logp
8335,5.755600e-17,16.239909
8358,1.090300e-16,15.962454
56606,1.932000e-16,15.713993
55867,1.498800e-15,14.824256
2646,2.720000e-15,14.565431
...,...,...
29128,8.696100e-04,3.060675
645432,9.417300e-04,3.026074
83445,9.485500e-04,3.022940
91833,9.750400e-04,3.010978


### calculate NDCG

### calculate AUPRC