### PubMed search validation statistics
Robert Ietswaart, 20190825 - MIT license

In [101]:
import re
import copy
import numpy as np
import pickle as pkl
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
import seaborn as sns
from scipy.stats import fisher_exact, chi2_contingency,  mannwhitneyu, ks_2samp

from statsmodels.stats.multitest import fdrcorrection

In [102]:
# Read the tab-separated PubMed query results into a DataFrame and preview the first rows.
# NOTE(review): `path` is an absolute, machine-specific directory; adjust it before
# running this notebook on another machine.
path = '/Users/horizon/Documents/HMS/Novartis2018Hackathon/PCS/Saman/'
filename = 'Full_result_ADRmesh_Genemesh_pubmed_NEW.csv'
ATpmid = pd.read_csv(path + filename, delimiter='\t')
ATpmid.head()

Unnamed: 0,HLGT,gene,mesh_tr,HLGT_N,mesh_N,intersect_N,flag
0,coagulopathies and bleeding diatheses (excl th...,AR,"Receptors, Androgen",94336,14984,14,True
1,coagulopathies and bleeding diatheses (excl th...,NR3C1,"Receptors, Glucocorticoid",94336,12736,8,True
2,coagulopathies and bleeding diatheses (excl th...,HTR3A,"Receptors, Serotonin, 5-HT3",94336,1657,0,False
3,coagulopathies and bleeding diatheses (excl th...,PDE3A,"Cyclic Nucleotide Phosphodiesterases, Type 3",94336,855,0,False
4,coagulopathies and bleeding diatheses (excl th...,PTGS2,Cyclooxygenase 2,94336,22509,9,False


In [103]:
# Give the count / flag columns the short names used by the statistics cells below.
mapper = {
    'HLGT_N': 'N_A',
    'mesh_N': 'N_T',
    'intersect_N': 'N_AT',
    'flag': 'RF_pred',
}
ATpmid = ATpmid.rename(mapper, axis='columns')
# Number of rows in the table.
ATpmid.shape[0]

5149

In [154]:
# TEMP: drop the menopause HLGT because it has no MedDRA terms associated.
# Saman should send an updated file in which these rows are no longer present.
excluded_hlgts = ['menopause and related conditions']
keep_rows = ~ATpmid['HLGT'].isin(excluded_hlgts)
ATpmid = ATpmid[keep_rows]


In [158]:
# Among rows with RF_pred == False, keep one row per (HLGT, gene) pair and report
# how many of those pairs have N_AT > 0.
is_negative = ATpmid['RF_pred'] == False
dtemp = ATpmid[is_negative].drop_duplicates(['HLGT', 'gene'])
n_cooccur = len(dtemp[dtemp['N_AT'] > 0])
n_pairs = len(dtemp)
print(n_cooccur, '/', n_pairs,
      n_cooccur / n_pairs, 'of negatives have literature co-occurrence')

2748 / 4890 0.561963190184049 of negatives have literature co-occurrence


In [159]:
# Among rows with RF_pred == True, keep one row per (HLGT, gene) pair and report
# how many of those pairs have N_AT > 0.
is_prediction = ATpmid['RF_pred'] == True
dtemp = ATpmid[is_prediction].drop_duplicates(['HLGT', 'gene'])
n_cooccur = len(dtemp[dtemp['N_AT'] > 0])
n_pairs = len(dtemp)
print(n_cooccur, '/', n_pairs,
      n_cooccur / n_pairs, 'of predictions have literature co-occurrence')

145 / 219 0.6621004566210046 of predictions have literature co-occurrence


In [160]:
# Total PMID count used as the universe size in the lift calculation.
# Information retrieved from:
#   https://www.nlm.nih.gov/bsd/licensee/baselinestats.html
#   https://www.nlm.nih.gov/bsd/licensee/2019_stats/2019_LO.html
N_all_pmids = 29138919


In [161]:
# lift = N_all_pmids * N_AT / (N_A * N_T).
# eps is added to the denominator to guard against division by zero when N_A * N_T is 0.
eps = 1e-100
lift_numerator = N_all_pmids * ATpmid['N_AT']
lift_denominator = ATpmid['N_A'] * ATpmid['N_T'] + eps
ATpmid['lift'] = lift_numerator / lift_denominator

In [162]:
# Share of rows with lift > 1 within each RF_pred group (all rows, duplicates kept).
dtemp = ATpmid[ATpmid['lift'] > 1]
n_neg_total = len(ATpmid[ATpmid['RF_pred'] == False])
n_pred_total = len(ATpmid[ATpmid['RF_pred'] == True])
n_neg_lifted = len(dtemp[dtemp['RF_pred'] == False])
n_pred_lifted = len(dtemp[dtemp['RF_pred'] == True])
print(n_neg_lifted / n_neg_total,
      'of negatives have positive lift')
print(n_pred_lifted / n_pred_total,
      'of predictions have positive lift')
# NB (author's note): results for lift and FE odds ratio are exactly the same

0.12678936605316973 of negatives have positive lift
0.22727272727272727 of predictions have positive lift


In [163]:
# Median, then mean, of the lift: RF_pred == True group first, RF_pred == False second.
lift_pred = ATpmid.loc[ATpmid['RF_pred'] == True, 'lift']
lift_neg = ATpmid.loc[ATpmid['RF_pred'] == False, 'lift']
for summary in (np.median, np.mean):
    print(summary(lift_pred))
    print(summary(lift_neg))
# print(max(ATpmid[ATpmid['RF_pred']==True]['lift']))
# print(max(ATpmid[ATpmid['RF_pred']==False]['lift']))

0.1596146102210732
0.06256674444049146
1.2741817056733502
0.6606205103810764


In [164]:
# Ratio of the median lift of the RF_pred == True rows to that of the RF_pred == False rows.
median_lift_pred = np.median(ATpmid.loc[ATpmid['RF_pred'] == True, 'lift'])
median_lift_neg = np.median(ATpmid.loc[ATpmid['RF_pred'] == False, 'lift'])
print('fold', median_lift_pred / median_lift_neg)

fold 2.551109405618603


In [112]:
#mannwhitneyu = Mann-Whitney U test = Wilcoxon rank-sum test (unpaired)
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html#scipy.stats.mannwhitneyu
#it corrects for ties (= equal rankings) and applies a continuity correction, since our data is discrete

In [167]:
# Two-sided Mann-Whitney U test comparing the lift of the RF_pred == True rows
# against the lift of the RF_pred == False rows, with continuity correction.
lift_predicted = ATpmid.loc[ATpmid['RF_pred'] == True, 'lift']
lift_negative = ATpmid.loc[ATpmid['RF_pred'] == False, 'lift']
U, pval = mannwhitneyu(lift_predicted, lift_negative,
                       use_continuity=True, alternative='two-sided')

U, pval

(611395.0, 0.00033989280683877624)

In [168]:
def get_contingency_table(universe,Total_condition1,Total_condition2,Overlap_cond12):
    """Build a 2x2 contingency table from a universe size, two totals and their overlap.

    The returned nested list is laid out as::

        [[both conditions,  condition 2 only],
         [condition 1 only, neither condition]]

    Parameters
    ----------
    universe : int
        Total number of items considered.
    Total_condition1 : int
        Number of items satisfying condition 1.
    Total_condition2 : int
        Number of items satisfying condition 2.
    Overlap_cond12 : int
        Number of items satisfying both conditions.

    Returns
    -------
    list of list
        The 2x2 table of counts described above.

    Raises
    ------
    ValueError
        If the inputs are inconsistent, i.e. any cell of the table would be negative
        (overlap larger than a total, or universe too small for the totals).
    """
    both = Overlap_cond12
    only_cond2 = Total_condition2 - Overlap_cond12
    only_cond1 = Total_condition1 - Overlap_cond12
    neither = universe - Total_condition1 - Total_condition2 + Overlap_cond12
    ct = [[both, only_cond2], [only_cond1, neither]]
    # Fail early with a clear message instead of handing a table with negative
    # counts to the downstream statistical tests.
    if min(both, only_cond2, only_cond1, neither) < 0:
        raise ValueError(
            'Inconsistent counts give a negative contingency table cell: %r' % (ct,))
    return ct


# def FE_test_from_totals(universe,Total_condition1,Total_condition2,Overlap_cond12):
#     cont_table = [[1, 2], [3, 4]]#contingency table
#     cont_table[0][0] = Overlap_cond12
#     cont_table[0][1] = Total_condition2-Overlap_cond12
#     cont_table[1][0] = Total_condition1-Overlap_cond12
#     cont_table[1][1] = universe-Total_condition1-Total_condition2+Overlap_cond12
#     fisher_exact(cont_table)
#     return oddsratio, pvalue

## Fisher exact (FE) / chi2 test to see if the PubMed retrieval rate is higher than background

In [169]:
# Counts over all rows (duplicates kept): total rows, rows with RF_pred == True,
# rows with N_AT > 0, and rows satisfying both.
N_all_ATs = len(ATpmid)
N_RF_pred = len(ATpmid[ATpmid['RF_pred'] == True])
dtemp = ATpmid[ATpmid['N_AT'] > 0]
N_pmpos = len(dtemp)
N_RF_pred_pmpos = len(dtemp[dtemp['RF_pred'] == True])

table_counts = (N_all_ATs, N_RF_pred, N_pmpos, N_RF_pred_pmpos)
print(*table_counts)
cont_table = get_contingency_table(*table_counts)

# Fisher exact test and chi2 test (with continuity correction) on the same 2x2 table.
OR, pval = fisher_exact(cont_table)
print('Fisher Exact odds ratio', OR, 'pvalue', pval)
c2, pval, dof, ex = chi2_contingency(cont_table, correction=True)
print('Chi2 statistic', c2, 'pvalue', pval, 'degrees of freedom', dof)
print('contingency table \n', cont_table)
print('expected contingency table under independence null hypothesis \n', ex)


5110 220 2893 145
Fisher Exact odds ratio 1.5069868995633189 pvalue 0.004337562085414955
Chi2 statistic 7.695235016242595 pvalue 0.005536679998657007 degrees of freedom 1
contingency table 
 [[145, 2748], [75, 2142]]
expected contingency table under independence null hypothesis 
 [[ 124.5518591 2768.4481409]
 [  95.4481409 2121.5518591]]


### Same test after dropping duplicates

In [170]:
# Some assays (eg hERG binding, hERG QP) are mapped to the same gene, which duplicates
# the pubmed retrieval query; count each (HLGT, gene) pair only once.
pair_key = ['HLGT', 'gene']
N_all_ATs = len(ATpmid.drop_duplicates(pair_key))
# Filter first, then de-duplicate, for both the RF_pred == True and the N_AT > 0 subsets.
N_RF_pred = len(ATpmid[ATpmid['RF_pred'] == True].drop_duplicates(pair_key))
dtemp = ATpmid[ATpmid['N_AT'] > 0].drop_duplicates(pair_key)
N_pmpos = len(dtemp)
N_RF_pred_pmpos = len(dtemp[dtemp['RF_pred'] == True])

table_counts = (N_all_ATs, N_RF_pred, N_pmpos, N_RF_pred_pmpos)
print(*table_counts)
cont_table = get_contingency_table(*table_counts)

# Fisher exact test and chi2 test (with continuity correction) on the same 2x2 table.
OR, pval = fisher_exact(cont_table)
print('Fisher Exact odds ratio', OR, 'pvalue', pval)
c2, pval, dof, ex = chi2_contingency(cont_table, correction=True)
print('Chi2 statistic', c2, 'pvalue', pval, 'degrees of freedom', dof)
print('contingency table \n', cont_table)
print('expected contingency table under independence null hypothesis \n', ex)

5109 219 2893 145
Fisher Exact odds ratio 1.5273515873952555 pvalue 0.003386803438991879
Chi2 statistic 8.154948222126267 pvalue 0.004294382610624115 degrees of freedom 1
contingency table 
 [[145, 2748], [74, 2142]]
expected contingency table under independence null hypothesis 
 [[ 124.00998238 2768.99001762]
 [  94.99001762 2121.00998238]]
