In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Directory holding the RAVAR input files read by the cells below
# (gene table, condensed study info, manually curated study info).
datadir = '/cellar/users/snwright/Data/RareCommon/RAVAR'

## RAVAR UKB

In [59]:
# Load the RAVAR gene-level association table, restricted to the columns listed here.
ravar_columns = ['Gene Symbol', 'Ensembl ID', 'Gene Type', 'Entrez', 'CHR', 'Location',
                 'Reported Trait', 'Trait Label', 'Trait Ontology id', 'EFO synonym',
                 'P-value', 'PMID']
ravar_genes = pd.read_csv(os.path.join(datadir, 'gene_fulltable_06112024.txt.entrez'),
                          sep='\t', usecols=ravar_columns)

# String P-values may contain the Unicode minus sign (U+2212), which float() rejects;
# swap it for ASCII '-' before converting. Non-string values are converted directly.
ravar_genes['P-value'] = ravar_genes['P-value'].apply(
    lambda x: float(x.replace('−', '-')) if isinstance(x, str) else float(x))

# Ontology ids of the form 'PREFIX:NNN' become 'PREFIX_NNN'; non-string entries are kept unchanged.
ravar_genes['TRAIT_CODE'] = ravar_genes['Trait Ontology id'].apply(
    lambda x: x.replace(':', '_') if isinstance(x, str) else x)

# The 1e-250 offset keeps a P-value of exactly 0 from producing an infinite -log10(p).
ravar_genes['logp'] = -np.log10(ravar_genes['P-value'] + 1e-250)

In [63]:
# Number of distinct Entrez genes reported per (PMID, reported trait, trait label).
study_trait_keys = ['PMID', 'Reported Trait', 'Trait Label']
gene_counts = ravar_genes.groupby(study_trait_keys)['Entrez'].nunique().reset_index()

In [67]:
# PMIDs with at least one study/trait combination reporting 3 or more distinct genes.
has_enough_genes = gene_counts['Entrez'] >= 3
all_pmids = gene_counts.loc[has_enough_genes, 'PMID'].unique()

In [72]:
# Convert every PMID to a plain Python int.
all_pmids = list(map(int, all_pmids))

In [81]:
# PMIDs from `all_pmids` that do not appear in the manually curated study info.
# NOTE(review): `manual_info` is only loaded in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that cell has been run first — consider moving it up.
# Build the lookup once instead of re-casting the PMID column for every element.
curated_pmids = set(manual_info.PMID.astype(int))
missing = [x for x in all_pmids if int(x) not in curated_pmids]

In [86]:
# PMIDs whose rows in the gene table are counted in the next cell.
ukb_missing = [
    33226994, 33574263, 34626176, 36050321,
    36297015, 36311265, 36450729,
]

In [87]:
# Count gene-table rows for each PMID listed in `ukb_missing`.
in_ukb_missing = ravar_genes['PMID'].isin(ukb_missing)
ravar_genes.loc[in_ukb_missing, 'PMID'].value_counts()

PMID
33574263.0    204
36450729.0     36
33226994.0     17
36050321.0     15
36297015.0      7
36311265.0      6
34626176.0      3
Name: count, dtype: int64

In [89]:
# Match UKB traits first, then come back and extract info!
# There are exactly 2 traits not already covered. Just ignore these.

## Manual info for RAVAR/GWAS Catalog shared traits

In [5]:
# Read the condensed per-study trait table and preview its first rows.
study_info_path = os.path.join(datadir, 'study_info_condensed.txt')
study_info = pd.read_csv(study_info_path, sep='\t')
study_info.head()

Unnamed: 0,Reported Trait,Trait Label,Trait Ontology id,Method/Software,PMID
0,Abnormal findings on diagnostic imaging of oth...,abnormal result of diagnostic imaging,EFO:0009827,collapsing analyse,34375979
1,Biochemistry Total Protein measurement,total blood protein measurement,EFO:0004536,BOLT-LMM,34226706
2,Biochemistry Calcium measurement,calcium measurement,EFO:0004838,BOLT-LMM,34226706
3,Biochemistry SHBG measurement,sex hormone-binding globulin measurement,EFO:0004696,BOLT-LMM,34226706
4,Biochemistry Gamma Glutamyl transferase measur...,serum gamma-glutamyl transferase measurement,EFO:0004532,BOLT-LMM,34226706


In [38]:
# Manually curated study info; add a 1-based 'index' column numbering the rows in file order.
manual_info_path = os.path.join(datadir, 'RVStudyInfo_manual_long.tsv')
manual_info = pd.read_csv(manual_info_path, sep='\t')
manual_info['index'] = list(range(1, len(manual_info) + 1))

# Two study tables that are read separately and carry their own 'index' column.
s_36809768q = pd.read_csv('/cellar/users/snwright/Data/RareCommon/Annotations/rare_info/36809768_Q.tsv', sep='\t')
s_34662886 = pd.read_csv('/cellar/users/snwright/Data/RareCommon/Annotations/rare_info/34662886_All.tsv', sep='\t')

# All 'index' values covered by those two tables (34662886 first, then 36809768 Q).
replace_idx = [*s_34662886['index'].values, *s_36809768q['index'].values]

In [39]:
# Remove the results for studies mapped separately.
# Filter on the 1-based 'index' column explicitly: attribute access (`manual_info.index`)
# returns the DataFrame's own 0-based row index rather than the column named 'index',
# which would drop rows shifted by one relative to the values in `replace_idx`.
# NOTE(review): assumes the 'index' values in the separately mapped tables refer to this
# 1-based column — confirm against how those files were produced.
manual_info = manual_info[~manual_info['index'].isin(replace_idx)]

In [40]:
# Clean up info
# Use N-Actual where it is greater than 0; otherwise (zero, negative or missing) fall back to N.
# Vectorised with Series.where instead of a row-by-row apply.
n_actual = manual_info['N-Actual']
manual_info = manual_info.assign(N_final=n_actual.where(n_actual > 0, manual_info['N']))

In [41]:
def _resolve_classification(row):
    """Return 'Classification.1' when it is a string, otherwise 'Classification'."""
    revised = row['Classification.1']
    if isinstance(revised, str):
        return revised
    return row['Classification']


manual_info = manual_info.assign(Class_final=manual_info.apply(_resolve_classification, axis=1))

In [42]:
# Drop the raw columns now superseded by N_final / Class_final.
superseded_columns = ['N', 'N-Actual', 'Classification', 'Classification.1']
clean_info = manual_info.drop(columns=superseded_columns)

In [47]:
# Give the consolidated columns their final names and label the unnamed 12th column.
final_column_names = {
    'N_final': 'N',
    'Class_final': 'Classification',
    'Unnamed: 11': 'ExtraNote',
}
clean_info = clean_info.rename(columns=final_column_names)

In [48]:
# Display the cleaned table (pandas truncates the rendering to the first and last rows).
clean_info

Unnamed: 0,Reported Trait,Trait Label,Trait Ontology id,PMID,COHORT,Ancestry,Notes,ExtraNote,index,N,Classification
0,Abnormal findings on diagnostic imaging of oth...,abnormal result of diagnostic imaging,EFO:0009827,34375979,UKB,EUR,,,1,281104.0,Population CC/Q
1,Biochemistry Total Protein measurement,total blood protein measurement,EFO:0004536,34226706,UKB,EUR,,,2,400482.0,Population Q
2,Biochemistry Calcium measurement,calcium measurement,EFO:0004838,34226706,UKB,EUR,,,3,400792.0,Population Q
3,Biochemistry SHBG measurement,sex hormone-binding globulin measurement,EFO:0004696,34226706,UKB,EUR,,,4,397043.0,Population Q
4,Biochemistry Gamma Glutamyl transferase measur...,serum gamma-glutamyl transferase measurement,EFO:0004532,34226706,UKB,EUR,,,5,437651.0,Population Q
...,...,...,...,...,...,...,...,...,...,...,...
2916,Coronary artery disease,coronary artery disease,EFO:0001645,28642624,PDAY,"EUR, AFR",,,2917,985.0,Population CC
2917,Momp a antigen for chlamydia trachomatis,Chlamydia trachomatis infectious disease,EFO:0007205,34375979,UKB,EUR,,,2918,281104.0,Population CC/Q
2918,Gestational [pregnancy-induced] hypertension w...,preeclampsia,EFO:0000668,34375979,UKB,EUR,,,2919,281104.0,Population CC/Q
2919,VLDLTG L measurement,very low density lipoprotein cholesterol chang...,EFO:0020857,36419110,Finnish,EUR,T1D patients,,2920,738.0,Ascertained Q


In [51]:
# Rename the sample-size column of both study tables to 'N', the name used in clean_info.
n_column_rename = {'N_Actual': 'N'}
s_36809768q = s_36809768q.rename(columns=n_column_rename)
s_34662886 = s_34662886.rename(columns=n_column_rename)

In [52]:
# Stack the cleaned manual rows with the two separately loaded study tables, in this order.
info_frames = [clean_info, s_34662886, s_36809768q]
all_info = pd.concat(info_frames)

In [54]:
# Write the combined table as tab-separated text, without the row index.
all_info.to_csv(
    '/cellar/users/snwright/Data/RareCommon/Annotations/rare_info/cleaned_info_gwascat_matches.tsv',
    sep='\t',
    index=False,
)