In [1]:
import pandas as pd
import os
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import sys
# Put the local Network_Evaluation_Tools `neteval` directory on the import path
# so its modules can be imported by later cells.
# NOTE(review): no cell in this section imports from it -- confirm it is still needed.
sys.path.append('/cellar/users/snwright/Git/Network_Evaluation_Tools/neteval/')

In [43]:
# Project root for the rare/common variant data; the reference and output
# directories live directly underneath it.
datadir = '/cellar/users/snwright/Data/RareCommon'
# The trailing empty component keeps the trailing '/' on both directory paths.
refdir = os.path.join(datadir, 'Reference', '')
outdir = os.path.join(datadir, 'Annotations', '')

In [110]:
# Read the rare- and common-variant trait lists (single-column, headerless
# files) as arrays of the first column's values.
r_traitlist, c_traitlist = (
    pd.read_csv(os.path.join(datadir, 'inputs/Jan_2025', list_name), header=None)[0].values
    for list_name in ('rare.traitlist2', 'common.traitlist2')
)

In [111]:
# Pair the i-th rare trait with the i-th common trait as '<rare>_<common>'.
# The two lists are positional partners, so a length mismatch means the inputs
# are out of sync; fail loudly instead of truncating or raising a bare IndexError.
if len(r_traitlist) != len(c_traitlist):
    raise ValueError(
        f'Trait lists differ in length: {len(r_traitlist)} rare vs {len(c_traitlist)} common'
    )
all_pairs = [rare + '_' + common for rare, common in zip(r_traitlist, c_traitlist)]

In [8]:
# Show the first pair label to check the '<rare>_<common>' format.
all_pairs[0]

'34375979.0_EFO_0004631_GCST90000618_EFO_0004631'

## Input info
### GWAS

In [61]:
# Load the GWAS Catalog study metadata (tab-separated export).
gwas = pd.read_csv(os.path.join(datadir, 'GWASCatalog', 'study_info.v1.0.3.1_Jan29_2025.txt'), sep='\t')
# Keep only the studies of interest: the text before the first '_' of each
# common-variant trait entry is matched against 'STUDY ACCESSION'.
# NOTE: this filter follows c_traitlist, so it must be re-run if the study list changes.
all_studies = [c.split('_')[0] for c in c_traitlist]
# .copy() detaches the filtered rows from the full table so that later cells can
# add columns without triggering SettingWithCopyWarning.
gwas = gwas[gwas['STUDY ACCESSION'].isin(all_studies)].copy()

In [26]:
# Load the GWAS Catalog ancestry table (tab-separated). low_memory=False parses
# the file in a single pass rather than in chunks, avoiding mixed-type columns.
gwas_ancestry = pd.read_csv(os.path.join(datadir, 'GWASCatalog', 'ancestries.v1.0.3.1_Jan29_2025.txt'), sep='\t', low_memory=False)

### RAVAR

In [29]:
# Load the three RAVAR tables (publication info, trait info, manually curated
# PMID info); all are tab-separated and parsed in a single pass.
rv_pub, rv_trait, rv_pmid = (
    pd.read_csv(os.path.join(datadir, 'RAVAR', table_file), sep='\t', low_memory=False)
    for table_file in (
        'publication_allinfo_06112024.txt',
        'trait_allinfo_06112024.txt',
        'pmid_info_manual.txt',
    )
)

## Study Age


In [45]:
# Map each rare-variant study (PMID) to its publication year.
rv_year = rv_pub.set_index('PMID')['Publication Year'].to_dict()
# Write the mapping as a two-column table (StudyR, Year) to the annotations directory.
(
    pd.DataFrame({'Year': rv_year})
    .reset_index(names='StudyR')
    .to_csv(os.path.join(outdir, 'StudyR_Date.txt'), sep='\t', index=False)
)

In [37]:
# Accession and date for each retained GWAS study.
cv_year_df = gwas.loc[:, ['STUDY ACCESSION', 'DATE']]
# Year is the text before the first '-' of DATE, kept as a string.
# The .str accessor propagates missing dates as NaN, where the per-row
# lambda would raise AttributeError on them.
cv_year_df = cv_year_df.assign(Year=cv_year_df['DATE'].str.split('-').str[0])

In [46]:
# Map each common-variant study accession to its publication year.
cv_year = cv_year_df.set_index('STUDY ACCESSION')['Year'].to_dict()
# Write the mapping as a two-column table (StudyC, Year) to the annotations directory.
(
    pd.DataFrame({'Year': cv_year})
    .reset_index(names='StudyC')
    .to_csv(os.path.join(outdir, 'StudyC_Date.txt'), sep='\t', index=False)
)

## Study Size

In [83]:
# Look at one raw 'INITIAL SAMPLE SIZE' string to see the free-text format
# that has to be parsed into a number.
gwas['INITIAL SAMPLE SIZE'].values[1]

'1,783 Northern European ancestry cases, 7,435 Northern European ancestry controls'

In [91]:
import re
def sum_numbers_in_string(s: str) -> int:
    """Sum every integer that appears in a free-text string.

    Numbers may use commas as thousands separators, e.g.
    '1,783 cases, 7,435 controls' gives 9218.

    Parameters
    ----------
    s : str
        Text to scan. Non-string input (None, or the float NaN pandas uses for
        a missing cell) is treated as containing no numbers.

    Returns
    -------
    int
        Sum of all numbers found; 0 if there are none or `s` is not a string.
    """
    # Missing cells arrive as float NaN / None; re.findall would raise TypeError.
    if not isinstance(s, str):
        return 0
    # A number is a digit followed by any run of digits and commas.
    matches = re.findall(r'\d[\d,]*', s)
    # Strip the separators before converting each match, then add them up.
    return sum(int(num.replace(',', '')) for num in matches)

In [92]:
# Total sample size per study: the sum of all counts listed in the free-text
# 'INITIAL SAMPLE SIZE' field. assign() builds a new frame instead of writing
# into the filtered slice (avoids SettingWithCopyWarning) and matches how
# GENOTYPING_TECH_CLEAN is added later in the notebook.
gwas = gwas.assign(N=gwas['INITIAL SAMPLE SIZE'].apply(sum_numbers_in_string))

DATE ADDED TO CATALOG                                                 2024-05-08
PUBMED ID                                                               38658550
FIRST AUTHOR                                                               Cho C
DATE                                                                  2024-04-24
JOURNAL                                                               Nat Commun
LINK                                        www.ncbi.nlm.nih.gov/pubmed/38658550
STUDY                          Large-scale cross-ancestry genome-wide meta-an...
DISEASE/TRAIT                                                 Serum urate levels
INITIAL SAMPLE SIZE            219,768 East Asian ancestry individuals, 677,3...
REPLICATION SAMPLE SIZE                                                      NaN
PLATFORM [SNPS PASSING QC]                                NR [9487379] (imputed)
ASSOCIATION COUNT                                                            351
MAPPED_TRAIT                

## Study Ancestry

## Study Classification

## Sequencing Method

In [67]:
# Normalise the genotyping technology label: keep the text before the first
# '[' (dropping any bracketed suffix) and trim surrounding whitespace.
# The .str accessor leaves missing values as NaN, where the per-row lambda
# would raise AttributeError on them.
gwas = gwas.assign(
    GENOTYPING_TECH_CLEAN=(
        gwas['GENOTYPING TECHNOLOGY']
        .str.split('[', regex=False)
        .str[0]
        .str.strip()
    )
)

In [70]:
# Inspect the studies whose cleaned technology label lists all four of these
# technologies.
gwas[gwas.GENOTYPING_TECH_CLEAN=='Genome-wide genotyping array, Targeted genotyping array, Exome genotyping array, Genome-wide sequencing']

Unnamed: 0,DATE ADDED TO CATALOG,PUBMED ID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,SUBMISSION DATE,STATISTICAL MODEL,BACKGROUND TRAIT,MAPPED BACKGROUND TRAIT,MAPPED BACKGROUND TRAIT URI,COHORT,FULL SUMMARY STATISTICS,SUMMARY STATS LOCATION,GXE,GENOTYPING_TECH_CLEAN
35869,2021-07-27,34187551,McCartney DL,2021-06-29,Genome Biol,www.ncbi.nlm.nih.gov/pubmed/34187551,Genome-wide association studies identify 137 g...,DNA methylation-estimated granulocyte proportions,"34,470 European ancestry individuals, 6,152 Af...",,...,,,,,,JHS|BHS|GENOA|GTP|AIRWAVE|BIB|BHS|BLSA|DTR|EGC...,yes,http://ftp.ebi.ac.uk/pub/databases/gwas/summar...,no,"Genome-wide genotyping array, Targeted genotyp..."


In [71]:
# Count how many retained studies use each cleaned technology label.
gwas.GENOTYPING_TECH_CLEAN.value_counts()

GENOTYPING_TECH_CLEAN
Genome-wide genotyping array                                                                               940
Genome-wide genotyping array, Genome-wide sequencing                                                        34
Exome genotyping array                                                                                      14
Targeted genotyping array                                                                                   12
Genome-wide sequencing                                                                                       9
Genome-wide genotyping array, Exome-wide sequencing                                                          4
Genome-wide genotyping array, Targeted genotyping array                                                      4
Exome-wide sequencing                                                                                        4
Genome-wide genotyping array, Exome genotyping array                                      

# Trait Type

In [118]:
# Load the trait classification table (tab-separated).
# NOTE(review): the displayed head shows an 'Unnamed: 0' column, so the file
# appears to carry a saved index -- consider index_col=0 if it is not needed.
trait_df = pd.read_csv('/cellar/users/snwright/Data/Transfer/RVC/TraitClassification.tsv', sep='\t')

In [119]:
# Preview the first rows of the trait classification table.
trait_df.head()

Unnamed: 0,EFO,Sub-Category,Category
0,EFO:0000319,Cardiovascular,Cardiovascular
1,EFO:0000537,Cardiovascular,Cardiovascular
2,EFO:0000612,Cardiovascular,Cardiovascular
3,EFO:0009289,Cardiovascular,Cardiovascular
4,EFO:0004269,Cardiovascular,Cardiovascular
