In [1]:
import pandas as pd
import numpy as np
import h5py
import csv
import os
from random import seed, random

# Generate GWAS Data by Disease for Semantic Zoom

We will use a GWAS file (`gwas_catalog_v1.0.2-associations_e104_r2021-10-06.tsv`, https://www.ebi.ac.uk/gwas/docs/file-downloads) as input to generate the following two files:
1. HiGlass' BEDDB file: This file selectively stores variant information for a given zoom level.
2. HiGlass' multivec file: This file stores the frequency of variants for every binned genomic region.

These two files can be used together in a single Gosling.js track to allow semantic zooming:
- When zoomed out, show density information
- When zoomed in, show detailed information

Find information about the file header of GWAS files at https://www.ebi.ac.uk/gwas/docs/fileheaders.

```sh
clodius aggregate bedfile --chromsizes-filename hg38.txt --delimiter $'\t' --importance-column 6 --max-per-tile 80 gwas.v1.bed

python manage.py ingest_tileset --filename data/gwas.v1.beddb --filetype beddb --datatype bedlike --uid gwas-v1-beddb
```

## Create The Two Files

In [2]:
# Name of the downloaded GWAS catalog file.
original_file_name = 'gwas_catalog_v1.0.2-associations_e104_r2021-10-06.tsv'

# Every input and output path is resolved against the relative `data` directory:
# the catalog itself, the BED output, the HDF5 (multivec) output and the
# chromosome-sizes table.
original_file, bed_file, mv5_file, chr_file = (
    os.path.join('data', name)
    for name in (original_file_name, 'gwas.bed', 'gwas.hdf5', 'hg38_full.txt')
)

In [3]:
# Chromosomes we are interested in: 1-22 followed by X and Y,
# written without the 'chr' prefix.
CHR_FILTER = [str(autosome) for autosome in range(1, 23)] + ['X', 'Y']

In [4]:
# Load the chromosome-sizes table: headerless, tab-separated (name, length) rows.
# `sep` is passed by keyword because pandas only accepts the file path
# positionally in recent versions (older versions emit a FutureWarning).
df = pd.read_csv(chr_file, sep='\t', header=None, names=['chr', 'length'])

# Keep only the chromosomes of interest, matched on their 'chr'-prefixed names.
df = df[df.chr.isin(['chr' + c for c in CHR_FILTER])]

# Combined length of the retained chromosomes.
TOTAL_CHROM_LEN = df.length.sum()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Load the GWAS catalog (tab-separated). `sep` is passed by keyword because
# pandas only accepts the file path positionally in recent versions
# (older versions emit a FutureWarning).
odf = pd.read_csv(original_file, sep='\t')

# Drop the catalog columns that are not kept in the filtered table.
odf = odf.drop(columns=[
    'INITIAL SAMPLE SIZE', 'JOURNAL', 'FIRST AUTHOR', 'DATE', 'PUBMEDID',
    'DATE ADDED TO CATALOG', 'REGION', 'UPSTREAM_GENE_ID', 'DOWNSTREAM_GENE_ID',
    'OR or BETA', '95% CI (TEXT)', 'REPLICATION SAMPLE SIZE',
    'UPSTREAM_GENE_DISTANCE', 'DOWNSTREAM_GENE_DISTANCE', 'RISK ALLELE FREQUENCY',
    'INTERGENIC', 'STUDY', 'STUDY ACCESSION', 'MAPPED_TRAIT_URI',
    'PLATFORM [SNPS PASSING QC]', 'MERGED', 'SNP_GENE_IDS',
    'STRONGEST SNP-RISK ALLELE', 'GENOTYPING TECHNOLOGY', 'CONTEXT'
])

# Keep the associations whose reported trait or mapped trait mentions "cancer".
# The match is a case-insensitive literal substring test; missing / non-string
# values count as "no match" (na=False).
odf = odf[
    odf['DISEASE/TRAIT'].str.contains('CANCER', case=False, regex=False, na=False)
    | odf['MAPPED_TRAIT'].str.contains('CANCER', case=False, regex=False, na=False)
]

odf.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,LINK,DISEASE/TRAIT,CHR_ID,CHR_POS,REPORTED GENE(S),MAPPED_GENE,SNPS,SNP_ID_CURRENT,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),CNV,MAPPED_TRAIT
981,www.ncbi.nlm.nih.gov/pubmed/29059683,Breast cancer,18,32526319,NR,WBP11P1 - NA,rs191218575,191218575,8e-06,5.09691,(EA),N,breast carcinoma
982,www.ncbi.nlm.nih.gov/pubmed/29059683,Breast cancer,10,51918266,NR,PRKG1,rs1937680,1937680,3e-07,6.522879,(EA),N,breast carcinoma
983,www.ncbi.nlm.nih.gov/pubmed/29059683,Breast cancer,1,118111143,NR,SPAG17,rs1962373,1962373,1e-06,6.0,(EA),N,breast carcinoma
984,www.ncbi.nlm.nih.gov/pubmed/29059683,Breast cancer,4,175011701,NR,ADAM29 - NA,rs1966977,1966977,6e-07,6.221849,(EA),N,breast carcinoma
985,www.ncbi.nlm.nih.gov/pubmed/29059683,Breast cancer,2,25068670,NR,EFR3B,rs1971136,1971136,5e-09,8.30103,(EA),N,breast carcinoma


# Distinct MAPPED_TRAIT strings of the filtered associations (missing values skipped).
# This definition must be active: everything below reads `traits`.
traits = odf[~odf['MAPPED_TRAIT'].isnull()]['MAPPED_TRAIT'].unique().tolist()

# A MAPPED_TRAIT string may list several traits separated by ', ';
# split them, de-duplicate, and sort alphabetically.
unique_traits = sorted({trait for trait_str in traits for trait in trait_str.split(', ')})

print(len(unique_traits))
unique_traits
traits

In [None]:
# Save the filtered table under data/, prefixing the original file name with 'filtered_'.
# NOTE(review): `to_csv` is called without `sep`, so the output is comma-separated
# (with the index as first column) even though the file name keeps the `.tsv`
# suffix -- confirm this is what the consumer of the file expects.
filtered_file = os.path.join('data', 'filtered_' + original_file_name)
odf.to_csv(filtered_file)

# Below Is Deprecated

In [12]:
# DISEASE/TRAIT labels related to breast cancer.
# NOTE(review): no active code in the visible notebook reads this list; the only
# reference is a commented-out category check in the export loop.
CATEGORY_FILTER = [
    'Anthracycline-induced cardiotoxicity in early breast cancer',
    'Invasive breast cancer',
    'Persistent chemotherapy-induced alopecia in breast cancer',
    'Sensory peripheral neuropathy in microtubule targeting agent-treated breast cancer',
    'Adverse response to chemotherapy (amenorrhea) in breast cancer',
    'Acute post-radiotherapy pain in breast cancer',
    'BRCA1/2-negative high-risk breast cancer',
    'BRCA1 mutation in breast cancer',
    'BRCA2 mutation in breast cancer',
    'Survival in breast cancer (estrogen-receptor positive)',
    'Plasma anastrozole concentration in anastrozole-treated estrogen receptor positive breast cancer',
    'Breast cancer specific mortality in breast cancer',
    'Breast cancer specific mortality in estrogen receptor negative breast cancer',
    'Breast cancer specific mortality in estrogen receptor positive breast cancer',
    'Response to tamoxifen in oestrogen receptor positive/HER2 negative breast cancer',
    'Disease-free survival in breast cancer',
    'Estrone conjugates/estrone ratio in resected early stage estrogen-receptor positive breast cancer',
    'Estrone/androstenedione ratio in resected early stage-receptor positive breast cancer',
    'Plasma estrone levels in resected estrogen-receptor positive breast cancer',
    'Plasma androstenedione levels in resected early stage-receptor positive breast cancer',
    'Plasma estrone conjugates levels in resected early stage estrogen-receptor positive breast cancer',
    'Lobular breast cancer (menopausal hormone therapy interaction)',
    'Estrogen receptor status in HER2 negative breast cancer',
    'Estrogen receptor status in breast cancer',
    'Taxane-induced peripheral neuropathy in breast cancer',
    'HER2 status in breast cancer',
    'Survival in endocrine treated breast cancer (estrogen-receptor positive)',
    'Response to tamoxifen in breast cancer',
    'Estradiol plasma levels (breast cancer)',
    'Adverse response to chemotherapy in breast cancer (alopecia) (anti-microtubule)',
    'Adverse response to chemotherapy in breast cancer (alopecia)',
    'Adverse response to chemotherapy in breast cancer (alopecia) (cyclophosphamide+epirubicin+/-5FU)',
    'Adverse response to chemotherapy in breast cancer (alopecia) (cyclophosphamide+doxorubicin+/-5FU)',
    'Adverse response to chemotherapy in breast cancer (alopecia) (paclitaxel)',
    'Adverse response to chemotherapy in breast cancer (alopecia) (docetaxel)',
    'Response to chemotherapy in breast cancer (hypertension) (bevacizumab)',
    'Response to chemotherapy in breast cancer hypertensive cases (cumulative dose) (bevacizumab)',
    'Change in LVEF in response to paclitaxel and trastuzumab in HER2+ breast cancer'
]

# Distinct DISEASE/TRAIT labels containing the (case-sensitive) text "breast cancer";
# rows with a missing label are treated as non-matching.
is_breast_cancer = odf['DISEASE/TRAIT'].str.contains("breast cancer", na=False)
diseases = odf.loc[is_breast_cancer, 'DISEASE/TRAIT'].unique().tolist()

len(diseases)
# Cell output: number of distinct DISEASE/TRAIT labels in the whole table.
len(odf['DISEASE/TRAIT'].unique())

5642

In [13]:
# For the extracted keys, refer to https://www.ebi.ac.uk/gwas/docs/fileheaders
# Catalog columns:
#   DATE ADDED TO CATALOG, PUBMEDID, FIRST AUTHOR, DATE, JOURNAL, LINK, STUDY,
#   DISEASE/TRAIT, INITIAL SAMPLE SIZE, REPLICATION SAMPLE SIZE, REGION, CHR_ID,
#   CHR_POS, REPORTED GENE(S), MAPPED_GENE, UPSTREAM_GENE_ID, DOWNSTREAM_GENE_ID,
#   SNP_GENE_IDS, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE,
#   STRONGEST SNP-RISK ALLELE, SNPS, MERGED, SNP_ID_CURRENT, CONTEXT, INTERGENIC,
#   RISK ALLELE FREQUENCY, P-VALUE, PVALUE_MLOG, P-VALUE (TEXT), OR or BETA,
#   95% CI (TEXT), PLATFORM [SNPS PASSING QC], CNV
#
# NOTE(review): this cell reads `record.PUBMEDID` and `record.DATE`. The loading
# cell of this notebook drops both columns from `odf`, so on a fresh
# "Restart & Run All" these lookups fail -- confirm which `odf` this deprecated
# cell is meant to run against.

# Seed the RNG so the random tie-breaker added to `importance` is reproducible.
seed(1)

# Column order of the BED rows (the file is written without a header line).
# Declared up front so writing still works when no record passes the filters.
headers = [
    'chr', 'start', 'end', 'pubmedid', 'date', 'link', 'pval',
    'importance', 'disease', 'pvalmlog', 'pvaltext',
]

# Length of every chromosome of interest, looked up once from the sizes table.
chrom_lengths = {c: int(df[df.chr == ('chr' + c)].length.sum()) for c in CHR_FILTER}

# One (length, 1) counter array per chromosome. Allocated directly with the
# integer dtype used for the HDF5 datasets ("i") instead of the float64 default.
density_dict = {c: np.zeros((chrom_lengths[c], 1), dtype='i') for c in CHR_FILTER}

# The multivec data has a single column holding the per-position frequency.
column_index = 0

output = []

for index, record in odf.iterrows(): # Oh, this is so wrong..
    # Optional restriction to the curated categories:
    # if record['DISEASE/TRAIT'] not in CATEGORY_FILTER:
    #     continue

    # Skip associations that are not on a chromosome of interest ...
    if record.CHR_ID not in CHR_FILTER:
        continue

    # ... or whose position is not a plain non-negative integer.
    if not str(record.CHR_POS).isnumeric():
        continue

    position = int(record.CHR_POS)

    # BED: one row per association, spanning a single base.
    data_clean = {
        'chr': 'chr' + record.CHR_ID,
        'start': position,
        'end': position + 1,
        'pubmedid': record.PUBMEDID,
        'date': record.DATE,
        'link': record.LINK,
        'pval': record['P-VALUE'],
        # Smaller p-values get a larger importance; random() breaks ties.
        'importance': 1 / (record['P-VALUE'] + 0.0001) + random(),
        'disease': record['DISEASE/TRAIT'],
        'pvalmlog': record['PVALUE_MLOG'],
        'pvaltext': record['P-VALUE (TEXT)'],
    }
    output.append(data_clean)

    # MV5: add frequency by 1 at the association's position.
    density_dict[record.CHR_ID][position, column_index] += 1

print("Writing to BED file")
# newline='' is what the csv module requires of files it writes to; without it
# extra '\r' characters are emitted on platforms that translate line endings.
with open(bed_file, 'w', newline='') as opf:
    myWriter = csv.DictWriter(opf, delimiter='\t', fieldnames=headers)
    myWriter.writerows(output)

print("Writing to hdf5 file")
# The HDF5 file is opened only now and inside a `with` block, so it is always
# closed and is not truncated if the loop above fails.
with h5py.File(mv5_file, "w") as mv5_data:
    for c in CHR_FILTER:
        print("chr" + c)
        mv5_data.create_dataset(('chr' + c), (chrom_lengths[c], 1), "i", data=density_dict[c], compression='gzip')

Writing to BED file
Writing to hdf5 file
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY


Then, run the following commands:

```sh
# Aggregate the BED file written by the notebook.
# `importance` is the 8th field of each BED row the notebook writes
# (presumably --importance-column is 1-based; verify against the clodius docs).
clodius aggregate bedfile \
    --chromsizes-filename hg38_full.txt \
    --delimiter $'\t' \
    --importance-column 8 \
    --max-per-tile 160 \
    gwas.bed

# Aggregate the per-position frequency data (gwas.hdf5) written by the notebook.
clodius aggregate multivec \
    --chromsizes-filename hg38_full.txt \
    --starting-resolution 1 \
    gwas.hdf5

# Register the aggregated BED tileset with the server under the uid gwas-beddb.
python manage.py ingest_tileset \
    --filename gwas.bed.beddb \
    --filetype beddb \
    --datatype bedlike \
    --uid gwas-beddb
```