# Loading gnomAD data
##### Updated 06/03/2024
##### Selin Kubali

#### Goal:
Extract gnomAD information for all exomes in each given gene.

#### Required inputs
- gnomAD VCFs for each gene. Found in *selected_genes/hcm/gnomAD/gnomAD_gene_vcfs*

#### Output
CSV files (one `<gene>_gnomAD.csv` per gene) containing gnomAD information – CADD, phyloP, allele frequency max, SpliceAI max, REVEL, Ensembl VEP consequence and confidence for canonical transcripts – for the given genes. Uploaded to and found in *selected_genes/hcm/gnomAD/gnomAD_csvs*.

## Download VCFs

In [None]:
# Switch the dx client to the folder holding the per-gene gnomAD VCFs, then
# download the files matching vcf_*.gz into the local working directory.
!dx cd /selected_genes/hcm/gnomAD/gnomAD_gene_vcfs/
!dx download vcf_*.gz

In [None]:
import pandas as pd

## List of genes

In [3]:
# Gene symbols to process; each one is expected to have a matching
# 'vcf_for_<gene>.gz' file in the working directory.
genes = [
    "ACTN2",
    "ALPK3",
    "DES",
    "FLNC",
    "MYBPC3",
    "MYH6",
    "MYH7",
    "PLN",
    "PTPN11",
    "TNNI3",
    "TTR",
    "TNNT2",
    "TPM1",
    "MYL2",
    "MYL3",
    "ACTC1",
]

### Transform information given in gnomAD VCFs into a dataframe

In [19]:
def parse_string(info_str):
    """Extract the annotations of interest from one gnomAD VCF INFO string.

    The INFO string is a ';'-separated list of ``key=value`` pairs and bare
    flags. Each pair is split on its first '=' only, so a value that itself
    contains '=' (as the VEP annotation string may) is kept intact instead
    of breaking the key/value unpacking.

    Args:
        info_str: Raw INFO field of a single VCF record.

    Returns:
        dict mapping each of 'cadd_raw_score', 'phylop', 'fafmax_faf95_max',
        'spliceai_ds_max', 'revel_max' and 'vep' to its string value, or to
        'NA' when that key is absent from ``info_str``.
    """
    wanted_keys = ['cadd_raw_score', 'phylop', 'fafmax_faf95_max', 'spliceai_ds_max', 'revel_max', 'vep']

    features_dict = {}
    for pair in info_str.split(';'):
        key, separator, value = pair.partition('=')
        # Entries without '=' are flags (or empty); they carry no value.
        if separator:
            features_dict[key] = value

    return {key: features_dict.get(key, 'NA') for key in wanted_keys}

In [20]:
def filter_list_of_lists(lst):
    """Keep selected VEP fields for canonical Ensembl transcript entries.

    Args:
        lst: VEP annotation entries for one variant, each already split on
            '|' into a list of field strings.

    Returns:
        One 5-item list per entry whose field 24 equals 'YES' and whose
        field 32 equals 'Ensembl', holding fields 1, 3, 24, 32 and 42 in
        that order. Entries too short to contain field 42 are skipped.
    """
    canonical_idx = 24
    source_idx = 32
    wanted = (1, 3, canonical_idx, source_idx, 42)
    # The entry must be long enough for the highest index read below;
    # otherwise indexing field 42 would raise IndexError.
    min_length = max(wanted) + 1

    return [
        [fields[idx] for idx in wanted]
        for fields in lst
        if len(fields) >= min_length
        and fields[canonical_idx] == 'YES'
        and fields[source_idx] == 'Ensembl'
    ]

In [21]:
def fill_na(row):
    """Return ``row`` if it is a list, otherwise five empty-string placeholders.

    Non-list values (for example the NaN left behind when an empty list is
    exploded) are replaced so every row can be expanded into five columns.

    Args:
        row: A list of VEP fields, or any non-list placeholder value.

    Returns:
        ``row`` itself when it is a list, else ``['', '', '', '', '']``.
    """
    if isinstance(row, list):
        return row
    return [''] * 5

In [22]:
def clean_data(data):
    """Build a per-variant, per-canonical-transcript table from VCF records.

    Args:
        data: DataFrame with at least the columns 'Chrom', 'Pos', 'Ref',
            'Alt', 'Filter' and 'Info' (the raw VCF INFO string).

    Returns:
        DataFrame with 'Chrom', 'Pos', 'Ref', 'Alt', 'Filter', the
        annotation columns produced by ``parse_string`` (minus 'vep'), and
        'Consequence', 'Gene', 'Canonical', 'Source', 'Confidence' taken
        from the VEP annotation. There is one row per canonical Ensembl
        transcript entry; a variant with no such entry keeps a single row
        with empty strings in those five columns.
    """
    # Convert the parsed INFO dictionaries into a dataframe. Reuse data's
    # index so the column-wise concat lines rows up by label even when
    # `data` does not carry a default 0..n-1 index.
    info_df = pd.DataFrame(data['Info'].apply(parse_string).tolist(), index=data.index)
    chrom_data = pd.concat([data[['Chrom', 'Pos', 'Ref', 'Alt', 'Filter']], info_df], axis=1)

    def canonical_entries(vep_value):
        # The VEP string holds ','-separated entries of '|'-separated fields.
        entries = [entry.split('|') for entry in vep_value.split(',')]
        return filter_list_of_lists(entries)

    # Split information given in VEP and filter for canonical transcripts
    chrom_data['vep'] = chrom_data['vep'].apply(canonical_entries)
    # One row per retained entry; an empty list becomes a single NaN row,
    # which fill_na turns into empty-string placeholders.
    chrom_data = chrom_data.explode('vep')
    chrom_data['vep'] = chrom_data['vep'].apply(fill_na)

    # Convert VEP data into multiple columns
    vep_columns = ['Consequence', 'Gene', 'Canonical', 'Source', 'Confidence']
    chrom_data[vep_columns] = pd.DataFrame(chrom_data['vep'].tolist(), index=chrom_data.index)
    return chrom_data.drop('vep', axis=1)

In [23]:
# For every gene: load its gnomAD VCF, flatten it with clean_data, keep the
# rows annotated for that gene, and write them to '<gene>_gnomAD.csv'.
for gene in genes:
    data = pd.read_csv(
        f'vcf_for_{gene}.gz',
        sep='\t',
        # '?' is a placeholder name for the sixth column, which is not loaded.
        names=['Chrom', 'Pos', 'ID', 'Ref', 'Alt', '?', 'Filter', 'Info'],
        usecols=['Chrom', 'Pos', 'Ref', 'Alt', 'Filter', 'Info'],
        low_memory=False,
        comment='#',
        compression='gzip',
        memory_map=True,
    )
    chrom_data = clean_data(data)
    # Drop annotations belonging to other genes (and placeholder rows).
    chrom_data = chrom_data[chrom_data['Gene'] == gene]

    chrom_data.to_csv(f'{gene}_gnomAD.csv')

In [None]:
# Create the output folder (-p: including parents, no error if it exists),
# switch to it, and upload the per-gene '*_gnomAD.csv' files written locally.
!dx mkdir -p /selected_genes/hcm/gnomAD/gnomAD_csvs
!dx cd /selected_genes/hcm/gnomAD/gnomAD_csvs
!dx upload *_gnomAD.csv