# Adding gnomAD and AlphaMissense information to VCF files

##### Updated 06/03/2024
##### Selin Kubali

##### Goal
Take the cleaned files from *selected_genes/hcm/cleaned_vcf_files* and add gnomAD data and AlphaMissense scores. Filter for rare variants and exclude low-quality variants. Extrapolate missing predictor values to create, for each gene, a list of rare variants with predicted conservation and pathogenicity. Combine the variant file with the list of patients and their demographic information to create a file for each gene that contains all patients in UK Biobank, their demographic information, and any relevant information about the variants they may carry.

##### Input
Cleaned VCF files from *selected_genes/hcm/cleaned_vcf_files*

CSV files with AlphaMissense scores, found in *selected_genes/hcm/alpha_missensense_annotated*

CSV files with gnomAD information, found in *selected_genes/hcm/gnomAD/gnomAD_csvs*

Space-separated (`.ssv`) files with patient carrier information, found in *selected_genes/hcm/vcf_files/carrier*

CSV file with patient demographic information, found in *selected_genes/hcm/csv_files*

CSV file with polygenic risk scores for HCM for each patient, found in *selected_genes/hcm/csv_files*


##### Output
CSV for each gene with list of demographic and variant information for each patient, uploaded to *selected_genes/hcm/cox_model_data*
Variant information for each gene, uploaded to *selected_genes/hcm/variant_files*
___

In [1]:
# load packages
import pandas as pd
from pandas.api.types import CategoricalDtype
from random import shuffle

In [2]:
def load_variants(gene, dir_path = "selected_genes/hcm/cleaned_vcf_files"):
    """Load the cleaned, tab-separated VCF table for one gene.

    The VCF column headers are renamed to the capitalised names used in the
    rest of the notebook, only rows whose FILTER field is '.' are kept, and
    the result is reduced to the variant key columns plus the variant name.

    Parameters
    ----------
    gene : str
        Gene symbol used to build the file name `{gene}_variants_cleaned.vcf`.
    dir_path : str
        Directory, relative to `/mnt/project`, that holds the file.

    Returns
    -------
    pandas.DataFrame
        Columns `Chrom`, `Pos`, `Ref`, `Alt`, `Name`.
    """
    column_names = {
        '#CHROM': 'Chrom',
        'POS': 'Pos',
        'REF': 'Ref',
        'ALT': 'Alt',
        'ID': 'Name',
        'FILTER': 'Filter',
    }
    vcf_table = pd.read_csv(f'/mnt/project/{dir_path}/{gene}_variants_cleaned.vcf', sep = "\t")
    vcf_table = vcf_table.rename(columns = column_names)

    # Keep only the records whose FILTER field is '.'.
    unflagged = vcf_table[vcf_table['Filter'] == '.']
    return unflagged[['Chrom', 'Pos', 'Ref', 'Alt', 'Name']]

In [3]:
def load_gnomAD(gene, dir_path = "selected_genes/hcm/gnomAD/gnomAD_csvs"):
    """Load the gnomAD annotation CSV for one gene.

    Reads `/mnt/project/{dir_path}/{gene}_gnomAD.csv` and exposes the
    `fafmax_faf95_max` column under the shorter name `faf_max` (the original
    column is kept as well).

    Parameters
    ----------
    gene : str
        Gene symbol used to build the file name.
    dir_path : str
        Directory, relative to `/mnt/project`, that holds the file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a `faf_max` column.
    """
    table = pd.read_csv(f'/mnt/project/{dir_path}/{gene}_gnomAD.csv', sep = ",")
    return table.assign(faf_max = table['fafmax_faf95_max'])

In [4]:
def load_alphamissense(gene, dir_path = "selected_genes/hcm/alpha_missensense_annotated"):
    """Load the AlphaMissense annotation CSV for one gene.

    Parameters
    ----------
    gene : str
        Gene symbol used to build the file name `{gene}_alphamissense.csv`.
    dir_path : str
        Directory, relative to `/mnt/project`, that holds the file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, unmodified.
    """
    csv_location = f'/mnt/project/{dir_path}/{gene}_alphamissense.csv'
    return pd.read_csv(csv_location, sep = ",")

### List of genes

In [5]:
# Genes processed by the annotation cell that follows.
genes = [
    "ACTN2", "ALPK3", "DES", "FLNC",
    "MYBPC3", "MYH6", "MYH7", "PLN",
    "PTPN11", "TNNI3", "TTR", "TNNT2",
    "TPM1", "MYL2", "MYL3", "ACTC1",
]

In [6]:
# For every gene: annotate its variants with gnomAD and AlphaMissense data,
# keep the rare, PASS-filtered ones, and write `{gene}_updated.csv` locally.
for gene in genes:
    gnomAD = load_gnomAD(gene)
    variants = load_variants(gene)
    alphamissense = load_alphamissense(gene)

    # Left-join so variants without a gnomAD record are kept (with NaN annotations).
    gene_with_gnomAD = pd.merge(variants, gnomAD, how = 'left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])

    # Filter for rare variants: faf_max at most 0.001, or no faf_max at all.
    gene_with_gnomAD = gene_with_gnomAD.loc[
        gene_with_gnomAD["faf_max"].le(0.001) | gene_with_gnomAD["faf_max"].isna()
    ]

    gene_gnomAD_ai = pd.merge(gene_with_gnomAD, alphamissense, how = 'left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])
    gene_gnomAD_ai = gene_gnomAD_ai.drop(
        columns = ['Unnamed: 0_x', 'Unnamed: 0_y', 'fafmax_faf95_max', 'Gene', 'Canonical', 'Source']
    )

    # Filter for read quality: keep rows whose Filter value is 'PASS'.
    gene_gnomAD_ai = gene_gnomAD_ai.loc[gene_gnomAD_ai['Filter'] == 'PASS']
    gene_gnomAD_ai.to_csv(f"{gene}_updated.csv")

## Edit variants

In [None]:
def comp_predictors_extreme_val(df, col):
    """Return the smallest non-missing value of a computational score column.

    Only the minimum is implemented. For a column name that is not one of
    the recognised predictor columns the function returns None.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `col`.
    col : str
        Name of the score column.

    Returns
    -------
    The minimum observed value of `df[col]`, or None for an unrecognised column.
    """
    recognised = ['am_pathogenicity', 'revel_max', 'cadd_raw_score', 'faf_max', 'phylop', 'spliceai_ds_max']
    if col not in recognised:
        return None

    observed = df[col].dropna()
    return min(observed)
                  

In [7]:
def define_comp_predictors(df, col):
    """Fill missing values of one predictor column from same-consequence variants.

    Each missing value of `col` is replaced by the mean of `col` over the rows
    that share the same `Consequence`. Two special cases apply to rows whose
    consequence is 'synonymous_variant' and whose group has no observed value:

    * if `col` has no observed value anywhere in `df`, the column is dropped;
    * otherwise those rows receive `comp_predictors_extreme_val(df, col)`.

    Missing values in any other consequence group that has no observed value
    are left missing.

    The group means are computed once, rather than once per row, and rows are
    addressed with boolean masks so a non-unique index is handled correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Variant table with a `Consequence` column and the column `col`.
        Filled values are written into this frame in place.
    col : str
        Name of the predictor column to fill.

    Returns
    -------
    pandas.DataFrame
        `df` with `col` filled, or a new frame without `col` when the column
        had to be dropped.
    """
    missing = df[col].isna()
    if not missing.any():
        return df

    consequence = df['Consequence']

    # Mean of the observed scores per consequence, broadcast back to each row.
    col_means_grouped = df.groupby(by='Consequence')[col].mean()
    row_means = consequence.map(col_means_grouped)

    # Synonymous rows whose consequence group has no observed score at all.
    needs_fallback = missing & row_means.isna() & (consequence == 'synonymous_variant')
    fallback_required = needs_fallback.any()

    if fallback_required and df[col].notna().sum() == 0:
        # Nothing to extrapolate from: the whole column is uninformative.
        return df.drop([col], axis = 1)

    df.loc[missing, col] = row_means[missing]

    if fallback_required:
        # No group mean exists for these synonymous rows; use the extreme value instead.
        df.loc[needs_fallback, col] = comp_predictors_extreme_val(df, col)

    return df

In [10]:
def get_variants(gene, dir_path=None):
    """Load the annotated variant table for one gene and fill in missing scores.

    Reads `{gene}_updated.csv`, collapses loss-of-function style consequences
    into the single label 'deleterious', keeps only synonymous, missense and
    deleterious variants, and fills missing values of each computational
    predictor column via `define_comp_predictors`.

    Parameters
    ----------
    gene : str
        Gene symbol used to build the file name `{gene}_updated.csv`.
    dir_path : str, optional
        Directory that contains the file. When omitted the file is read from
        the current working directory, as before. Accepting this argument makes
        the two-argument call `get_variants(gene, var_path)` valid.

    Returns
    -------
    pandas.DataFrame
        The cleaned variant table. A predictor column may be absent if
        `define_comp_predictors` dropped it.
    """
    file_name = f'{gene}_updated.csv'
    path = file_name if dir_path is None else f'{dir_path}/{file_name}'
    variants = pd.read_csv(path)
    variants = variants.drop(['Unnamed: 0'], axis = 1)

    # define deleterious mutations
    deleterious_list = ["stop_gained", "start_lost", "splice_acceptor_variant", "splice_donor_variant", "splice_donor_region_variant","splice_region_variant","frameshift_variant"]

    def _is_deleterious(consequence):
        # A missing consequence (NaN) is not a string and cannot be matched.
        return isinstance(consequence, str) and any(term in consequence for term in deleterious_list)

    deleterious = variants['Consequence'].apply(_is_deleterious).astype(bool)
    variants.loc[deleterious, 'Consequence'] = 'deleterious'

    # Copy so the in-place fills below act on an independent frame, not a slice.
    variants = variants[variants['Consequence'].isin(['synonymous_variant','missense_variant','deleterious'])].copy()

    # Extrapolate to fill in NA values
    predictor_columns = ('revel_max', 'am_pathogenicity', 'cadd_raw_score', 'faf_max', 'phylop', 'spliceai_ds_max')
    for predictor in predictor_columns:
        variants = define_comp_predictors(variants, predictor)

    return variants

In [None]:
genes = [
    "ACTC1", "ACTN2", "ALPK3", "DES",
    "FLNC", "MYBPC3", "MYH6", "MYH7",
    "MYL2", "MYL3", "PLN", "PTPN11",
    "TNNI3", "TNNT2", "TPM1", "TTR",
]
# Write the cleaned variant table of every gene to `{gene}_variants.csv`.
for g in genes:
    variants = get_variants(g)
    file_name = f'{g}_variants.csv'
    variants.to_csv(file_name, index=False)

In [None]:
# Upload every local *_variants.csv file to the variant_files folder on the platform.
# NOTE(review): mkdir names the "Cassa Lab Shared Project" explicitly, whereas cd and
# upload act on whichever project is currently selected - confirm they are the same.
!dx mkdir -p "Cassa Lab Shared Project:selected_genes/hcm/variant_files/"
!dx cd /selected_genes/hcm/variant_files/
!dx upload *_variants.csv

## All patients

In [20]:
def get_carriers(gene, dir_path="selected_genes/hcm/vcf_files/carrier"):
    """Load the variant-to-patient mapping for one gene in long format.

    Reads the space-separated file `/mnt/project/{dir_path}/{gene}.ssv`, whose
    last field is a '|'-delimited list of carriers, and returns one row per
    (variant, carrier) pair. Carrier identifiers starting with 'W' (withdrawn
    participants) are removed.

    Parameters
    ----------
    gene : str
        Gene symbol used to build the file name.
    dir_path : str
        Directory, relative to `/mnt/project`, that holds the file.

    Returns
    -------
    pandas.DataFrame
        Columns `Chrom`, `Pos`, `Ref`, `Alt`, `Carrier`.
    """
    field_names = ["Chrom", "empty", "Pos", "Ref", "Alt", "na", "Carriers"]
    unused_fields = ['na', 'empty']

    def _is_wanted(name):
        return name not in unused_fields

    def _not_withdrawn(carrier_id):
        return not carrier_id.startswith('W')

    per_variant = pd.read_csv(
        f'/mnt/project/{dir_path}/{gene}.ssv',
        sep=" ",
        names=field_names,
        usecols=_is_wanted,
    )

    # Turn the '|'-delimited carrier string into a list, ignoring outer delimiters.
    per_variant["Carriers"] = per_variant["Carriers"].str.strip("|").str.split("|", expand = False)

    # One row per carrier of each variant.
    per_carrier = per_variant.explode("Carriers").rename(columns={"Carriers": "Carrier"})

    # remove withdrawn
    return per_carrier[per_carrier['Carrier'].apply(_not_withdrawn)]

In [21]:
def get_joined(gene, **paths):
    """Join the variant table and the carrier mapping of one gene by variant.

    Parameters
    ----------
    gene : str
        Gene symbol passed to `get_variants` and `get_carriers`.
    **paths
        Optional `var_path` and/or `car_path`; when present each is forwarded
        as the second positional argument of `get_variants` / `get_carriers`.

    Returns
    -------
    pandas.DataFrame
        One row per (carrier, variant) pair present in both inputs, matched on
        `Chrom`, `Pos`, `Ref` and `Alt`.
    """
    variant_args = (gene, paths["var_path"]) if "var_path" in paths else (gene,)
    carrier_args = (gene, paths["car_path"]) if "car_path" in paths else (gene,)

    variants = get_variants(*variant_args)
    carriers = get_carriers(*carrier_args)

    # Cast the key columns to object dtype before merging.
    variants = variants.astype({'Chrom': 'object', 'Pos': 'object'})
    carriers = carriers.astype({'Pos': 'object'})

    return carriers.merge(variants, how = "inner", on = ["Chrom", "Pos", "Ref", "Alt"])


In [22]:
def get_most_severe(ann_carriers):
    """Keep only the most severe variant carried by each patient.

    Variants are ranked per `Carrier` by, in order:

    1. consequence severity (deleterious > missense > synonymous);
    2. allele frequency `faf_max`, rarest first (missing values last);
    3. a random tie breaker.

    The input frame is not modified.

    Parameters
    ----------
    ann_carriers : pandas.DataFrame
        Patient-variant table with at least the columns `Carrier`,
        `Consequence` and `faf_max`.

    Returns
    -------
    pandas.DataFrame
        One row per carrier, ordered by `Carrier`, with `Consequence` as plain
        objects and without the temporary ranking columns.
    """
    # order variant consequence by general severity (deleterious > missense > synonymous)
    consequence_cats = CategoricalDtype(categories=["synonymous_variant", "missense_variant", "deleterious"], ordered=True)

    # Work on a copy so the caller's frame does not gain helper columns.
    ranked = ann_carriers.copy()

    # Assign the column directly: `.loc[:, col] = ...` writes into the existing
    # object column on recent pandas versions, which silently discards the
    # categorical ordering and makes the sort below alphabetical.
    ranked["Consequence"] = ranked["Consequence"].astype(consequence_cats)

    # choose maximum AF for each variant, where possible
    ranked["max_AF"] = ranked[["faf_max"]].max(axis=1, skipna=True)

    # create a random tie breaker index
    shuffled_index = list(range(len(ranked)))
    shuffle(shuffled_index)
    ranked["tie_breaker"] = shuffled_index

    # Rows without a carrier id cannot be attributed to a patient.
    ranked = ranked.dropna(subset=["Carrier"])

    # within each carrier, sort by consequence > AF > tiebreaker and keep the top variant
    ranked = ranked.sort_values(
        ["Carrier", "Consequence", "max_AF", "tie_breaker"],
        ascending=[True, False, True, True],
    )
    most_severe = ranked.drop_duplicates("Carrier", keep="first")

    # important!! return categorical variables to normal objects so they don't clog pivot memory
    most_severe = most_severe.astype({"Consequence": object})

    return most_severe.drop(['max_AF', 'tie_breaker'], axis = 1)

In [24]:
def patient_var_mappings(gene, most_severe=True, **paths):
    """Build the annotated patient-variant table for one gene.

    Parameters
    ----------
    gene : str
        Gene symbol.
    most_severe : bool
        When true, reduce the table to one row per patient via
        `get_most_severe`; otherwise return every patient-variant pair.
    **paths
        Optional `var_path` / `car_path` values forwarded to `get_joined`.

    Returns
    -------
    pandas.DataFrame
        The joined table, optionally reduced to each patient's most severe variant.
    """
    joined = get_joined(gene, **paths)
    return get_most_severe(joined) if most_severe else joined

In [25]:
def merge_all_characteristics(gene, dir_path="selected_genes/hcm/csv_files"):
    """Combine phenotype data, polygenic risk scores and variant data per patient.

    Reads `cardiomyopathy.csv` and `applied_hcm_prs.csv` from
    `/mnt/project/{dir_path}`, keys both on the `eid` column converted to a
    string `Carrier` id, and outer-joins them with the per-patient variant
    table of `gene`, so patients without a variant are kept.

    Parameters
    ----------
    gene : str
        Gene symbol passed to `patient_var_mappings`.
    dir_path : str
        Directory, relative to `/mnt/project`, that holds the two CSV files.

    Returns
    -------
    pandas.DataFrame
        Indexed by `Carrier`; the variant key columns and the helper `index`
        column are removed.
    """
    prs_csv = pd.read_csv(f'/mnt/project/{dir_path}/applied_hcm_prs.csv')
    phenotypic_info_csv = pd.read_csv(f'/mnt/project/{dir_path}/cardiomyopathy.csv')

    # Replace the `eid` column with a string `Carrier` key and index on it.
    prs_by_carrier = (
        prs_csv
        .assign(Carrier=prs_csv['eid'].astype(str))
        .drop(columns=['eid'])
        .set_index('Carrier')
    )
    phenotypes_by_carrier = (
        phenotypic_info_csv
        .assign(Carrier=phenotypic_info_csv['eid'].astype(str))
        .drop(columns=['eid'])
        .set_index('Carrier')
    )

    variants_by_carrier = patient_var_mappings(gene).reset_index().set_index('Carrier')

    # Outer joins keep every patient that appears in any of the three tables.
    prs_joined = prs_by_carrier.join(variants_by_carrier, how = 'outer')
    all_joined = phenotypes_by_carrier.join(prs_joined, how = 'outer')

    return all_joined.drop(columns=['Pos', 'Ref', 'Alt', 'Chrom', 'index'])
    

In [None]:
genes = [
    "ACTC1", "ACTN2", "ALPK3", "DES",
    "FLNC", "MYBPC3", "MYH6", "MYH7",
    "MYL2", "MYL3", "PLN", "PTPN11",
    "TNNI3", "TNNT2", "TPM1", "TTR",
]
# Write the combined patient table of every gene to `{gene}_with_generated_data.csv`.
for g in genes:
    all_carriers = merge_all_characteristics(g)
    file_name = f'{g}_with_generated_data.csv'
    all_carriers.to_csv(file_name)

In [None]:
# Upload every local *_with_generated_data.csv file to the cox_model_data folder on the platform.
# NOTE(review): mkdir names the "Cassa Lab Shared Project" explicitly, whereas cd and
# upload act on whichever project is currently selected - confirm they are the same.
!dx mkdir -p "Cassa Lab Shared Project:selected_genes/hcm/cox_model_data/"
!dx cd /selected_genes/hcm/cox_model_data/
!dx upload *_with_generated_data.csv