In [1]:
# load packages
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
import random
from random import shuffle

In [None]:
def load_variants(gene, dir_path = "selected_genes/hcm/parsed_vep_files"):
    """" loads csv with annotated variant information """ 
    path = f'/mnt/project/{dir_path}/{gene}.csv'
    variants = pd.read_csv(path, sep = ",")

    variants = variants[["Chrom", "Pos", "Ref", "Alt", "Name", "vep_consequence", "CADD_raw", "GERP++_RS", "phyloP100way_vertebrate", 'trv', 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl']]

    return variants


In [None]:
def define_comp_predictors(gene, df, col):
    """ groups existing computation predictor scores (CADD_raw, phyloP100way_vertebrate, and GERP++_RS) by functional consequences 
    and assigns them to variants based on this consequence. """

    col_means_grouped = df.groupby(by='vep_consequence')[col].mean()    


    for i in df.index:
        if pd.isnull(df[col][i]):
            if (pd.isnull(col_means_grouped[df['vep_consequence'][i]])) & (df['vep_consequence'][i] == 'synonymous_variant'):
               # what to do if no mean computational value exists for a Synonymous or Intronic consequence
                df.loc[[i],[col]] = comp_predictors_extreme_val(gene, df, col)

            else:
                df.loc[[i],[col]] = col_means_grouped[df['vep_consequence'][i]]
                
    return df

In [None]:
def comp_predictors_extreme_val(gene, df, col):
    """ finds either lowest values or highest in a column for a computational score. returns this value """
                    
    variants = df
        
    
    if col in ['CADD_raw', 'am_pathogenicity', 'REVEL']:
        val = min(variants[variants[col].notna()][col])
    elif (col == 'phyloP100way_vertebrate') | (col == 'GERP++_RS'):
        val = max(variants[variants[col].notna()][col])
    
    return val
                  
        

In [None]:
def define_trv(gene,df,col):
    """ fills in trv values where they are missing, in accordance with their distribution in variants df"""

    variants = load_variants(gene)


    
    distribution = variants[variants[col].notna()][col].tolist()
    empty_list = []

    
    if (distribution == empty_list):
        df = df.drop([col], axis = 1)
        return df
        

    for i in df.index:
        if pd.isnull(df[col][i]):
            df[col][i] = random.choice(distribution)
   

    return df
                 

In [None]:
def add_gnomAD(parsed_gene):   

    gnomAD_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/gnomAD_v4_variants_hcm.tsv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "Filter", "AC", "AN", "AF", "faf_max", "empty"])
    gene_with_gnomAD = parsed_gene.merge(gnomAD_csv, how='left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])
    gene_with_gnomAD = gene_with_gnomAD[(gene_with_gnomAD['Filter'] == 'PASS')]
    gene_with_gnomAD = gene_with_gnomAD[['Chrom', 'Pos', 'Ref', 'Alt', "Name", "vep_consequence", "CADD_raw", "GERP++_RS", "phyloP100way_vertebrate", "trv", "faf_max", 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl']]

    return gene_with_gnomAD


In [None]:
def add_AI(parsed_gene):
    AI_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.csv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "genome", "uniprot_id", "transcript_id", "protein_variant", "am_pathogenicity", "am_class"])
    gene_with_AI = parsed_gene.merge(AI_csv, how='left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])
    gene_with_AI = gene_with_AI[['Chrom', 'Pos', 'Ref', 'Alt', "Name", "vep_consequence", "CADD_raw", "GERP++_RS", "phyloP100way_vertebrate", "trv", "faf_max", "am_pathogenicity", 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl']]

    return gene_with_AI


In [None]:
def get_variants(gene):
    """ load and clean variants df and fill in missing values """
    variants = load_variants(gene)
    variants = add_gnomAD(variants)
    variants = add_AI(variants)
    

    
    # define deleterious mutations
    deleterious_list = ["stop_gained", "start_lost", "splice_acceptor_variant", "splice_donor_variant", "splice_donor_region_variant","splice_region_variant","frameshift_variant"]    
    deleterious = variants['vep_consequence'].apply(lambda x: any(substring in x for substring in deleterious_list))
    variants.loc[deleterious, 'vep_consequence'] = 'deleterious'
    

    # filter for allele freq
    variants = variants.loc[(variants["faf_max"] <= 0.002) | (variants["faf_max"].isna())]
    

    variants.loc[:, "val"] = 1

    variants = define_comp_predictors(gene, variants, 'CADD_raw')
    variants = define_comp_predictors(gene, variants, 'phyloP100way_vertebrate')
    variants = define_comp_predictors(gene, variants, 'GERP++_RS') 
    variants = define_comp_predictors(gene, variants, 'faf_max') 
    variants = define_comp_predictors(gene, variants, 'am_pathogenicity')     


    return(variants)






In [None]:
def get_carriers(gene, dir_path="selected_genes/hcm/carrier"):
    """retrieve variant-patient mapping file"""
    
    path = f'/mnt/project/{dir_path}/{gene}.ssv'
    carriers = pd.read_csv(path,
                       sep=" ",
                       names=["Chrom", "empty", "Pos", "Ref", "Alt", "na", "Carriers"],
                       usecols=lambda x: x not in ['na', 'empty'] )
    
    carriers.loc[:, "Carriers"] = carriers["Carriers"].str.strip("|")
    carriers.loc[:, "Carriers"]= carriers["Carriers"].str.split("|", expand = False)
    carriers = carriers.explode("Carriers").rename(columns={"Carriers": "Carrier"})
    carriers = carriers[carriers['Carrier'].apply(lambda x: not x.startswith('W'))] # remove withdrawn
    
    
    return(carriers)





In [None]:
def get_joined(gene, **paths):
    """retrieves both variant and mapping files and joins them together by variant"""
    
    variants = get_variants(gene, paths["var_path"]) if "var_path" in paths else get_variants(gene)
    carriers = get_carriers(gene, paths["car_path"]) if "car_path" in paths else get_carriers(gene)
    
  
    
    variants = variants.astype({'Chrom': 'object', 'Pos':'object'})
    
    carriers = carriers.astype({'Pos':'object'})
    
    ann_carriers = carriers.merge(variants, on=["Chrom", "Pos", "Ref", "Alt"], how = "inner")  
    
    
    return(ann_carriers)


In [None]:
def get_most_severe(ann_carriers):
    
    """accepts a filtered patient-variant df.
    selects and retains only the most severe variant per patient. returns this df."""
    
 
    
    # order variant consequence by general severity (deleterious > missense > synonymous)
    consequence_cats = CategoricalDtype(categories=["synonymous_variant", "missense_variant", "deleterious"], ordered=True)
    ann_carriers.loc[:, "vep_consequence"] = ann_carriers["vep_consequence"].astype(consequence_cats)
    
    # choose maximum AF for each variant, where possible
    ann_carriers.loc[:, "max_AF"] = ann_carriers.loc[:, ["faf_max"]].max(axis=1, skipna=True)
     
    # create a random tie breaker index
    shuffled_index = list(range(0,len(ann_carriers)))
    shuffle(shuffled_index)
    ann_carriers.loc[:, "tie_breaker"] = shuffled_index
    
    # group by eid, sort by consequence > CADD > confidence (for del. vars) > AF > tiebreaker
    # select top variant for each group
    ann_carriers_grouped = ann_carriers.groupby("Carrier", group_keys=False).apply(pd.DataFrame.sort_values, ["vep_consequence", "CADD_raw", "max_AF", "tie_breaker"], ascending=[False, False, True, True])
    ann_carriers_grouped = ann_carriers_grouped.drop_duplicates("Carrier", keep="first")
    
    # important!! return categorical variables to normal objects so they don't clog pivot memory
    ann_carriers_grouped.loc[:, "vep_consequence"] = ann_carriers_grouped["vep_consequence"].astype(object)
    


    
    return(ann_carriers_grouped)

In [None]:
def reshape_consequence(ann_carriers, rename_id=True):
    """
    accepts filtered and prepped patient-variant df
    returns a reshaped df that separates consequences into unique columns

    """
    ann_carriers = ann_carriers.reset_index(drop = True)
    
    for col in ['CADD_raw', 'phyloP100way_vertebrate',  "GERP++_RS", "trv",'ds_ag', 'ds_al', 'ds_dg', 'ds_dl', "am_pathogenicity"]:
        ann_carriers[col] = ann_carriers[col].fillna("temp")
        
    ann_carriers = ann_carriers[["Carrier", "Name", "CADD_raw", "phyloP100way_vertebrate", "GERP++_RS", "trv", 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl', "am_pathogenicity", "vep_consequence", "val"]]

    
    
    ann_carriers = ann_carriers.pivot_table(index=["Carrier", "Name", "CADD_raw", "phyloP100way_vertebrate", "GERP++_RS", "trv", 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl', "am_pathogenicity"], values="val", columns="vep_consequence", fill_value=0)
    ann_carriers.to_csv("ann_carriers_end.csv")

    
    new_names={"Name": "variant_id", "Carrier":"eid"}
    
    ann_carriers = ann_carriers.reset_index()

    for col in ['CADD_raw', 'phyloP100way_vertebrate',  "GERP++_RS", "trv", 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl', "am_pathogenicity"]:
        ann_carriers[col] = ann_carriers[col].replace('temp', None)




    # this variable is set to true whenever get_most_severe was run before this function
    if rename_id:
        new_names["Name"] = "most_severe_variant"
    



    
    return(ann_carriers)




In [None]:
def patient_var_mappings(gene, most_severe=True, **paths):

    """
    accepts gene name and some combination of var_path and car_path directories.
    returns a pandas df with one row per patient detailing their most severe variant
    """
    annotated_carriers = get_joined(gene, **paths)
    
    if most_severe:
        annotated_carriers = get_most_severe(annotated_carriers)
        
  
    return(reshape_consequence(annotated_carriers, rename_id=most_severe))



In [None]:
def merge_all_characteristics(gene):
    
    # read in and clean phenotype, principal components, and polygenic risk score dataframes
    project_path = f'/mnt/project/selected_genes/hcm/csv_files/'
    prs_csv = pd.read_csv(project_path+'applied_hcm_prs.csv')
    principal_components_csv = pd.read_csv(project_path+'principal_components.csv')
    phenotypic_info_csv = pd.read_csv(project_path+'hypertrophic_df.csv')

    prs_csv['Carrier'] = prs_csv['eid'].astype(str)
    prs_csv = prs_csv.drop(['eid'], axis=1)
    principal_components_csv['Carrier'] = principal_components_csv['eid'].astype(str)
    principal_components_csv = principal_components_csv.drop(['eid'], axis=1)

    phenotypic_info_csv['Carrier'] = phenotypic_info_csv['eid'].astype(str)  
    phenotypic_info_csv = phenotypic_info_csv.drop(['eid'], axis=1)
    
    
    g = patient_var_mappings(gene).reset_index()


    # join all dataframes
    prs_joined = prs_csv.set_index('Carrier').join(g.set_index('Carrier'), how = 'left')
    principal_prs_joined = principal_components_csv.set_index('Carrier').join(prs_joined, how = 'outer')
    
    all_joined = phenotypic_info_csv.set_index('Carrier').join(principal_prs_joined, how = 'outer')
    

    # fill in missing values for patients without annotated variants
    if 'trv' in all_joined.columns:
        all_joined = define_trv(gene,all_joined,"trv")
    all_joined["synonymous_variant"] = all_joined["synonymous_variant"].fillna("0")
    all_joined["missense_variant"] = all_joined["missense_variant"].fillna("0")
    all_joined["deleterious"] = all_joined["deleterious"].fillna("0")
    all_joined["ds_ag"] = all_joined["ds_ag"].fillna("0")
    all_joined["ds_al"] = all_joined["ds_al"].fillna("0")
    all_joined["ds_dg"] = all_joined["ds_dg"].fillna("0")
    all_joined["ds_dl"] = all_joined["ds_dl"].fillna("0")
    all_joined["CADD_raw"] = all_joined["CADD_raw"].fillna(comp_predictors_extreme_val(gene, all_joined, "CADD_raw"))
    all_joined["GERP++_RS"] = all_joined["GERP++_RS"].fillna(comp_predictors_extreme_val(gene, all_joined, "GERP++_RS"))
    all_joined["phyloP100way_vertebrate"] = all_joined["phyloP100way_vertebrate"].fillna(comp_predictors_extreme_val(gene, all_joined, "phyloP100way_vertebrate"))
    all_joined["am_pathogenicity"] = all_joined["am_pathogenicity"].fillna(comp_predictors_extreme_val(gene, all_joined, "am_pathogenicity"))

    all_joined = all_joined.drop(['Unnamed: 0', 'birth_date', 'death_age'], axis = 1)
    all_joined["is_family_hist"] = np.where(all_joined["is_family_hist"]==True, 1, 0)
    all_joined["is_hcm"] = np.where(all_joined["is_hcm"]==True, 1, 0)


    
    # hacks
    all_joined = all_joined[all_joined['duration'] > 20]


    return all_joined 
    

In [61]:
genes = ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR"]

for g in genes:
    file_name = f'{g}_gnomad_AI_data.csv'  
    all_carriers = merge_all_characteristics(g)
   
    all_carriers.to_csv(file_name, index=False)

    

  gnomAD_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/gnomAD_v4_variants_hcm.tsv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "Filter", "AC", "AN", "AF", "faf_max", "empty"])
  AI_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.csv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "genome", "uniprot_id", "transcript_id", "protein_variant", "am_pathogenicity", "am_class"])
  ann_carriers.loc[:, "vep_consequence"] = ann_carriers["vep_consequence"].astype(consequence_cats)
  ann_carriers_grouped.loc[:, "vep_consequence"] = ann_carriers_grouped["vep_consequence"].astype(object)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][i] = random.choice(distribution)


KeyError: 'synonymous_variant'

In [None]:
!dx mkdir -p "Cassa Lab Shared Project:selected_genes/hcm/lifelines_data/"
!dx cd /selected_genes/hcm/lifelines_data/
!dx upload *.csv