In [None]:
# load packages
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
import random
from random import shuffle

In [30]:
def load_variants(gene, dir_path = "selected_genes/hcm/parsed_vep_files"):
    """" loads csv with annotated variant information """ 
    path = f'/mnt/project/{dir_path}/{gene}.csv'
    variants = pd.read_csv(path, sep = ",")
    
    
    variants = variants[["Chrom", "Pos", "Ref", "Alt", "Name", 'vep_consequence', 'GERP++_RS', 'trv', 'CADD_raw', 'phyloP100way_vertebrate']]
   
    deleterious_list = ["stop_gained", "start_lost", "splice_acceptor_variant", "splice_donor_variant", "splice_donor_region_variant","splice_region_variant","frameshift_variant"]    
    deleterious = variants['vep_consequence'].apply(lambda x: any(substring in x for substring in deleterious_list))
    variants.loc[deleterious, 'vep_consequence'] = 'deleterious'
    
    variants = variants[(variants['vep_consequence'] == 'deleterious') | (variants['vep_consequence'] == 'missense_variant')]

    return variants





genes = ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PTPN11", "TNNI3", "TTR", "PLN"]
for g in genes:
    load_variants(g).to_csv(g+".csv")



In [None]:
def add_gnomAD(parsed_gene):   

    gnomAD_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/gnomAD_v4_variants_hcm.tsv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "Filter", "AC", "AN", "AF", "faf_max", "empty"])
    gnomAD_csv = gnomAD_csv[["Chrom", "Pos", "Ref", "Alt", "Filter", "faf_max",]]
    gene_with_gnomAD = parsed_gene.merge(gnomAD_csv, how='left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])
    gene_with_gnomAD = gene_with_gnomAD[(gene_with_gnomAD['Filter'] == 'PASS')]
    gene_with_gnomAD = gene_with_gnomAD[['Chrom', 'Pos', 'Ref', 'Alt', "Name", "vep_consequence", "faf_max", 'GERP++_RS', 'trv', 'CADD_raw', 'phyloP100way_vertebrate']]
    return gene_with_gnomAD


In [None]:
def add_AI(parsed_gene):
    AI_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.csv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "genome", "uniprot_id", "transcript_id", "protein_variant", "am_pathogenicity", "am_class"])
    gene_with_AI = parsed_gene.merge(AI_csv, how='left', on = ['Chrom', 'Pos', 'Ref', 'Alt'])
    gene_with_AI = gene_with_AI[['Chrom', 'Pos', 'Ref', 'Alt', "Name", "vep_consequence", "faf_max", "am_pathogenicity", 'GERP++_RS', 'trv', 'CADD_raw', 'phyloP100way_vertebrate']]

    return gene_with_AI


In [None]:
def get_variants(gene):
    """ load and clean variants df and fill in missing values """
    variants = load_variants(gene)
    variants = add_gnomAD(variants)
    variants = add_AI(variants)
    
    
    

    
    # define deleterious mutations


    # filter for allele freq
    variants = variants.loc[(variants["faf_max"] <= 0.002) | (variants["faf_max"].isna())]
    

    variants.loc[:, "val"] = 1

    return(variants)




In [None]:
def get_carriers(gene, dir_path="selected_genes/hcm/carrier"):
    """retrieve variant-patient mapping file"""
    
    path = f'/mnt/project/{dir_path}/{gene}.ssv'
    carriers = pd.read_csv(path,
                       sep=" ",
                       names=["Chrom", "empty", "Pos", "Ref", "Alt", "na", "Carriers"],
                       usecols=lambda x: x not in ['na', 'empty'] )
    
    carriers.loc[:, "Carriers"] = carriers["Carriers"].str.strip("|")
    carriers.loc[:, "Carriers"]= carriers["Carriers"].str.split("|", expand = False)
    carriers = carriers.explode("Carriers").rename(columns={"Carriers": "Carrier"})
    carriers = carriers[carriers['Carrier'].apply(lambda x: not x.startswith('W'))] # remove withdrawn
    
    
    return(carriers)

In [None]:
def get_joined(gene, **paths):
    """retrieves both variant and mapping files and joins them together by variant"""
    
    variants = get_variants(gene, paths["var_path"]) if "var_path" in paths else get_variants(gene)
    carriers = get_carriers(gene, paths["car_path"]) if "car_path" in paths else get_carriers(gene)
  
    
    variants = variants.astype({'Chrom': 'object', 'Pos':'object'})
    
    carriers = carriers.astype({'Pos':'object'})
    
    ann_carriers = carriers.merge(variants, on=["Chrom", "Pos", "Ref", "Alt"], how = "inner")  
    
    
    return(ann_carriers)



In [None]:
def get_most_severe(ann_carriers):
    
    """accepts a filtered patient-variant df.
    selects and retains only the most severe variant per patient. returns this df."""
    
 
    
    # order variant consequence by general severity (deleterious > missense)
    consequence_cats = CategoricalDtype(categories=["missense_variant", "deleterious"], ordered=True)
    ann_carriers.loc[:, "vep_consequence"] = ann_carriers["vep_consequence"].astype(consequence_cats)
    
    # choose maximum AF for each variant, where possible
    ann_carriers.loc[:, "max_AF"] = ann_carriers.loc[:, ["faf_max"]].max(axis=1, skipna=True)
     
    # create a random tie breaker index
    shuffled_index = list(range(0,len(ann_carriers)))
    shuffle(shuffled_index)
    ann_carriers.loc[:, "tie_breaker"] = shuffled_index
    
    # group by eid, sort by consequence > CADD > confidence (for del. vars) > AF > tiebreaker
    # select top variant for each group
    ann_carriers_grouped = ann_carriers.groupby("Carrier", group_keys=False).apply(pd.DataFrame.sort_values, ["vep_consequence", "max_AF", "tie_breaker"], ascending=[False, True, True])
    ann_carriers_grouped = ann_carriers_grouped.drop_duplicates("Carrier", keep="first")
    
    # important!! return categorical variables to normal objects so they don't clog pivot memory
    ann_carriers_grouped.loc[:, "vep_consequence"] = ann_carriers_grouped["vep_consequence"].astype(object)
    


    
    return(ann_carriers_grouped)

In [None]:
def reshape_consequence(ann_carriers, rename_id=True):
    """
    accepts filtered and prepped patient-variant df
    returns a reshaped df that separates consequences into unique columns

    """
    ann_carriers = ann_carriers.reset_index(drop = True)
    
    for col in ["am_pathogenicity"]:
        ann_carriers[col] = ann_carriers[col].fillna("temp")
        
    ann_carriers = ann_carriers[["Carrier", "Name", "am_pathogenicity", "vep_consequence", "val", 'GERP++_RS', 'trv', 'CADD_raw', 'phyloP100way_vertebrate']]
    ann_carriers = ann_carriers.pivot_table(index=["Carrier", "Name", "am_pathogenicity", 'GERP++_RS', 'trv', 'CADD_raw', 'phyloP100way_vertebrate'], values="val", columns="vep_consequence", fill_value=0)
    new_names={"Name": "variant_id", "Carrier":"eid"}

        
    # this variable is set to true whenever get_most_severe was run before this function
    if rename_id:
        new_names["Name"] = "most_severe_variant"
        
        
    



    
    return(ann_carriers)






In [None]:
def patient_var_mappings(gene, most_severe=True, **paths):

    """
    accepts gene name and some combination of var_path and car_path directories.
    returns a pandas df with one row per patient detailing their most severe variant
    """
    annotated_carriers = get_joined(gene, **paths)
    
    if most_severe:
        annotated_carriers = get_most_severe(annotated_carriers)
        
  
    return(reshape_consequence(annotated_carriers, rename_id=most_severe))





In [None]:
def merge_all_characteristics(gene):
    
    # read in and clean phenotype, principal components, and polygenic risk score dataframes
    project_path = f'/mnt/project/selected_genes/hcm/csv_files/'
    prs_csv = pd.read_csv(project_path+'applied_hcm_prs.csv')
    principal_components_csv = pd.read_csv(project_path+'principal_components.csv')
    phenotypic_info_csv = pd.read_csv(project_path+'hypertrophic_df.csv')

    prs_csv['Carrier'] = prs_csv['eid'].astype(str)
    prs_csv = prs_csv.drop(['eid'], axis=1)
    
    
    
    
    
    principal_components_csv['Carrier'] = principal_components_csv['eid'].astype(str)
    principal_components_csv = principal_components_csv.drop(['eid',  'principal_component_2','principal_component_3', 'principal_component_5', 'principal_component_6', 'principal_component_7', 'principal_component_8', 'principal_component_9', 'principal_component_10'], axis=1)

    phenotypic_info_csv['Carrier'] = phenotypic_info_csv['eid'].astype(str)  
    phenotypic_info_csv = phenotypic_info_csv.drop(['eid'], axis=1)
    
    
    g = patient_var_mappings(gene)
    g = g.reset_index()
    prs_csv = prs_csv.reset_index()



    # join all dataframes
    prs_joined = prs_csv.merge(g, how = 'left')
    principal_prs_joined = principal_components_csv.merge(prs_joined, how = 'right')
    
    all_joined = phenotypic_info_csv.merge(principal_prs_joined, how = 'right')
    

    # fill in missing values for patients without annotated variants
    all_joined = all_joined[(all_joined['deleterious'] == True) | (all_joined['missense_variant'] == True)]

    all_joined["missense_variant"] = all_joined["missense_variant"].fillna("0")
    all_joined["deleterious"] = all_joined["deleterious"].fillna("0")

    all_joined["is_family_hist"] = np.where(all_joined["is_family_hist"]==True, 1, 0)
    all_joined["is_hcm"] = np.where(all_joined["is_hcm"]==True, 1, 0)
    all_joined = all_joined.drop(["death_age","Unnamed: 0", "birth_date", 'deleterious', 'missense_variant'], axis = 1)


    return all_joined 
    

In [29]:
genes = ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PTPN11", "TNNI3", "TTR", "PLN"]
for g in genes:
    file_name = f'{g}.csv'  
    all_carriers = merge_all_characteristics(g)
   
    all_carriers.to_csv(file_name, index=False)

    

  gnomAD_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/gnomAD_v4_variants_hcm.tsv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "Filter", "AC", "AN", "AF", "faf_max", "empty"])
  AI_csv = pd.read_csv(f'/mnt/project/selected_genes/hcm/csv_files/AlphaMissense_hg38.csv', sep = "\t", names = ["Chrom", "Pos", "Ref", "Alt", "genome", "uniprot_id", "transcript_id", "protein_variant", "am_pathogenicity", "am_class"])


ValueError: cannot set a frame with no defined index and a scalar

In [None]:
!dx mkdir -p "Cassa Lab Shared Project:selected_genes/hcm/lifelines_data/"
!dx cd /selected_genes/hcm/lifelines_data/
!dx upload *.csv

In [23]:
MYBPC3

Unnamed: 0.1,Unnamed: 0,sex,is_family_hist,is_hcm,duration,age,Carrier,principal_component_1,principal_component_4,index,prs_score,Name,am_pathogenicity,GERP++_RS,trv,CADD_raw,phyloP100way_vertebrate,deleterious,missense_variant
300,300.0,0.0,0,0,79.950719,79.950719,1014971,2.25973,-44.25070,1443.0,0.000658,11-47341995-C-T,0.6363,5.13,0.0,4.076783,7.531,0.0,1.0
356,356.0,1.0,0,0,78.000000,78.000000,1017630,57.43850,10.55800,1697.0,-0.013064,11-47338586-C-T,0.0882,3.54,0.0,0.968143,1.537,0.0,1.0
402,402.0,0.0,0,0,74.863792,80.117728,1020263,383.80500,5.21918,1952.0,0.020625,11-47350068-C-T,0.1707,3.43,0.0,1.672304,0.511,0.0,1.0
572,572.0,0.0,0,0,67.370294,67.370294,1028911,-13.13840,1.26250,2791.0,-0.012814,11-47332912-A-G,0.3284,5.25,0.0,3.094806,7.144,0.0,1.0
726,726.0,0.0,1,0,78.450376,78.450376,1036853,-5.87859,-4.38288,3548.0,-0.012519,11-47342016-G-A,0.6316,5.13,0.0,4.282963,2.437,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502079,502079.0,0.0,0,0,77.864476,77.864476,6011707,-13.30810,2.01273,482436.0,0.015772,11-47342734-C-T,0.5758,4.63,0.0,3.708413,7.506,0.0,1.0
502084,502084.0,0.0,0,0,61.779603,61.779603,6011959,-11.68130,-2.81797,482460.0,-0.027074,11-47333208-C-T,0.4444,5.36,0.0,3.801036,7.556,0.0,1.0
502222,502222.0,1.0,0,0,61.535934,61.535934,6018974,-14.20980,2.78787,483140.0,-0.000138,11-47338579-G-A,0.1536,1.05,0.0,1.961707,0.428,0.0,1.0
502299,502299.0,0.0,1,0,72.703628,72.703628,6022063,-12.70350,2.92162,483439.0,0.006107,11-47332912-A-G,0.3284,5.25,0.0,3.094806,7.144,0.0,1.0
