In [14]:
import os
import pandas as pd
import numpy as np
import json

In [15]:
def preprocess_dbNSFP(dbNSFP_data, gene, transcript):
    map_class_dict = {"unknown" : 'VUS',
                      "Conflicting_interpretations_of_pathogenicity":'VUS',
                      "not_provided" : "VUS",
                      "not_provided": "VUS",
                      "Uncertain_significance" : "VUS",
                      "Pathogenic": "Abnormal",
                      "Likely_benign": "Normal",
                      "Benign": "Normal",
                      "Likely_pathogenic": "Abnormal",
                      "Pathogenic/Likely_pathogenic": "Abnormal",
                      "Benign/Likely_benign" : "Normal"}
    dbNSFP_df = pd.read_csv(dbNSFP_data, sep='\t')
    required_columns = ['aaref', 'aaalt', 'aapos', 'genename', 'Ensembl_transcriptid',
                        'clinvar_clnsig' ]

    is_rankscore_column = [column for column in dbNSFP_df.columns if column.endswith('rankscore')]
    required_columns += is_rankscore_column

    dbNSFP_req_df = dbNSFP_df[required_columns]
    dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].str.split(';')
    dbNSFP_req_df['Ensembl_transcriptid'] = dbNSFP_req_df['Ensembl_transcriptid'].str.split(';')
    dbNSFP_req_df['genename'] = dbNSFP_req_df['genename'].str.split(';')
    columns_to_explode = [ 'aapos','Ensembl_transcriptid', 'genename']
    dbNSFP_req_df = dbNSFP_req_df.apply(lambda x: x.explode() if x.name in columns_to_explode else x)
    dbNSFP_req_df = dbNSFP_req_df[(dbNSFP_req_df['genename'] == gene) &
                                  (dbNSFP_req_df['Ensembl_transcriptid'] == transcript)]

    dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].astype(int)
    dbNSFP_req_df = dbNSFP_req_df[dbNSFP_req_df['aapos'] >=0]
    dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].astype(str)
    dbNSFP_req_df['mutations'] = dbNSFP_req_df['aaref'] + dbNSFP_req_df['aapos'] + dbNSFP_req_df['aaalt']
    
    dbNSFP_req_df = dbNSFP_req_df.replace(".", np.nan)

    columns_to_fillna = is_rankscore_column
    dbNSFP_req_df[columns_to_fillna] = dbNSFP_req_df[columns_to_fillna].fillna(0)

    dbNSFP_req_df['clinvar_clnsig'] = dbNSFP_req_df['clinvar_clnsig'].fillna('unknown')
    dbNSFP_req_df['ClinVar_Class'] = dbNSFP_req_df['clinvar_clnsig'].map(map_class_dict)
    dbNSFP_req_df = dbNSFP_req_df.drop_duplicates().reset_index(drop=True)
    dbNSFP_req_df = dbNSFP_req_df[['mutations', 'ClinVar_Class']]
    dbNSFP_req_df = dbNSFP_req_df[(dbNSFP_req_df['ClinVar_Class'] == "Abnormal") | (dbNSFP_req_df['ClinVar_Class'] == "Normal")]
    dbNSFP_req_df = dbNSFP_req_df.drop_duplicates()
    dbNSFP_req_df = dbNSFP_req_df.reset_index(drop=True)
    return dbNSFP_req_df

In [16]:
dbNSFP_df = pd.read_csv('../dbNSFP/dbNSFP4.8a_variant.Aim2.tsv', sep='\t')
required_columns = ['aaref', 'aaalt', 'aapos', 'genename', 'Ensembl_transcriptid',
                        'clinvar_clnsig' ]

is_rankscore_column = [column for column in dbNSFP_df.columns if column.endswith('rankscore')]
required_columns += is_rankscore_column
dbNSFP_req_df = dbNSFP_df[required_columns]
dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].str.split(';')
dbNSFP_req_df['Ensembl_transcriptid'] = dbNSFP_req_df['Ensembl_transcriptid'].str.split(';')
dbNSFP_req_df['genename'] = dbNSFP_req_df['genename'].str.split(';')
columns_to_explode = [ 'aapos','Ensembl_transcriptid', 'genename']
dbNSFP_req_df = dbNSFP_req_df.apply(lambda x: x.explode() if x.name in columns_to_explode else x)
#dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].astype(int)
dbNSFP_req_df = dbNSFP_req_df[~dbNSFP_req_df['aapos'].isna()]
dbNSFP_req_df = dbNSFP_req_df[(dbNSFP_req_df['aaref'] != 'X') | (dbNSFP_req_df['aaalt'] != 'X')]
dbNSFP_req_df
#dbNSFP_req_df = dbNSFP_req_df[(dbNSFP_req_df['genename'] == gene) &
#                                  (dbNSFP_req_df['Ensembl_transcriptid'] == transcript)]

#dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].astype(int)
#dbNSFP_req_df = dbNSFP_req_df[dbNSFP_req_df['aapos'] >=0]
#dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].astype(str)
#dbNSFP_req_df['mutations'] = dbNSFP_req_df['aaref'] + dbNSFP_req_df['aapos'] + dbNSFP_req_df['aaalt']    
#dbNSFP_req_df = dbNSFP_req_df.replace(".", np.nan)
#columns_to_fillna = is_rankscore_column

  dbNSFP_df = pd.read_csv('../dbNSFP/dbNSFP4.8a_variant.Aim2.tsv', sep='\t')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dbNSFP_req_df['aapos'] = dbNSFP_req_df['aapos'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dbNSFP_req_df['Ensembl_transcriptid'] = dbNSFP_req_df['Ensembl_transcriptid'].str.split(';')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

Unnamed: 0,aaref,aaalt,aapos,genename,Ensembl_transcriptid,clinvar_clnsig,SIFT_converted_rankscore,SIFT4G_converted_rankscore,Polyphen2_HDIV_rankscore,Polyphen2_HVAR_rankscore,...,GERP++_RS_rankscore,GERP_91_mammals_rankscore,phyloP100way_vertebrate_rankscore,phyloP470way_mammalian_rankscore,phyloP17way_primate_rankscore,phastCons100way_vertebrate_rankscore,phastCons470way_mammalian_rankscore,phastCons17way_primate_rankscore,SiPhy_29way_logOdds_rankscore,bStatistic_converted_rankscore
0,M,L,1,ABCD1,ENST00000218104,.,0.35349,0.28958,0.19712,0.12133,...,0.69188,.,0.30848,.,0.86732,0.71638,0.68203,0.79791,0.50529,0.97548
1,M,V,1,ABCD1,ENST00000218104,Pathogenic,0.49117,0.43531,0.19712,0.14941,...,0.69188,.,0.30848,.,0.86732,0.71638,0.68203,0.79791,0.50529,0.97548
2,M,L,1,ABCD1,ENST00000218104,.,0.35349,0.28958,0.19712,0.12133,...,0.69188,.,0.30848,.,0.86732,0.71638,0.68203,0.79791,0.50529,0.97548
3,M,K,1,ABCD1,ENST00000218104,.,0.72154,0.69154,0.39217,0.33681,...,0.69188,.,0.53,.,0.54293,0.71638,0.68203,0.76049,0.50529,0.97548
4,M,T,1,ABCD1,ENST00000218104,.,0.65419,0.62352,0.35387,0.28123,...,0.69188,.,0.53,.,0.54293,0.71638,0.68203,0.76049,0.50529,0.97548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450555,I,N,36,YY1,ENST00000262238,.,0.29420,0.83351,0.32022,0.32088,...,0.21832,0.18641,0.52699,0.48277,0.05231,0.71638,0.68203,0.32748,0.21596,0.04745
450556,I,T,36,YY1,ENST00000262238,.,0.59928,0.72224,0.07471,0.06944,...,0.21832,0.18641,0.52699,0.48277,0.05231,0.71638,0.68203,0.32748,0.21596,0.04745
450557,I,S,36,YY1,ENST00000262238,.,0.72154,0.79402,0.22494,0.20792,...,0.21832,0.18641,0.52699,0.48277,0.05231,0.71638,0.68203,0.32748,0.21596,0.04745
450558,I,M,36,YY1,ENST00000262238,.,0.27426,0.27503,0.37037,0.39164,...,0.212,0.05632,0.34411,0.2505,0.20698,0.71638,0.68203,0.33024,0.04039,0.04745


In [17]:
gene_list= ['CACNA1S','KDM5B','LMNA','PCSK9','CDKN1C','TNNI2','YAP1','DDX23','HNRNPA1',
            'KRT5','TRPV4','BRCA2','PDX1','HIF1A','YY1','SMAD3','PALB2','BRCA1','CD79B',
            'EZH1','PNPO','RAD51C','TP53','NPC1','LDLR','TNNI3','CPS1','GLS','NFE2L2',
            'SCN2A','SCN9A','GINS1','PRNP','APP','CHEK2','CTNNB1','MCM2','STAG1','TRNT1',
            'FGA','NSD2','PPP3CA','APC','B4GALT7','PRLR','WASF1','BRAF','EZH2','SMO','TRPV5',
            'TRPV6','ABL1','DNM1','NOTCH1','SECISBP2','ABCD1','EMD','F9','FLNA','MECP2','NLGN3','WAS']
dbNSFP_req_gene_df = dbNSFP_req_df[dbNSFP_req_df['genename'].isin(gene_list)]


In [18]:
mart_df = pd.read_csv('../gene2refseq/mart_export.ensemble_refseq.mapping.tsv', sep='\t')

gene_enst_df = dbNSFP_req_gene_df[['genename', 'Ensembl_transcriptid']]
gene_enst_df = gene_enst_df.drop_duplicates()
gene_enst_df = gene_enst_df.reset_index(drop=True)
gene_mart_df = pd.merge(gene_enst_df, mart_df, left_on = 'Ensembl_transcriptid', right_on = 'Transcript_stable_ID', how='left')
gene_df = gene_mart_df[['genename']].drop_duplicates()
gene_df =gene_df.reset_index(drop=True)

gene_transcript_dict = {'NOTCH1' : 'ENST00000277541'}
def filter_genes_with_mane(df, gene_df):
    grouped = df.groupby('genename')
    filtered = grouped.apply(lambda g: g if g['MANE_Select'].notna().any() else g[g['MANE_Select'].isna()].head(1))
    filtered = filtered[['genename', 'Ensembl_transcriptid', 'MANE_Select']]
    filtered = filtered.dropna()
    filtered = filtered.drop_duplicates()
    filtered = filtered.reset_index(drop=True)
    out_df =  pd.merge(gene_df, filtered, on='genename', how='left')
    return out_df


filtered_df = filter_genes_with_mane(gene_mart_df, gene_df)

filtered_df = filtered_df.fillna('ENST00000277541')
enst_list = filtered_df.Ensembl_transcriptid.tolist()


In [19]:
dbNSFP_req_gene_enst_df = dbNSFP_req_gene_df[dbNSFP_req_gene_df['Ensembl_transcriptid'].isin(enst_list)]
dbNSFP_req_gene_enst_df['mutations'] = dbNSFP_req_gene_enst_df['aaref'] + dbNSFP_req_gene_enst_df['aapos'] + dbNSFP_req_gene_enst_df['aaalt']
dbNSFP_req_gene_enst_df['key'] = dbNSFP_req_gene_enst_df['mutations'] + "_" + dbNSFP_req_gene_enst_df['genename']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dbNSFP_req_gene_enst_df['mutations'] = dbNSFP_req_gene_enst_df['aaref'] + dbNSFP_req_gene_enst_df['aapos'] + dbNSFP_req_gene_enst_df['aaalt']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dbNSFP_req_gene_enst_df['key'] = dbNSFP_req_gene_enst_df['mutations'] + "_" + dbNSFP_req_gene_enst_df['genename']


In [20]:
disorder_data_df = pd.read_csv("../Disorder_deleterious_neutral_results.tsv", sep='\t')
disorder_data_df['regions'] = 'Disorder'
disorder_data_df['CAVA_PROTPOS']  = disorder_data_df['CAVA_PROTPOS'].astype(str)
disorder_data_df['mutations'] = disorder_data_df['CAVA_PROTREF'] + disorder_data_df['CAVA_PROTPOS'] + disorder_data_df['CAVA_PROTALT']
disorder_data_df['key'] = disorder_data_df['mutations'] + "_" +  disorder_data_df['CAVA_GENE']

In [21]:
disorder_data_dbnsfp_df = pd.merge(disorder_data_df, dbNSFP_req_gene_enst_df, on='key', how='left')
keep_cols = ['mutations_x', 'GENES', 'Ensembl_transcriptid', 'Class', 'CLNSIG', 
             'SIFT_converted_rankscore',
       'SIFT4G_converted_rankscore', 'Polyphen2_HDIV_rankscore',
       'Polyphen2_HVAR_rankscore', 'LRT_converted_rankscore',
       'MutationTaster_converted_rankscore', 'MutationAssessor_rankscore',
       'FATHMM_converted_rankscore', 'PROVEAN_converted_rankscore',
       'VEST4_rankscore', 'MetaSVM_rankscore', 'MetaLR_rankscore',
       'MetaRNN_rankscore', 'M-CAP_rankscore', 'REVEL_rankscore',
       'MutPred_rankscore', 'MVP_rankscore', 'gMVP_rankscore', 'MPC_rankscore',
       'PrimateAI_rankscore', 'DEOGEN2_rankscore', 'BayesDel_addAF_rankscore',
       'BayesDel_noAF_rankscore', 'ClinPred_rankscore', 'LIST-S2_rankscore',
       'VARITY_R_rankscore', 'VARITY_ER_rankscore', 'VARITY_R_LOO_rankscore',
       'VARITY_ER_LOO_rankscore', 'ESM1b_rankscore', 'EVE_rankscore',
       'AlphaMissense_rankscore', 'PHACTboost_rankscore',
       'MutFormer_rankscore', 'CADD_raw_rankscore', 'DANN_rankscore',
       'fathmm-MKL_coding_rankscore', 'fathmm-XF_coding_rankscore',
       'Eigen-raw_coding_rankscore', 'Eigen-PC-raw_coding_rankscore',
       'GenoCanyon_rankscore', 'integrated_fitCons_rankscore',
       'GM12878_fitCons_rankscore', 'H1-hESC_fitCons_rankscore',
       'HUVEC_fitCons_rankscore', 'LINSIGHT_rankscore', 'GERP++_RS_rankscore',
       'GERP_91_mammals_rankscore', 'phyloP100way_vertebrate_rankscore',
       'phyloP470way_mammalian_rankscore', 'phyloP17way_primate_rankscore',
       'phastCons100way_vertebrate_rankscore',
       'phastCons470way_mammalian_rankscore',
       'phastCons17way_primate_rankscore', 'SiPhy_29way_logOdds_rankscore',
       'bStatistic_converted_rankscore']
disorder_data_dbnsfp_df = disorder_data_dbnsfp_df[keep_cols]
disorder_data_dbnsfp_df = disorder_data_dbnsfp_df.drop_duplicates()
disorder_data_dbnsfp_df = disorder_data_dbnsfp_df.reset_index(drop=True)
mutation_counts = disorder_data_dbnsfp_df['mutations_x'].value_counts()

# Filter to get mutations with counts greater than 1
mutations_gt_1 = mutation_counts[mutation_counts > 1].index.tolist()
mutations_unique = mutation_counts[mutation_counts == 1].index.tolist()
disorder_data_dbnsfp_filtered_df = disorder_data_dbnsfp_df[disorder_data_dbnsfp_df['mutations_x'].isin(mutations_unique)]
dup_mutations_df = disorder_data_dbnsfp_df[disorder_data_dbnsfp_df['mutations_x'].isin(mutations_gt_1)]
first_occurrences_df = dup_mutations_df.drop_duplicates(subset='mutations_x', keep='first')
disorder_data_dbnsfp_filtered_df = pd.concat([disorder_data_dbnsfp_filtered_df,first_occurrences_df], axis=0)
disorder_data_dbnsfp_filtered_df = disorder_data_dbnsfp_filtered_df.drop_duplicates()
disorder_data_dbnsfp_filtered_df = disorder_data_dbnsfp_filtered_df.reset_index(drop=True)


In [22]:
disorder_data_dbnsfp_filtered_df.to_csv("disorder_results_with_dbNSFP.tsv", sep='\t', index=False)

In [44]:
def create_class_label(x):
    del_class = ['Pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance', 'Pathogenic|risk_factor']
    if x in del_class:
        return 'Deleterious'
    else:
        return "Neutral"
clinvar_ordered_df = pd.read_csv('../ClinVar/clinvar.snp.header.chr.pfam.aim2genes.pathogenic_benign.cava.tsv', sep='\t')
#print(clinvar_ordered_df.columns)
drop_colmns = ['DBVARID', 'GENEINFO', 'MC',
       'ONC', 'ONCCONF', 'ONCDISDB', 'ONCDISDBINCL', 'ONCDN', 'ONCDNINCL',
       'ONCINCL', 'ONCREVSTAT', 'ORIGIN', 'RS', 'SCI', 'SCIDISDB',
       'SCIDISDBINCL', 'SCIDN', 'SCIDNINCL', 'SCIINCL', 'SCIREVSTAT', 
               'CLNDN', 'CLNDNINCL', 'CLNDNINCL', 'CLNHGVS', 'CLNVCSO', 'CLNVI',
              'QUAL', 'FILTER', 'AF_ESP', 'SCIDISDBINCL', 'SCIDN', 'SCIDNINCL', 'SCIINCL', 'SCIREVSTAT',
       'AF_EXAC', 'AF_TGP', 'ALLELEID', 'CLNSIGCONF',
       'CLNSIGINCL', 'CLNVC', 'CLNVCSO', 'CLNVI', 'DBVARID', 'GENEINFO', 'MC',
       'ONC', 'ONCCONF', 'ONCDISDB', 'ONCDISDBINCL', 'ONCDN', 'ONCDNINCL',
       'ONCINCL', 'ONCREVSTAT', 'ORIGIN', 'RS', 'SCI', 'SCIDISDB',
       'SCIDISDBINCL', 'SCIDN', 'SCIDNINCL', 'SCIINCL', 'SCIREVSTAT', 'CAVA_ALTANN', 'CAVA_GENEID',
               'CAVA_ALTCLASS', 'CAVA_ALTFLAG', 'CLNDISDBINCL', 'CAVA_DBSNP', 'CAVA_ALTSO', 'CLNDISDB', 'CAVA_TRINFO']
clinvar_ordered_df = clinvar_ordered_df.drop(drop_colmns, axis=1)
clinvar_ordered_df = clinvar_ordered_df[clinvar_ordered_df['CAVA_SO'] == 'missense_variant']
clinvar_ordered_df = clinvar_ordered_df.drop_duplicates()
clinvar_ordered_df = clinvar_ordered_df.reset_index(drop=True)
clinvar_ordered_df['Class' ] = clinvar_ordered_df['CLNSIG'].apply(lambda x: create_class_label(x))
clinvar_ordered_df['mutation'] = clinvar_ordered_df['CAVA_PROTREF'] + clinvar_ordered_df['CAVA_PROTPOS'] + clinvar_ordered_df['CAVA_PROTALT']
clinvar_ordered_df['key'] = clinvar_ordered_df['mutation'] + '_' + clinvar_ordered_df['CAVA_GENE']


  clinvar_ordered_df = pd.read_csv('../ClinVar/clinvar.snp.header.chr.pfam.aim2genes.pathogenic_benign.cava.tsv', sep='\t')


Unnamed: 0,#CHROM,POS,ID,REF,ALT,CAVA_CLASS,CAVA_CSN,CAVA_GENE,CAVA_IMPACT,CAVA_LOC,...,CAVA_PROTPOS,CAVA_PROTREF,CAVA_SO,CAVA_TRANSCRIPT,CAVA_TYPE,CLNREVSTAT,CLNSIG,Class,mutation,key
0,chr1,55509556,440712,A,C,NSY,c.248A>C_p.Lys83Thr,PCSK9,2,Ex2,...,83,K,missense_variant,NM_174936.4,Substitution,no_assertion_criteria_provided,Pathogenic,Deleterious,K83T,K83T_PCSK9
1,chr1,55509631,440715,T,C,NSY,c.323T>C_p.Leu108Pro,PCSK9,2,Ex2,...,108,L,missense_variant,NM_174936.4,Substitution,no_assertion_criteria_provided,Pathogenic,Deleterious,L108P,L108P_PCSK9
2,chr1,55518073,2874,T,C,NSY,c.646T>C_p.Phe216Leu,PCSK9,2,Ex4,...,216,F,missense_variant,NM_174936.4,Substitution,no_assertion_criteria_provided,Pathogenic,Deleterious,F216L,F216L_PCSK9
3,chr1,55523127,265939,G,C,NSY,c.1120G>C_p.Asp374His,PCSK9,2,Ex7,...,374,D,missense_variant,NM_174936.4,Substitution,"criteria_provided,_multiple_submitters,_no_con...",Pathogenic,Deleterious,D374H,D374H_PCSK9
4,chr1,55523127,2875,G,T,NSY,c.1120G>T_p.Asp374Tyr,PCSK9,2,Ex7,...,374,D,missense_variant,NM_174936.4,Substitution,"criteria_provided,_multiple_submitters,_no_con...",Pathogenic/Likely_pathogenic,Deleterious,D374Y,D374Y_PCSK9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,chrX,153594724,1429176,T,C,NSY,c.1180A>G_p.Ser394Gly,FLNA,2,Ex8,...,394,S,missense_variant,NM_001110556.2,Substitution,"criteria_provided,_single_submitter",Benign,Neutral,S394G,S394G_FLNA
1736,chrX,153594965,282366,C,T,NSY,c.1030G>A_p.Val344Ile,FLNA,2,Ex7,...,344,V,missense_variant,NM_001110556.2,Substitution,"criteria_provided,_multiple_submitters,_no_con...",Benign/Likely_benign,Neutral,V344I,V344I_FLNA
1737,chrX,153594986,1003574,C,T,NSY,c.1009G>A_p.Asp337Asn,FLNA,2,Ex7,...,337,D,missense_variant,NM_001110556.2,Substitution,"criteria_provided,_single_submitter",Benign,Neutral,D337N,D337N_FLNA
1738,chrX,153595129,701764,C,T,NSY,c.958G>A_p.Val320Met,FLNA,2,Ex6,...,320,V,missense_variant,NM_001110556.2,Substitution,"criteria_provided,_multiple_submitters,_no_con...",Benign/Likely_benign,Neutral,V320M,V320M_FLNA


In [45]:
clinvar_ordered_dbNSFP_df = pd.merge(clinvar_ordered_df, dbNSFP_req_gene_enst_df, on='key', how='left')


In [50]:
clinvar_ordered_dbNSFP_df = clinvar_ordered_dbNSFP_df[~clinvar_ordered_dbNSFP_df['phyloP470way_mammalian_rankscore'].isna()]
clinvar_ordered_dbNSFP_df.to_csv("ordered_regions_results_with_dbNSFP.tsv", sep='\t', index=False)