In [1]:
import pandas as pd
import glob
import numpy as np

## Gathering variants
Gathering information on other variant types:
- Here: **AlphaMissense 10 most + least pathogenic variants per AD**
- All gnomAD variants in AD
- All Clinvar variants in AD

### Loading in the AD names

In [2]:
clinvar_gene_filepaths= glob.glob('../output/caitlin_experiment/variant fastas/*')
genes = pd.DataFrame({"filepath" : clinvar_gene_filepaths})
genes["Gene"] = genes["filepath"].str.split("/").str[-1].str.split("_").str[0]
known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv")
enst_codes = pd.read_csv("../data/SFARI_tf_ENST_codes.csv", index_col = 0)

# Adding uniprotID and ENST used
genes = pd.merge(known_ADs[["Gene", "uniprotID"]], genes, on = "Gene")
genes = pd.merge(enst_codes, genes, on = "uniprotID")
genes["ENST"] = genes["ENST"].str.split(".").str[0]

# Replacing ERG's ENST with the correct one
genes.loc[genes['Gene'] == 'ERG', 'ENST'] = 'ENST00000288319'

# Found below that these alternative transcripts corresp to the same uniprotID
genes.loc[genes['uniprotID'] == 'P26367', 'ENST'] = 'ENST00000640287'
genes.loc[genes['uniprotID'] == 'P43354', 'ENST'] = 'ENST00000409572'
genes.loc[genes['uniprotID'] == 'P78337', 'ENST'] = 'ENST00000506438'

genes

Unnamed: 0,uniprotID,ENST,Gene,filepath
0,O14770,ENST00000561208,MEIS2,../output/caitlin_experiment/variant fastas/ME...
1,O94983,ENST00000348066,CAMTA2,../output/caitlin_experiment/variant fastas/CA...
2,O94983,ENST00000348066,CAMTA2,../output/caitlin_experiment/variant fastas/CA...
3,O94983,ENST00000348066,CAMTA2,../output/caitlin_experiment/variant fastas/CA...
4,O94983,ENST00000348066,CAMTA2,../output/caitlin_experiment/variant fastas/CA...
5,P11308,ENST00000288319,ERG,../output/caitlin_experiment/variant fastas/ER...
6,P11308,ENST00000288319,ERG,../output/caitlin_experiment/variant fastas/ER...
7,P11308,ENST00000288319,ERG,../output/caitlin_experiment/variant fastas/ER...
8,P11308,ENST00000288319,ERG,../output/caitlin_experiment/variant fastas/ER...
9,P26367,ENST00000640287,PAX6,../output/caitlin_experiment/variant fastas/PA...


In [3]:
ADs = pd.DataFrame({"filepath" : clinvar_gene_filepaths})
ADs["gene"] = ADs["filepath"].str.split("fastas/").str[1].str.split("_").str[0]
ADs["uniprotID"] = ADs["filepath"].str.split("_").str[2]
ADs["start"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[0].astype(int)
ADs["end"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[1].str.split("_").str[0].astype(int)
ADs = pd.merge(ADs, genes)
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,ENST,Gene
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG
1,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG
2,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1
3,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2
6,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2
7,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1
8,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1
9,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1


### AlphaMissense

In [4]:
# What does table look like
test_chunk = pd.read_csv("../data/AlphaMissense_hg38.tsv", sep = "\t", header = 3, nrows = 5000)
test_chunk

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign
...,...,...,...,...,...,...,...,...,...,...
4995,chr1,942896,G,A,hg38,Q96NU1,ENST00000342066.8,D468N,0.2016,likely_benign
4996,chr1,942896,G,C,hg38,Q96NU1,ENST00000342066.8,D468H,0.4723,ambiguous
4997,chr1,942896,G,T,hg38,Q96NU1,ENST00000342066.8,D468Y,0.2192,likely_benign
4998,chr1,942897,A,C,hg38,Q96NU1,ENST00000342066.8,D468A,0.2714,likely_benign


Will use the protein variant column and uniprotID to merge with our ADs. 

Plan: 
- AlphaMissense table: create a foreign_key column on the uniprotID + "_" + position
- ADs table: expand to create a foreign_key column on every possible uniprotID + "_" + position combo using start and end
- Then, search file for matches on foreign_key column
- Afterwards, only keep ENST matches

In [5]:
# AlphaMissense table: create an index on the uniprot_id + "_" + position

test_chunk["prot_pos"] = test_chunk["protein_variant"].str[1:-1]
test_chunk["foreign_key"] = test_chunk["uniprot_id"] + "_" + test_chunk["prot_pos"]
test_chunk = test_chunk.drop(columns = "prot_pos")
test_chunk

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,foreign_key
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,Q8NH21_2
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,Q8NH21_2
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign,Q8NH21_2
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign,Q8NH21_2
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign,Q8NH21_2
...,...,...,...,...,...,...,...,...,...,...,...
4995,chr1,942896,G,A,hg38,Q96NU1,ENST00000342066.8,D468N,0.2016,likely_benign,Q96NU1_468
4996,chr1,942896,G,C,hg38,Q96NU1,ENST00000342066.8,D468H,0.4723,ambiguous,Q96NU1_468
4997,chr1,942896,G,T,hg38,Q96NU1,ENST00000342066.8,D468Y,0.2192,likely_benign,Q96NU1_468
4998,chr1,942897,A,C,hg38,Q96NU1,ENST00000342066.8,D468A,0.2714,likely_benign,Q96NU1_468


In [6]:
# ADs table: expand to create a foreign_key column on every possible uniprotID + "_" + position combo using start and end
# ADs = ADs[["gene", "uniprotID", "start", "end"]]

# Adding one row per position
ADs["range"] = ADs.apply(lambda row: np.arange(row["start"], row["end"] + 1), axis=1)
ADs = ADs.explode('range')

# Creating and saving foreign key
ADs["foreign_key"] = ADs["uniprotID"] + "_" + ADs["range"].astype(str)
AD_foreign_key_set = set(ADs["foreign_key"])

In [7]:
# Now, search for matches using foreign key column
# Code adapted from http://localhost:8888/notebooks/Desktop/Staller_Lab/SFARI/notebooks/AlphaMissense%20Pathogenicity%20Preds%20-%20V2.ipynb

chunksize = 100000 
am_output = pd.DataFrame()
num_vars = 0
alpha_m = "../data/AlphaMissense_hg38.tsv"


# Reading in chunks, keeping rows overlapping a variant
for chunk in pd.read_csv(alpha_m, sep='\t', chunksize=chunksize, header = 3):

    
    chunk["prot_pos"] = chunk["protein_variant"].str[1:-1]
    chunk["foreign_key"] = chunk["uniprot_id"] + "_" + chunk["prot_pos"]
    chunk = chunk.drop(columns = "prot_pos")

    # display(chunk)

    keep = chunk[chunk["foreign_key"].isin(AD_foreign_key_set)]
    am_output = pd.concat([am_output, keep])
    if len(keep):
        print("Found " + str(len(set(am_output.index))) + " variants")

        # display(joined)

Found 993 variants
Found 1920 variants
Found 3781 variants
Found 4892 variants
Found 5869 variants
Found 7072 variants
Found 7770 variants
Found 8123 variants
Found 9373 variants
Found 9694 variants
Found 10240 variants
Found 10594 variants


In [8]:
# Represent all uniprotIDs
print(len(set(am_output["uniprot_id"])))
print(len(set(ADs["uniprotID"])))

11
11


In [9]:
# Now, only keeping if matches an ENST
am_output["ENST"] = am_output["transcript_id"].str.split(".").str[0]
am_output

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,foreign_key,ENST
11140818,chr11,31789940,T,A,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287
11140819,chr11,31789940,T,G,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287
11140820,chr11,31789941,A,G,hg38,P26367,ENST00000640287.1,L421S,0.9074,likely_pathogenic,P26367_421,ENST00000640287
11140821,chr11,31789942,A,T,hg38,P26367,ENST00000640287.1,L421I,0.0876,likely_benign,P26367_421,ENST00000640287
11140822,chr11,31789942,A,C,hg38,P26367,ENST00000640287.1,L421V,0.1545,likely_benign,P26367_421,ENST00000640287
...,...,...,...,...,...,...,...,...,...,...,...,...
66593620,chr9,36882105,C,T,hg38,Q02548,ENST00000358127.9,G304D,0.7370,likely_pathogenic,Q02548_304,ENST00000358127
66593621,chr9,36882105,C,A,hg38,Q02548,ENST00000358127.9,G304V,0.6740,likely_pathogenic,Q02548_304,ENST00000358127
66593622,chr9,36923355,C,G,hg38,Q02548,ENST00000358127.9,G304R,0.8939,likely_pathogenic,Q02548_304,ENST00000358127
66593623,chr9,36923355,C,A,hg38,Q02548,ENST00000358127.9,G304C,0.5283,ambiguous,Q02548_304,ENST00000358127


In [10]:
am_output_all_ensts = am_output 
am_output = am_output[am_output["ENST"].isin(genes["ENST"])]
am_output

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,foreign_key,ENST
11140818,chr11,31789940,T,A,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287
11140819,chr11,31789940,T,G,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287
11140820,chr11,31789941,A,G,hg38,P26367,ENST00000640287.1,L421S,0.9074,likely_pathogenic,P26367_421,ENST00000640287
11140821,chr11,31789942,A,T,hg38,P26367,ENST00000640287.1,L421I,0.0876,likely_benign,P26367_421,ENST00000640287
11140822,chr11,31789942,A,C,hg38,P26367,ENST00000640287.1,L421V,0.1545,likely_benign,P26367_421,ENST00000640287
...,...,...,...,...,...,...,...,...,...,...,...,...
66593620,chr9,36882105,C,T,hg38,Q02548,ENST00000358127.9,G304D,0.7370,likely_pathogenic,Q02548_304,ENST00000358127
66593621,chr9,36882105,C,A,hg38,Q02548,ENST00000358127.9,G304V,0.6740,likely_pathogenic,Q02548_304,ENST00000358127
66593622,chr9,36923355,C,G,hg38,Q02548,ENST00000358127.9,G304R,0.8939,likely_pathogenic,Q02548_304,ENST00000358127
66593623,chr9,36923355,C,A,hg38,Q02548,ENST00000358127.9,G304C,0.5283,ambiguous,Q02548_304,ENST00000358127


In [11]:
# Some uniprotIDs don't have their ENST represented
len(set(am_output["uniprot_id"]))

11

In [12]:
# Originally, before change - Dont have AM preds for these 4 ADs

no_AM_preds = list(set(genes["uniprotID"]) - set(am_output["uniprot_id"]))
genes[genes["uniprotID"].isin(no_AM_preds)].drop_duplicates()

Unnamed: 0,uniprotID,ENST,Gene,filepath


In [13]:
am_output_all_ensts[am_output_all_ensts["uniprot_id"] == "P26367"]["ENST"].value_counts()
# This ENST matches the same isoform, P26367-1!

ENST
ENST00000640287    993
Name: count, dtype: int64

In [14]:
am_output_all_ensts[am_output_all_ensts["uniprot_id"] == "P43354"]["ENST"].value_counts()
# This ENST matches the same isoform, P43354-1!

ENST
ENST00000409572    698
Name: count, dtype: int64

In [15]:
am_output_all_ensts[am_output_all_ensts["uniprot_id"] == "P78337"]["ENST"].value_counts()
# This ENST matches the same isoform, P78337

# Changing code above for the 3 ENSTs

ENST
ENST00000506438    321
Name: count, dtype: int64

In [16]:
# Checking work to make sure protein positions line up with known AD annotations

In [17]:
target_ADs = known_ADs[known_ADs["uniprotID"].isin(genes["uniprotID"])]
target_ADs

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
48,CAMTA2,285,468,O94983,"[['O94983', 'O94983-2'], ['O94983', 'O94983-2']]",nan / ENST00000348066,O94983 / O94983,"PMID: 16678093, Soto / DelRosso et al.",TF,KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...,
49,CAMTA2,472,581,O94983,"[['O94983', 'O94983-2']]",ENST00000348066,O94983,DelRosso et al.,TF,SPAPLEPSSRVGRGEALFGGPVGASELEPFSLSSFPDLMGELISDE...,
125,ERG,433,479,P11308,[['P11308-3']],,P11308,"PMID: 9681824, Soto",TF,PHPPALPVTSSSFFAAPNPYWNSPTGGIYPNTRLPTSHMPSHLGTYY,
126,ERG,118,261,P11308,"[['P11308-1', 'P11308-3', 'P11308-5', 'P11308-...",nan / ENST00000288319,P11308 / P11308,"PMID: 14603248, Soto / DelRosso et al.",TF,MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,
267,IKZF1,284,365,Q13422,[['Q13422']],,Q13422,"PMID: 8895580, Soto",TF,GDKGLSDTPYDSSASYEKENEMMKSHVMDQAINNAINYLGAESLRP...,
333,MEIS2,340,477,O14770,[['O14770']],,O14770,"activation_regions.txt, GSL",TF,DQSNRAGFLLDPSVSQGAAYSPEGQPMGSFVLDGQQHMGIRPAGLQ...,
372,NCOA1,1,93,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3']]",,Q15788,"PMID: 9575154, Soto",TF,MSGLGDSSSDPANPDSHKRKGSPCDTLASSTEKRRREQENKYLEEL...,
373,NCOA1,1241,1385,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3']]",,Q15788,"PMID: 9427757, Soto",TF,GEANFAPSLSPGSSMVPMPIPPPQSSLLQQTPPASGYQSPDMKAWQ...,
374,NCOA1,840,1011,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3'], ['Q15788-...",nan / ENST00000288599,Q15788 / Q15788,"PMID: 9427757, 9575154, Soto / DelRosso et al.",TF,VTSVTIKSEILPASLQSATARPTSRLNRLPELELEAIDNQFGQPGT...,
411,NKX2-2,220,273,O95096,[['O95096']],,O95096,"PMID: 10944215, Soto",TF,AQDLAAATFQAGIPFSAYSAQSLQHMQYNAQYSSASTPQYPTAHPL...,


In [18]:
am_output["AA_pos"] = am_output["protein_variant"].str[1:-1].astype(int)
am_output["AA_orig"] = am_output["protein_variant"].str[0]
am_output

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,foreign_key,ENST,AA_pos,AA_orig
11140818,chr11,31789940,T,A,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287,421,L
11140819,chr11,31789940,T,G,hg38,P26367,ENST00000640287.1,L421F,0.2746,likely_benign,P26367_421,ENST00000640287,421,L
11140820,chr11,31789941,A,G,hg38,P26367,ENST00000640287.1,L421S,0.9074,likely_pathogenic,P26367_421,ENST00000640287,421,L
11140821,chr11,31789942,A,T,hg38,P26367,ENST00000640287.1,L421I,0.0876,likely_benign,P26367_421,ENST00000640287,421,L
11140822,chr11,31789942,A,C,hg38,P26367,ENST00000640287.1,L421V,0.1545,likely_benign,P26367_421,ENST00000640287,421,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66593620,chr9,36882105,C,T,hg38,Q02548,ENST00000358127.9,G304D,0.7370,likely_pathogenic,Q02548_304,ENST00000358127,304,G
66593621,chr9,36882105,C,A,hg38,Q02548,ENST00000358127.9,G304V,0.6740,likely_pathogenic,Q02548_304,ENST00000358127,304,G
66593622,chr9,36923355,C,G,hg38,Q02548,ENST00000358127.9,G304R,0.8939,likely_pathogenic,Q02548_304,ENST00000358127,304,G
66593623,chr9,36923355,C,A,hg38,Q02548,ENST00000358127.9,G304C,0.5283,ambiguous,Q02548_304,ENST00000358127,304,G


In [19]:
for index in target_ADs.index:

    gene_mismatch = False

    # For each AD, filter non delin variants to those in range
    gene = target_ADs["Gene"].loc[index]
    AD_start = target_ADs["Start"].loc[index]
    AD_end = target_ADs["End"].loc[index]
    AD_seq = target_ADs["ProteinRegionSeq"].loc[index]
    ENST = genes[genes["Gene"] == gene]["ENST"].iloc[0]

    # Keep vars within AD
    AD_vars = am_output[am_output["ENST"] == ENST]

    # display(AD_vars)
    AD_vars = AD_vars[(AD_start < AD_vars["AA_pos"]) & (AD_vars["AA_pos"] < AD_end)]
        
    # Use to check correct AA
    pos_to_aa = dict(zip(np.arange(AD_start, AD_end + 1), list(AD_seq)))
    for i in AD_vars.index:
        obs = AD_vars["AA_orig"].loc[i]
        exp = pos_to_aa[AD_vars["AA_pos"].loc[i]]
        if obs != exp:
            gene_mismatch = True

    if gene_mismatch:
        print("Check " + gene)
    else:
        print(gene + " all good!")

CAMTA2 all good!
CAMTA2 all good!
ERG all good!
ERG all good!
IKZF1 all good!
MEIS2 all good!
NCOA1 all good!
NCOA1 all good!
NCOA1 all good!
NKX2-2 all good!
NR4A2 all good!
NR4A2 all good!
OTX1 all good!
PAX5 all good!
PAX6 all good!
PITX1 all good!


### Now, for each AD- save the top 10 and bottom 10 most pathogenic variants

In [20]:
ADs = ADs.drop_duplicates()
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,ENST,Gene,range,foreign_key
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,118,P11308_118
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,119,P11308_119
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,120,P11308_120
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,121,P11308_121
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,122,P11308_122
...,...,...,...,...,...,...,...,...,...
24,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,354,Q02548_354
24,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,355,Q02548_355
24,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,356,Q02548_356
24,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,357,Q02548_357


In [21]:
most_pathogenic = pd.DataFrame()
least_pathogenic = pd.DataFrame()

for filepath in set(ADs["filepath"]):
    print(filepath)
    AD_pos_rows = ADs[ADs["filepath"] == filepath]

    # Filter am output to that AD's variants only
    AD_am_output = pd.merge(AD_pos_rows, am_output, on = "foreign_key", how = "left")
    AD_am_output = AD_am_output.dropna()
    AD_am_output = AD_am_output.sort_values(by = "am_pathogenicity", ascending = False)
    display(AD_am_output)
    
    # Sort by pathogenicity to get top 10 and bottom 10 for each AD
    most_pathogenic = pd.concat([most_pathogenic, AD_am_output.iloc[:10]])
    least_pathogenic = pd.concat([least_pathogenic, AD_am_output.iloc[-10:]])

../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
351,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,273,O95096_273,chr20,...,T,hg38,O95096,ENST00000377142.5,W273R,0.9918,likely_pathogenic,ENST00000377142,273,W
350,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,273,O95096_273,chr20,...,G,hg38,O95096,ENST00000377142.5,W273R,0.9918,likely_pathogenic,ENST00000377142,273,W
338,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,271,O95096_271,chr20,...,T,hg38,O95096,ENST00000377142.5,W271R,0.9836,likely_pathogenic,ENST00000377142,271,W
337,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,271,O95096_271,chr20,...,G,hg38,O95096,ENST00000377142.5,W271R,0.9836,likely_pathogenic,ENST00000377142,271,W
333,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,271,O95096_271,chr20,...,A,hg38,O95096,ENST00000377142.5,W271C,0.9545,likely_pathogenic,ENST00000377142,271,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,256,O95096_256,chr20,...,C,hg38,O95096,ENST00000377142.5,T256A,0.0552,likely_benign,ENST00000377142,256,T
218,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,252,O95096_252,chr20,...,C,hg38,O95096,ENST00000377142.5,S252G,0.0551,likely_benign,ENST00000377142,252,S
221,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,253,O95096_253,chr20,...,C,hg38,O95096,ENST00000377142.5,S253A,0.0544,likely_benign,ENST00000377142,253,S
272,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,ENST00000377142,NKX2-2,261,O95096_261,chr20,...,C,hg38,O95096,ENST00000377142.5,T261A,0.0516,likely_benign,ENST00000377142,261,T


../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
871,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,403,P26367_403,chr11,...,T,hg38,P26367,ENST00000640287.1,V403D,0.9995,likely_pathogenic,ENST00000640287,403.0,V
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,272,P26367_272,chr11,...,A,hg38,P26367,ENST00000640287.1,R272M,0.9989,likely_pathogenic,ENST00000640287,272.0,R
2,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,271,P26367_271,chr11,...,G,hg38,P26367,ENST00000640287.1,L271P,0.9989,likely_pathogenic,ENST00000640287,271.0,L
9,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,272,P26367_272,chr11,...,G,hg38,P26367,ENST00000640287.1,R272T,0.9988,likely_pathogenic,ENST00000640287,272.0,R
883,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,405,P26367_405,chr11,...,T,hg38,P26367,ENST00000640287.1,V405D,0.9988,likely_pathogenic,ENST00000640287,405.0,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
930,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,412,P26367_412,chr11,...,C,hg38,P26367,ENST00000640287.1,P412A,0.0472,likely_benign,ENST00000640287,412.0,P
768,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,385,P26367_385,chr11,...,A,hg38,P26367,ENST00000640287.1,P385S,0.0462,likely_benign,ENST00000640287,385.0,P
507,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,346,P26367_346,chr11,...,C,hg38,P26367,ENST00000640287.1,P346A,0.0455,likely_benign,ENST00000640287,346.0,P
767,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,ENST00000640287,PAX6,385,P26367_385,chr11,...,C,hg38,P26367,ENST00000640287.1,P385A,0.0447,likely_benign,ENST00000640287,385.0,P


../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_1241-1385


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
524,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1319,Q15788_1319,chr2,...,A,hg38,Q15788,ENST00000348332.7,V1319D,0.9993,likely_pathogenic,ENST00000348332,1319,V
512,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,C,hg38,Q15788,ENST00000348332.7,I1317T,0.9985,likely_pathogenic,ENST00000348332,1317,I
510,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,A,hg38,Q15788,ENST00000348332.7,I1317N,0.9985,likely_pathogenic,ENST00000348332,1317,I
511,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,G,hg38,Q15788,ENST00000348332.7,I1317S,0.9982,likely_pathogenic,ENST00000348332,1317,I
505,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1316,Q15788_1316,chr2,...,A,hg38,Q15788,ENST00000348332.7,S1316R,0.9950,likely_pathogenic,ENST00000348332,1316,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1363,Q15788_1363,chr2,...,C,hg38,Q15788,ENST00000348332.7,H1363P,0.0483,likely_benign,ENST00000348332,1363,H
643,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1336,Q15788_1336,chr2,...,T,hg38,Q15788,ENST00000348332.7,A1336S,0.0481,likely_benign,ENST00000348332,1336,A
212,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1273,Q15788_1273,chr2,...,G,hg38,Q15788,ENST00000348332.7,P1273A,0.0480,likely_benign,ENST00000348332,1273,P
327,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1290,Q15788_1290,chr2,...,G,hg38,Q15788,ENST00000348332.7,I1290M,0.0473,likely_benign,ENST00000348332,1290,I


../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
164,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,142,P11308_142,chr21,...,T,hg38,P11308,ENST00000288319.12,W142R,1.0000,likely_pathogenic,ENST00000288319,142,W
109,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,134,P11308_134,chr21,...,T,hg38,P11308,ENST00000288319.12,W134R,1.0000,likely_pathogenic,ENST00000288319,134,W
108,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,134,P11308_134,chr21,...,G,hg38,P11308,ENST00000288319.12,W134R,1.0000,likely_pathogenic,ENST00000288319,134,W
163,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,142,P11308_142,chr21,...,G,hg38,P11308,ENST00000288319.12,W142R,1.0000,likely_pathogenic,ENST00000288319,142,W
315,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,165,P11308_165,chr21,...,A,hg38,P11308,ENST00000288319.12,G165V,0.9999,likely_pathogenic,ENST00000288319,165,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,255,P11308_255,chr21,...,T,hg38,P11308,ENST00000288319.12,P255T,0.0491,likely_benign,ENST00000288319,255,P
902,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,254,P11308_254,chr21,...,T,hg38,P11308,ENST00000288319.12,P254T,0.0461,likely_benign,ENST00000288319,254,P
824,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,242,P11308_242,chr21,...,G,hg38,P11308,ENST00000288319.12,Q242P,0.0455,likely_benign,ENST00000288319,242,Q
906,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ENST00000288319,ERG,255,P11308_255,chr21,...,C,hg38,P11308,ENST00000288319.12,P255A,0.0423,likely_benign,ENST00000288319,255,P


../output/caitlin_experiment/variant fastas/MEIS2_O14770_AD_340-477


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
6,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,340,O14770_340,chr15,...,G,hg38,O14770,ENST00000561208.6,D340H,0.9994,likely_pathogenic,ENST00000561208,340,D
4,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,340,O14770_340,chr15,...,A,hg38,O14770,ENST00000561208.6,D340V,0.9990,likely_pathogenic,ENST00000561208,340,D
3,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,340,O14770_340,chr15,...,C,hg38,O14770,ENST00000561208.6,D340G,0.9987,likely_pathogenic,ENST00000561208,340,D
2,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,340,O14770_340,chr15,...,G,hg38,O14770,ENST00000561208.6,D340A,0.9983,likely_pathogenic,ENST00000561208,340,D
17,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,342,O14770_342,chr15,...,G,hg38,O14770,ENST00000561208.6,S342P,0.9982,likely_pathogenic,ENST00000561208,342,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,445,O14770_445,chr15,...,C,hg38,O14770,ENST00000561208.6,P445A,0.0517,likely_benign,ENST00000561208,445,P
773,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,455,O14770_455,chr15,...,G,hg38,O14770,ENST00000561208.6,Q455P,0.0513,likely_benign,ENST00000561208,455,Q
309,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,386,O14770_386,chr15,...,C,hg38,O14770,ENST00000561208.6,S386G,0.0511,likely_benign,ENST00000561208,386,S
848,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,ENST00000561208,MEIS2,466,O14770_466,chr15,...,G,hg38,O14770,ENST00000561208.6,N466T,0.0494,likely_benign,ENST00000561208,466,N


../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
626,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,936,Q15788_936,chr2,...,C,hg38,Q15788,ENST00000348332.7,L936P,0.9999,likely_pathogenic,ENST00000348332,936,L
749,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,955,Q15788_955,chr2,...,C,hg38,Q15788,ENST00000348332.7,L955P,0.9998,likely_pathogenic,ENST00000348332,955,L
724,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,951,Q15788_951,chr2,...,C,hg38,Q15788,ENST00000348332.7,L951P,0.9997,likely_pathogenic,ENST00000348332,951,L
739,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,954,Q15788_954,chr2,...,C,hg38,Q15788,ENST00000348332.7,A954P,0.9996,likely_pathogenic,ENST00000348332,954,A
730,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,952,Q15788_952,chr2,...,T,hg38,Q15788,ENST00000348332.7,D952V,0.9995,likely_pathogenic,ENST00000348332,952,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,902,Q15788_902,chr2,...,A,hg38,Q15788,ENST00000348332.7,S902N,0.0500,likely_benign,ENST00000348332,902,S
14,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,842,Q15788_842,chr2,...,G,hg38,Q15788,ENST00000348332.7,S842G,0.0500,likely_benign,ENST00000348332,842,S
918,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,983,Q15788_983,chr2,...,C,hg38,Q15788,ENST00000348332.7,I983L,0.0455,likely_benign,ENST00000348332,983,I
1017,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,ENST00000348332,NCOA1,998,Q15788_998,chr2,...,G,hg38,Q15788,ENST00000348332.7,P998A,0.0424,likely_benign,ENST00000348332,998,P


../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
303,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,281,P78337_281,chr5,...,C,hg38,P78337,ENST00000506438.5,S281R,0.9976,likely_pathogenic,ENST00000506438,281,S
302,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,281,P78337_281,chr5,...,T,hg38,P78337,ENST00000506438.5,S281R,0.9976,likely_pathogenic,ENST00000506438,281,S
307,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,281,P78337_281,chr5,...,G,hg38,P78337,ENST00000506438.5,S281R,0.9976,likely_pathogenic,ENST00000506438,281,S
311,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,282,P78337_282,chr5,...,T,hg38,P78337,ENST00000506438.5,L282Q,0.9955,likely_pathogenic,ENST00000506438,282,L
272,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,276,P78337_276,chr5,...,A,hg38,P78337,ENST00000506438.5,D276V,0.9929,likely_pathogenic,ENST00000506438,276,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,246,P78337_246,chr5,...,C,hg38,P78337,ENST00000506438.5,N246S,0.0561,likely_benign,ENST00000506438,246,N
170,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,259,P78337_259,chr5,...,T,hg38,P78337,ENST00000506438.5,P259T,0.0544,likely_benign,ENST00000506438,259,P
168,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,259,P78337_259,chr5,...,C,hg38,P78337,ENST00000506438.5,P259A,0.0513,likely_benign,ENST00000506438,259,P
169,../output/caitlin_experiment/variant fastas/PI...,PITX1,P78337,234,283,ENST00000506438,PITX1,259,P78337_259,chr5,...,A,hg38,P78337,ENST00000506438.5,P259S,0.0480,likely_benign,ENST00000506438,259,P


../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
545,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,557,O94983_557,chr17,...,G,hg38,O94983,ENST00000348066.8,W557R,0.9999,likely_pathogenic,ENST00000348066,557,W
450,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,542,O94983_542,chr17,...,T,hg38,O94983,ENST00000348066.8,W542R,0.9999,likely_pathogenic,ENST00000348066,542,W
451,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,542,O94983_542,chr17,...,G,hg38,O94983,ENST00000348066.8,W542R,0.9999,likely_pathogenic,ENST00000348066,542,W
511,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,552,O94983_552,chr17,...,G,hg38,O94983,ENST00000348066.8,L552P,0.9999,likely_pathogenic,ENST00000348066,552,L
544,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,557,O94983_557,chr17,...,T,hg38,O94983,ENST00000348066.8,W557R,0.9999,likely_pathogenic,ENST00000348066,557,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,482,O94983_482,chr17,...,C,hg38,O94983,ENST00000348066.8,V482G,0.0517,likely_benign,ENST00000348066,482,V
345,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,526,O94983_526,chr17,...,C,hg38,O94983,ENST00000348066.8,P526A,0.0505,likely_benign,ENST00000348066,526,P
52,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,480,O94983_480,chr17,...,C,hg38,O94983,ENST00000348066.8,S480G,0.0479,likely_benign,ENST00000348066,480,S
122,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,ENST00000348066,CAMTA2,492,O94983_492,chr17,...,C,hg38,O94983,ENST00000348066.8,P492A,0.0478,likely_benign,ENST00000348066,492,P


../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
135,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,453,P11308_453,chr21,...,T,hg38,P11308,ENST00000288319.12,W453R,0.9825,likely_pathogenic,ENST00000288319,453,W
134,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,453,P11308_453,chr21,...,G,hg38,P11308,ENST00000288319.12,W453R,0.9825,likely_pathogenic,ENST00000288319,453,W
175,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,460,P11308_460,chr21,...,T,hg38,P11308,ENST00000288319.12,I460K,0.9491,likely_pathogenic,ENST00000288319,460,I
90,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,446,P11308_446,chr21,...,G,hg38,P11308,ENST00000288319.12,F446L,0.9353,likely_pathogenic,ENST00000288319,446,F
84,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,446,P11308_446,chr21,...,C,hg38,P11308,ENST00000288319.12,F446L,0.9353,likely_pathogenic,ENST00000288319,446,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,450,P11308_450,chr21,...,C,hg38,P11308,ENST00000288319.12,N450S,0.0514,likely_benign,ENST00000288319,450,N
24,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,436,P11308_436,chr21,...,A,hg38,P11308,ENST00000288319.12,P436S,0.0508,likely_benign,ENST00000288319,436,P
17,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,435,P11308_435,chr21,...,C,hg38,P11308,ENST00000288319.12,P435A,0.0498,likely_benign,ENST00000288319,435,P
228,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,433,479,ENST00000288319,ERG,468,P11308_468,chr21,...,C,hg38,P11308,ENST00000288319.12,T468A,0.0485,likely_benign,ENST00000288319,468,T


../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_285-468


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
595,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,380,O94983_380,chr17,...,G,hg38,O94983,ENST00000348066.8,F380L,0.9959,likely_pathogenic,ENST00000348066,380,F
589,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,380,O94983_380,chr17,...,T,hg38,O94983,ENST00000348066.8,F380L,0.9959,likely_pathogenic,ENST00000348066,380,F
590,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,380,O94983_380,chr17,...,C,hg38,O94983,ENST00000348066.8,F380L,0.9959,likely_pathogenic,ENST00000348066,380,F
592,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,380,O94983_380,chr17,...,G,hg38,O94983,ENST00000348066.8,F380S,0.9957,likely_pathogenic,ENST00000348066,380,F
559,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,375,O94983_375,chr17,...,G,hg38,O94983,ENST00000348066.8,F375L,0.9897,likely_pathogenic,ENST00000348066,375,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,368,O94983_368,chr17,...,C,hg38,O94983,ENST00000348066.8,P368A,0.0491,likely_benign,ENST00000348066,368,P
918,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,431,O94983_431,chr17,...,G,hg38,O94983,ENST00000348066.8,Q431P,0.0491,likely_benign,ENST00000348066,431,Q
427,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,354,O94983_354,chr17,...,C,hg38,O94983,ENST00000348066.8,M354V,0.0482,likely_benign,ENST00000348066,354,M
798,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,ENST00000348066,CAMTA2,411,O94983_411,chr17,...,G,hg38,O94983,ENST00000348066.8,S411P,0.0407,likely_benign,ENST00000348066,411,S


../output/caitlin_experiment/variant fastas/NR4A2_P43354_AD_1-91


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
11,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,3,P43354_3,chr2,...,G,hg38,P43354,ENST00000409572.5,C3R,0.9992,likely_pathogenic,ENST00000409572,3.0,C
7,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,3,P43354_3,chr2,...,C,hg38,P43354,ENST00000409572.5,C3W,0.9987,likely_pathogenic,ENST00000409572,3.0,C
10,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,3,P43354_3,chr2,...,T,hg38,P43354,ENST00000409572.5,C3Y,0.9979,likely_pathogenic,ENST00000409572,3.0,C
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,3,P43354_3,chr2,...,G,hg38,P43354,ENST00000409572.5,C3S,0.9967,likely_pathogenic,ENST00000409572,3.0,C
13,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,3,P43354_3,chr2,...,T,hg38,P43354,ENST00000409572.5,C3S,0.9967,likely_pathogenic,ENST00000409572,3.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,66,P43354_66,chr2,...,C,hg38,P43354,ENST00000409572.5,N66S,0.0551,likely_benign,ENST00000409572,66.0,N
173,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,29,P43354_29,chr2,...,G,hg38,P43354,ENST00000409572.5,E29D,0.0529,likely_benign,ENST00000409572,29.0,E
172,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,29,P43354_29,chr2,...,A,hg38,P43354,ENST00000409572.5,E29D,0.0529,likely_benign,ENST00000409572,29.0,E
73,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,1,91,ENST00000409572,NR4A2,13,P43354_13,chr2,...,G,hg38,P43354,ENST00000409572.5,Q13P,0.0516,likely_benign,ENST00000409572,13.0,Q


../output/caitlin_experiment/variant fastas/NR4A2_P43354_AD_584-598


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
91,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,598,P43354_598,chr2,...,T,hg38,P43354,ENST00000409572.5,F598L,0.9994,likely_pathogenic,ENST00000409572,598,F
97,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,598,P43354_598,chr2,...,G,hg38,P43354,ENST00000409572.5,F598L,0.9994,likely_pathogenic,ENST00000409572,598,F
92,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,598,P43354_598,chr2,...,C,hg38,P43354,ENST00000409572.5,F598L,0.9994,likely_pathogenic,ENST00000409572,598,F
56,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,592,P43354_592,chr2,...,G,hg38,P43354,ENST00000409572.5,F592S,0.9991,likely_pathogenic,ENST00000409572,592,F
82,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,596,P43354_596,chr2,...,G,hg38,P43354,ENST00000409572.5,L596S,0.9991,likely_pathogenic,ENST00000409572,596,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,586,P43354_586,chr2,...,G,hg38,P43354,ENST00000409572.5,A586P,0.1422,likely_benign,ENST00000409572,586,A
17,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,586,P43354_586,chr2,...,T,hg38,P43354,ENST00000409572.5,A586T,0.1317,likely_benign,ENST00000409572,586,A
42,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,590,P43354_590,chr2,...,C,hg38,P43354,ENST00000409572.5,K590R,0.1224,likely_benign,ENST00000409572,590,K
16,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,ENST00000409572,NR4A2,586,P43354_586,chr2,...,A,hg38,P43354,ENST00000409572.5,A586S,0.1086,likely_benign,ENST00000409572,586,A


../output/caitlin_experiment/variant fastas/IKZF1_Q13422_AD_284-365


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
245,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,318,Q13422_318,chr7,...,A,hg38,Q13422,ENST00000331340.8,A318D,0.9855,likely_pathogenic,ENST00000331340,318,A
242,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,318,Q13422_318,chr7,...,C,hg38,Q13422,ENST00000331340.8,A318P,0.9841,likely_pathogenic,ENST00000331340,318,A
276,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,323,Q13422_323,chr7,...,T,hg38,Q13422,ENST00000331340.8,G323W,0.9752,likely_pathogenic,ENST00000331340,323,G
222,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,315,Q13422_315,chr7,...,A,hg38,Q13422,ENST00000331340.8,I315N,0.9746,likely_pathogenic,ENST00000331340,315,I
223,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,315,Q13422_315,chr7,...,G,hg38,Q13422,ENST00000331340.8,I315S,0.9707,likely_pathogenic,ENST00000331340,315,I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,354,Q13422_354,chr7,...,G,hg38,Q13422,ENST00000331340.8,L354V,0.0767,likely_benign,ENST00000331340,354,L
503,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,359,Q13422_359,chr7,...,A,hg38,Q13422,ENST00000331340.8,P359T,0.0746,likely_benign,ENST00000331340,359,P
496,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,358,Q13422_358,chr7,...,C,hg38,Q13422,ENST00000331340.8,T358P,0.0719,likely_benign,ENST00000331340,358,T
52,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,ENST00000331340,IKZF1,292,Q13422_292,chr7,...,G,hg38,Q13422,ENST00000331340.8,P292A,0.0705,likely_benign,ENST00000331340,292,P


../output/caitlin_experiment/variant fastas/PAX5_Q02548_AD_304-358


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
74,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,315,Q02548_315,chr9,...,C,hg38,Q02548,ENST00000358127.9,Y315D,0.9930,likely_pathogenic,ENST00000358127,315,Y
75,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,315,Q02548_315,chr9,...,G,hg38,Q02548,ENST00000358127.9,Y315H,0.9917,likely_pathogenic,ENST00000358127,315,Y
352,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,358,Q02548_358,chr9,...,G,hg38,Q02548,ENST00000358127.9,W358R,0.9884,likely_pathogenic,ENST00000358127,358,W
351,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,358,Q02548_358,chr9,...,T,hg38,Q02548,ENST00000358127.9,W358R,0.9884,likely_pathogenic,ENST00000358127,358,W
67,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,314,Q02548_314,chr9,...,T,hg38,Q02548,ENST00000358127.9,G314R,0.9880,likely_pathogenic,ENST00000358127,314,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,349,Q02548_349,chr9,...,A,hg38,Q02548,ENST00000358127.9,P349S,0.0570,likely_benign,ENST00000358127,349,P
161,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,329,Q02548_329,chr9,...,T,hg38,Q02548,ENST00000358127.9,A329T,0.0558,likely_benign,ENST00000358127,329,A
171,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,331,Q02548_331,chr9,...,C,hg38,Q02548,ENST00000358127.9,T331A,0.0547,likely_benign,ENST00000358127,331,T
226,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,ENST00000358127,PAX5,340,Q02548_340,chr9,...,G,hg38,Q02548,ENST00000358127.9,E340D,0.0484,likely_benign,ENST00000358127,340,E


../output/caitlin_experiment/variant fastas/OTX1_P32242_AD_172-354


Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
1180,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,351,P32242_351,chr2,...,C,hg38,P32242,ENST00000282549.7,F351S,0.9993,likely_pathogenic,ENST00000282549,351,F
1182,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,351,P32242_351,chr2,...,A,hg38,P32242,ENST00000282549.7,F351L,0.9992,likely_pathogenic,ENST00000282549,351,F
1183,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,351,P32242_351,chr2,...,G,hg38,P32242,ENST00000282549.7,F351L,0.9992,likely_pathogenic,ENST00000282549,351,F
1177,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,351,P32242_351,chr2,...,C,hg38,P32242,ENST00000282549.7,F351L,0.9992,likely_pathogenic,ENST00000282549,351,F
1164,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,349,P32242_349,chr2,...,A,hg38,P32242,ENST00000282549.7,W349R,0.9983,likely_pathogenic,ENST00000282549,349,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,202,P32242_202,chr2,...,G,hg38,P32242,ENST00000282549.7,N202S,0.0480,likely_benign,ENST00000282549,202,N
670,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,276,P32242_276,chr2,...,C,hg38,P32242,ENST00000282549.7,H276P,0.0471,likely_benign,ENST00000282549,276,H
821,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,297,P32242_297,chr2,...,C,hg38,P32242,ENST00000282549.7,H297P,0.0466,likely_benign,ENST00000282549,297,H
722,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,ENST00000282549,OTX1,283,P32242_283,chr2,...,C,hg38,P32242,ENST00000282549.7,H283P,0.0439,likely_benign,ENST00000282549,283,H


In [22]:
pd.DataFrame(most_pathogenic["filepath"].value_counts())

Unnamed: 0_level_0,count
filepath,Unnamed: 1_level_1
../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273,10
../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422,10
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_1241-1385,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj,10
../output/caitlin_experiment/variant fastas/MEIS2_O14770_AD_340-477,10
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011,10
../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_285-468,10


In [23]:
pd.DataFrame(least_pathogenic["filepath"].value_counts())

Unnamed: 0_level_0,count
filepath,Unnamed: 1_level_1
../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273,10
../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422,10
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_1241-1385,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj,10
../output/caitlin_experiment/variant fastas/MEIS2_O14770_AD_340-477,10
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011,10
../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_285-468,10


In [24]:
most_pathogenic.to_csv("../output/am_10_most_pathogenic_15ADs.csv")
least_pathogenic.to_csv("../output/am_10_least_pathogenic_15ADs.csv")

In [25]:
most_pathogenic["am_class"].value_counts()

am_class
likely_pathogenic    150
Name: count, dtype: int64

In [26]:
least_pathogenic["am_class"].value_counts()

am_class
likely_benign    150
Name: count, dtype: int64

In [27]:
most_pathogenic[most_pathogenic["gene"] == "NCOA1"]

Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
524,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1319,Q15788_1319,chr2,...,A,hg38,Q15788,ENST00000348332.7,V1319D,0.9993,likely_pathogenic,ENST00000348332,1319.0,V
512,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,C,hg38,Q15788,ENST00000348332.7,I1317T,0.9985,likely_pathogenic,ENST00000348332,1317.0,I
510,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,A,hg38,Q15788,ENST00000348332.7,I1317N,0.9985,likely_pathogenic,ENST00000348332,1317.0,I
511,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1317,Q15788_1317,chr2,...,G,hg38,Q15788,ENST00000348332.7,I1317S,0.9982,likely_pathogenic,ENST00000348332,1317.0,I
505,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1316,Q15788_1316,chr2,...,A,hg38,Q15788,ENST00000348332.7,S1316R,0.995,likely_pathogenic,ENST00000348332,1316.0,S
477,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1313,Q15788_1313,chr2,...,T,hg38,Q15788,ENST00000348332.7,N1313I,0.995,likely_pathogenic,ENST00000348332,1313.0,N
506,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1316,Q15788_1316,chr2,...,G,hg38,Q15788,ENST00000348332.7,S1316R,0.995,likely_pathogenic,ENST00000348332,1316.0,S
499,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1316,Q15788_1316,chr2,...,C,hg38,Q15788,ENST00000348332.7,S1316R,0.995,likely_pathogenic,ENST00000348332,1316.0,S
485,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1314,Q15788_1314,chr2,...,T,hg38,Q15788,ENST00000348332.7,N1314I,0.9841,likely_pathogenic,ENST00000348332,1314.0,N
525,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1319,Q15788_1319,chr2,...,G,hg38,Q15788,ENST00000348332.7,V1319G,0.9792,likely_pathogenic,ENST00000348332,1319.0,V


In [28]:
least_pathogenic[least_pathogenic["gene"] == "NCOA1"]

Unnamed: 0,filepath,gene,uniprotID,start,end,ENST_x,Gene,range,foreign_key,#CHROM,...,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,ENST_y,AA_pos,AA_orig
426,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1305,Q15788_1305,chr2,...,G,hg38,Q15788,ENST00000348332.7,T1305A,0.0537,likely_benign,ENST00000348332,1305.0,T
147,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1263,Q15788_1263,chr2,...,G,hg38,Q15788,ENST00000348332.7,P1263A,0.0529,likely_benign,ENST00000348332,1263.0,P
206,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1272,Q15788_1272,chr2,...,G,hg38,Q15788,ENST00000348332.7,P1272A,0.0509,likely_benign,ENST00000348332,1272.0,P
432,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1306,Q15788_1306,chr2,...,G,hg38,Q15788,ENST00000348332.7,P1306A,0.0494,likely_benign,ENST00000348332,1306.0,P
764,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1353,Q15788_1353,chr2,...,T,hg38,Q15788,ENST00000348332.7,P1353S,0.0486,likely_benign,ENST00000348332,1353.0,P
834,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1363,Q15788_1363,chr2,...,C,hg38,Q15788,ENST00000348332.7,H1363P,0.0483,likely_benign,ENST00000348332,1363.0,H
643,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1336,Q15788_1336,chr2,...,T,hg38,Q15788,ENST00000348332.7,A1336S,0.0481,likely_benign,ENST00000348332,1336.0,A
212,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1273,Q15788_1273,chr2,...,G,hg38,Q15788,ENST00000348332.7,P1273A,0.048,likely_benign,ENST00000348332,1273.0,P
327,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1290,Q15788_1290,chr2,...,G,hg38,Q15788,ENST00000348332.7,I1290M,0.0473,likely_benign,ENST00000348332,1290.0,I
646,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,ENST00000348332,NCOA1,1336,Q15788_1336,chr2,...,G,hg38,Q15788,ENST00000348332.7,A1336G,0.0365,likely_benign,ENST00000348332,1336.0,A
