In [65]:
import pandas as pd
import glob
import numpy as np

## Gathering variants
Gathering information on other variant types:
- Here: **AlphaMissense 10 most + least pathogenic variants per AD**
- All gnomAD variants in AD
- All Clinvar variants in AD

### Loading in the AD names

In [104]:
variant_fasta_filepaths = glob.glob('../output/caitlin_experiment/variant fastas/*')
variant_fasta_filepaths

['../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj',
 '../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011',
 '../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581',
 '../output/caitlin_experiment/variant fastas/OTX1_P32242_AD_172-354',
 '../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_1241-1385',
 '../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_285-468',
 '../output/caitlin_experiment/variant fastas/IKZF1_Q13422_AD_284-365',
 '../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273',
 '../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422',
 '../output/caitlin_experiment/variant fastas/NR4A2_P43354_AD_584-598',
 '../output/caitlin_experiment/variant fastas/MEIS2_O14770_AD_340-477',
 '../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283',
 '../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj',
 '../output/caitlin_experiment/variant fastas/NR

In [105]:
ADs = pd.DataFrame({"filepath" : variant_fasta_filepaths})
ADs["gene"] = ADs["filepath"].str.split("fastas/").str[1].str.split("_").str[0]
ADs["uniprotID"] = ADs["filepath"].str.split("_").str[2]
ADs["start"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[0].astype(int)
ADs["end"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[1].str.split("_").str[0].astype(int)
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598


### AlphaMissense

In [106]:
# What does table look like
test_chunk = pd.read_csv("../data/AlphaMissense_hg38.tsv", sep = "\t", header = 3, nrows = 5000)
test_chunk

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign
...,...,...,...,...,...,...,...,...,...,...
4995,chr1,942896,G,A,hg38,Q96NU1,ENST00000342066.8,D468N,0.2016,likely_benign
4996,chr1,942896,G,C,hg38,Q96NU1,ENST00000342066.8,D468H,0.4723,ambiguous
4997,chr1,942896,G,T,hg38,Q96NU1,ENST00000342066.8,D468Y,0.2192,likely_benign
4998,chr1,942897,A,C,hg38,Q96NU1,ENST00000342066.8,D468A,0.2714,likely_benign


Will use the protein variant column and uniprotID to merge with our ADs. 

Plan: 
- AlphaMissense table: create a foreign_key column on the uniprot_id + "_" + position
- ADs table: expand to create a foreign_key column on every possible uniprotID + "_" + position combo using start and end
- Then, search file for matches on foreign_key column

In [107]:
# AlphaMissense table: create an index on the uniprot_id + "_" + position

test_chunk["prot_pos"] = test_chunk["protein_variant"].str[1:-1]
test_chunk["foreign_key"] = test_chunk["uniprot_id"] + "_" + test_chunk["prot_pos"]
test_chunk = test_chunk.drop(columns = "prot_pos")
test_chunk

Unnamed: 0,#CHROM,POS,REF,ALT,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,foreign_key
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,Q8NH21_2
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,Q8NH21_2
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign,Q8NH21_2
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign,Q8NH21_2
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign,Q8NH21_2
...,...,...,...,...,...,...,...,...,...,...,...
4995,chr1,942896,G,A,hg38,Q96NU1,ENST00000342066.8,D468N,0.2016,likely_benign,Q96NU1_468
4996,chr1,942896,G,C,hg38,Q96NU1,ENST00000342066.8,D468H,0.4723,ambiguous,Q96NU1_468
4997,chr1,942896,G,T,hg38,Q96NU1,ENST00000342066.8,D468Y,0.2192,likely_benign,Q96NU1_468
4998,chr1,942897,A,C,hg38,Q96NU1,ENST00000342066.8,D468A,0.2714,likely_benign,Q96NU1_468


In [108]:
# ADs table: expand to create a foreign_key column on every possible uniprotID + "_" + position combo using start and end
# ADs = ADs[["gene", "uniprotID", "start", "end"]]

# Adding one row per position
ADs["range"] = ADs.apply(lambda row: np.arange(row["start"], row["end"] + 1), axis=1)
ADs = ADs.explode('range')

# Creating and saving foreign key
ADs["foreign_key"] = ADs["uniprotID"] + "_" + ADs["range"].astype(str)
AD_foreign_key_set = set(ADs["foreign_key"])

In [80]:
# Now, search for matches using foreign key column
# Code adapted from http://localhost:8888/notebooks/Desktop/Staller_Lab/SFARI/notebooks/AlphaMissense%20Pathogenicity%20Preds%20-%20V2.ipynb

chunksize = 100000 
am_output = pd.DataFrame()
num_vars = 0
alpha_m = "../data/AlphaMissense_hg38.tsv"


# Reading in chunks, keeping rows overlapping a variant
for chunk in pd.read_csv(alpha_m, sep='\t', chunksize=chunksize, header = 3):

    
    chunk["prot_pos"] = chunk["protein_variant"].str[1:-1]
    chunk["foreign_key"] = chunk["uniprot_id"] + "_" + chunk["prot_pos"]
    chunk = chunk.drop(columns = "prot_pos")

    # display(chunk)

    keep = chunk[chunk["foreign_key"].isin(AD_foreign_key_set)]
    am_output = pd.concat([am_output, keep])
    if len(keep):
        print("Found " + str(len(set(am_output.index))) + " variants")

        # display(joined)

Found 993 variants
Found 1920 variants
Found 3781 variants
Found 4892 variants
Found 5869 variants
Found 7072 variants
Found 7770 variants
Found 8123 variants
Found 9373 variants
Found 9694 variants
Found 10240 variants
Found 10594 variants


In [109]:
# Does this match how many variants I expected?
# Expect number of positions * 20 residues

len(AD_foreign_key_set) * 20

32440

In [110]:
# Represents all uniprotIDs ✅
print(len(set(am_output["uniprot_id"])))
print(len(set(ADs["uniprotID"])))

11
11


### Now, for each AD- save the top 10 and bottom 10 most pathogenic variants

In [111]:
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,range,foreign_key
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,118,P11308_118
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,119,P11308_119
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,120,P11308_120
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,121,P11308_121
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,122,P11308_122
...,...,...,...,...,...,...,...
14,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,354,Q02548_354
14,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,355,Q02548_355
14,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,356,Q02548_356
14,../output/caitlin_experiment/variant fastas/PA...,PAX5,Q02548,304,358,357,Q02548_357


In [146]:
most_pathogenic = pd.DataFrame()
least_pathogenic = pd.DataFrame()

for filepath in set(ADs["filepath"]):
    AD_pos_rows = ADs[ADs["filepath"] == filepath]

    # Filter am output to that AD's variants only
    AD_am_output = pd.merge(AD_pos_rows, am_output, on = "foreign_key", how = "left")
    AD_am_output = AD_am_output.sort_values(by = "am_pathogenicity", ascending = False)

    # Sort by pathogenicity to get top 10 and bottom 10 for each AD
    most_pathogenic = pd.concat([most_pathogenic, AD_am_output.iloc[:10]])
    least_pathogenic = pd.concat([least_pathogenic, AD_am_output.iloc[:10]])

In [147]:
pd.DataFrame(most_pathogenic["filepath"].value_counts())

Unnamed: 0_level_0,count
filepath,Unnamed: 1_level_1
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj,10
../output/caitlin_experiment/variant fastas/IKZF1_Q13422_AD_284-365,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj,10
../output/caitlin_experiment/variant fastas/NR4A2_P43354_AD_1-91,10
../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581,10
../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283,10
../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273,10
../output/caitlin_experiment/variant fastas/PAX5_Q02548_AD_304-358,10


In [148]:
pd.DataFrame(least_pathogenic["filepath"].value_counts())

Unnamed: 0_level_0,count
filepath,Unnamed: 1_level_1
../output/caitlin_experiment/variant fastas/NCOA1_Q15788_AD_840-1011,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_118-261_seq_adj,10
../output/caitlin_experiment/variant fastas/IKZF1_Q13422_AD_284-365,10
../output/caitlin_experiment/variant fastas/ERG_P11308_AD_433-479_seq_adj,10
../output/caitlin_experiment/variant fastas/NR4A2_P43354_AD_1-91,10
../output/caitlin_experiment/variant fastas/PAX6_P26367_AD_271-422,10
../output/caitlin_experiment/variant fastas/CAMTA2_O94983_AD_472-581,10
../output/caitlin_experiment/variant fastas/PITX1_P78337_AD_234-283,10
../output/caitlin_experiment/variant fastas/NKX2-2_O95096_AD_220-273,10
../output/caitlin_experiment/variant fastas/PAX5_Q02548_AD_304-358,10
