In [39]:
import pandas as pd
import glob
import numpy as np
import protfasta

## Summary of counts:
- Patient variants
- AlphaMissense 10 most + least pathogenic variants per AD
- All gnomAD variants in AD
- All Clinvar variants in AD
- COSMIC

### Patient variants

In [43]:
# Loading in the AD names
variant_fasta_filepaths = glob.glob('../output/caitlin_experiment/variant fastas/*')
ADs = pd.DataFrame({"filepath" : variant_fasta_filepaths})
ADs["gene"] = ADs["filepath"].str.split("fastas/").str[1].str.split("_").str[0]
ADs["uniprotID"] = ADs["filepath"].str.split("_").str[2]
ADs["start"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[0].astype(int)
ADs["end"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[1].str.split("_").str[0].astype(int)
ADs["name"] = ADs["filepath"].str.split("/").str[-1].str.split("_seq_adj").str[0]
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598


In [45]:
def add_counts_col(filepath, colname, clinvar = False):
    ADs_with_clinvar_vars = [_.split("/")[-1] for _ in glob.glob("../soto_analysis/outputs/AD_clinvar_for_15_variant_fasta/*")]

    
    patient_variant_counts = []
    for AD in ADs["name"]:
        if clinvar and AD not in ADs_with_clinvar_vars:
                patient_variant_counts.append(0)
        else:
            fasta_dict = protfasta.read_fasta(filepath + AD)
            fasta_df = pd.DataFrame({"id":fasta_dict.keys(), "seq":fasta_dict.values()})
            fasta_df = fasta_df.drop_duplicates(subset = ["seq"])
            patient_variant_counts.append(len(fasta_df))
    ADs[colname] = patient_variant_counts
    return ADs

In [47]:
add_counts_col("../soto_analysis/outputs/AD_variant_fasta/", "patient")

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,35
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,66
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,41
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3


In [49]:
add_counts_col("../soto_analysis/outputs/AD_gnomad_variant_fasta/", "gnomad")

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,gnomad
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,35,138
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,188
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,123
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,240
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,183
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,66,235
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,111
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,84
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,41,160
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3,9


In [51]:
add_counts_col("../soto_analysis/outputs/AD_clinvar_for_15_variant_fasta/", "clinvar", clinvar = True)

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,gnomad,clinvar
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,35,138,1
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,188,0
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,123,0
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,240,3
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,183,0
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,66,235,0
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,111,7
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,84,2
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,41,160,31
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3,9,1


In [53]:
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,gnomad,clinvar
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,35,138,1
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,188,0
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,123,0
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,240,3
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,183,0
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,66,235,0
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,111,7
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,84,2
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,41,160,31
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3,9,1


In [55]:
summary = ADs


In [57]:
summary = summary.drop(columns = ["filepath", "name"])
summary

Unnamed: 0,gene,uniprotID,start,end,patient,gnomad,clinvar
0,ERG,P11308,118,261,35,138,1
1,NCOA1,Q15788,840,1011,47,188,0
2,CAMTA2,O94983,472,581,41,123,0
3,OTX1,P32242,172,354,67,240,3
4,NCOA1,Q15788,1241,1385,48,183,0
5,CAMTA2,O94983,285,468,66,235,0
6,IKZF1,Q13422,284,365,36,111,7
7,NKX2-2,O95096,220,273,35,84,2
8,PAX6,P26367,271,422,41,160,31
9,NR4A2,P43354,584,598,3,9,1


In [59]:
summary.sum(numeric_only=True, axis=0)

start      5799
end        7406
patient     555
gnomad     1939
clinvar      65
dtype: int64

In [61]:
cc_names = pd.read_excel("../data/gene_names_with_location_and_pmvs.xlsx")
cc_names

Unnamed: 0,pMVS #,Gene Name,gene,start,end
0,377,MEIS2_AD,MEIS2,340,477
1,402,IKZF1_AD,IKZF1,284,365
2,405,CAMTA2_AD2,CAMTA2,472,581
3,409,PITX1_AD,PITX1,234,283
4,383,NR4A2_AD2,NR4A2,584,598
5,404,CAMTA2_AD1,CAMTA2,285,468
6,407,OTX1_AD,OTX1,172,354
7,410,PAX5_AD,PAX5,304,358
8,403,PAX6_AD,PAX6,271,422
9,379,NCOA1_AD2,NCOA1,1241,1385


In [63]:
summary = pd.merge(cc_names, summary)
summary

Unnamed: 0,pMVS #,Gene Name,gene,start,end,uniprotID,patient,gnomad,clinvar
0,377,MEIS2_AD,MEIS2,340,477,O14770,66,184,3
1,402,IKZF1_AD,IKZF1,284,365,Q13422,36,111,7
2,405,CAMTA2_AD2,CAMTA2,472,581,O94983,41,123,0
3,409,PITX1_AD,PITX1,234,283,P78337,23,76,3
4,383,NR4A2_AD2,NR4A2,584,598,P43354,3,9,1
5,404,CAMTA2_AD1,CAMTA2,285,468,O94983,66,235,0
6,407,OTX1_AD,OTX1,172,354,P32242,67,240,3
7,410,PAX5_AD,PAX5,304,358,Q02548,15,70,12
8,403,PAX6_AD,PAX6,271,422,P26367,41,160,31
9,379,NCOA1_AD2,NCOA1,1241,1385,Q15788,48,183,0


In [65]:
# Addign cosmic

cosmic_counts = pd.read_csv("../output/caitlin_experiment/cosmic_counts.csv")
cosmic_counts = cosmic_counts.rename(columns = {"count" : "cosmic"})
cosmic_counts

Unnamed: 0,Gene Name,cosmic
0,OTX1_AD,79
1,ERG_AD2,73
2,IKZF1_AD,73
3,NCOA1_AD3,56
4,MEIS2_AD,55
5,PAX5_AD,52
6,NCOA1_AD2,46
7,PAX6_AD,46
8,CAMTA2_AD1,43
9,CAMTA2_AD2,30


In [67]:
summary = pd.merge(summary, cosmic_counts)
summary

Unnamed: 0,pMVS #,Gene Name,gene,start,end,uniprotID,patient,gnomad,clinvar,cosmic
0,377,MEIS2_AD,MEIS2,340,477,O14770,66,184,3,55
1,402,IKZF1_AD,IKZF1,284,365,Q13422,36,111,7,73
2,405,CAMTA2_AD2,CAMTA2,472,581,O94983,41,123,0,30
3,409,PITX1_AD,PITX1,234,283,P78337,23,76,3,18
4,383,NR4A2_AD2,NR4A2,584,598,P43354,3,9,1,4
5,404,CAMTA2_AD1,CAMTA2,285,468,O94983,66,235,0,43
6,407,OTX1_AD,OTX1,172,354,P32242,67,240,3,79
7,410,PAX5_AD,PAX5,304,358,Q02548,15,70,12,52
8,403,PAX6_AD,PAX6,271,422,P26367,41,160,31,46
9,379,NCOA1_AD2,NCOA1,1241,1385,Q15788,48,183,0,46


In [69]:
phosphosite_counts = pd.read_csv("../output/caitlin_experiment/phosphosite_counts.csv")
phosphosite_counts = phosphosite_counts.rename(columns = {"count" : "PhosphositePlus"})
phosphosite_counts

Unnamed: 0,Gene Name,PhosphositePlus
0,IKZF1_AD,14
1,CAMTA2_AD1,10
2,NCOA1_AD3,5
3,ERG_AD2,4
4,NCOA1_AD2,4
5,NR4A2_AD1,1
6,OTX1_AD,1
7,PAX6_AD,1


In [71]:
summary = pd.merge(summary, phosphosite_counts, how = "left")
summary = summary.fillna(0)
summary["PhosphositePlus"] = summary["PhosphositePlus"].astype(int)
summary

Unnamed: 0,pMVS #,Gene Name,gene,start,end,uniprotID,patient,gnomad,clinvar,cosmic,PhosphositePlus
0,377,MEIS2_AD,MEIS2,340,477,O14770,66,184,3,55,0
1,402,IKZF1_AD,IKZF1,284,365,Q13422,36,111,7,73,14
2,405,CAMTA2_AD2,CAMTA2,472,581,O94983,41,123,0,30,0
3,409,PITX1_AD,PITX1,234,283,P78337,23,76,3,18,0
4,383,NR4A2_AD2,NR4A2,584,598,P43354,3,9,1,4,0
5,404,CAMTA2_AD1,CAMTA2,285,468,O94983,66,235,0,43,10
6,407,OTX1_AD,OTX1,172,354,P32242,67,240,3,79,1
7,410,PAX5_AD,PAX5,304,358,Q02548,15,70,12,52,0
8,403,PAX6_AD,PAX6,271,422,P26367,41,160,31,46,1
9,379,NCOA1_AD2,NCOA1,1241,1385,Q15788,48,183,0,46,4


In [73]:
summary["length"] = summary["end"].astype(int) - summary["start"].astype(int) + 1
summary["variant_sum"] = summary["patient"] + summary["gnomad"] + summary["clinvar"] + summary["cosmic"]
summary = summary.sort_values(by = "length", ascending = False)
summary

Unnamed: 0,pMVS #,Gene Name,gene,start,end,uniprotID,patient,gnomad,clinvar,cosmic,PhosphositePlus,length,variant_sum
5,404,CAMTA2_AD1,CAMTA2,285,468,O94983,66,235,0,43,10,184,344
6,407,OTX1_AD,OTX1,172,354,P32242,67,240,3,79,1,183,389
13,380,NCOA1_AD3,NCOA1,840,1011,Q15788,47,188,0,56,5,172,291
8,403,PAX6_AD,PAX6,271,422,P26367,41,160,31,46,1,152,278
9,379,NCOA1_AD2,NCOA1,1241,1385,Q15788,48,183,0,46,4,145,277
12,406,ERG_AD2,ERG,118,261,P11308,35,138,1,73,4,144,247
0,377,MEIS2_AD,MEIS2,340,477,O14770,66,184,3,55,0,138,308
2,405,CAMTA2_AD2,CAMTA2,472,581,O94983,41,123,0,30,0,110,194
11,382,NR4A2_AD1,NR4A2,1,91,P43354,13,81,2,26,1,91,122
1,402,IKZF1_AD,IKZF1,284,365,Q13422,36,111,7,73,14,82,227


In [83]:
total = 0
for column in ["patient", "gnomad", "clinvar", "cosmic"]:
    print(column)
    print(sum(summary[column]))
    total += sum(summary[column])
print()

print("total")
print(total - 555)

patient
555
gnomad
1939
clinvar
65
cosmic
646

total
2650
