In [23]:
import pandas as pd
import glob
import numpy as np
import protfasta

## Summary of counts:
- Patient variants
- AlphaMissense 10 most + least pathogenic variants per AD
- All gnomAD variants in AD
- All Clinvar variants in AD

### Patient variants

In [24]:
# Loading in the AD names
variant_fasta_filepaths = glob.glob('../output/caitlin_experiment/variant fastas/*')
ADs = pd.DataFrame({"filepath" : variant_fasta_filepaths})
ADs["gene"] = ADs["filepath"].str.split("fastas/").str[1].str.split("_").str[0]
ADs["uniprotID"] = ADs["filepath"].str.split("_").str[2]
ADs["start"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[0].astype(int)
ADs["end"] = ADs["filepath"].str.split("_AD_").str[1].str.split("-").str[1].str.split("_").str[0].astype(int)
ADs["name"] = ADs["filepath"].str.split("/").str[-1].str.split("_seq_adj").str[0]
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598


In [25]:
patient_variant_counts = []
for AD in ADs["name"]:
    patient_variant_counts.append(len(protfasta.read_fasta("../soto_analysis/outputs/AD_variant_fasta/" + AD)))
ADs["patient"] = patient_variant_counts
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,37
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,67
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,42
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3


### AlphaMissense
10 most pathogenic + 10 least pathogenic = 20 variants per AD

In [26]:
ADs["am_top"] = 10
ADs["am_bottom"] = 10
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,am_top,am_bottom
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,37,10,10
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,10,10
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,10,10
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,10,10
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,10,10
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,67,10,10
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,10,10
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,10,10
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,42,10,10
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3,10,10


### gnomAD

In [27]:
gnomAD_counts = []
for AD in ADs["name"]:
    gnomAD_counts.append(len(protfasta.read_fasta("../soto_analysis/outputs/AD_gnomad_variant_fasta/" + AD)))
ADs["gnomAD"] = gnomAD_counts
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,am_top,am_bottom,gnomAD
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,37,10,10,135
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,10,10,190
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,10,10,127
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,10,10,244
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,10,10,187
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,67,10,10,238
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,10,10,113
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,10,10,86
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,42,10,10,163
9,../output/caitlin_experiment/variant fastas/NR...,NR4A2,P43354,584,598,NR4A2_P43354_AD_584-598,3,10,10,9


In [28]:
ADs = ADs.sort_values(by = "gnomAD", ascending = False)
ADs

Unnamed: 0,filepath,gene,uniprotID,start,end,name,patient,am_top,am_bottom,gnomAD
3,../output/caitlin_experiment/variant fastas/OT...,OTX1,P32242,172,354,OTX1_P32242_AD_172-354,67,10,10,244
5,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,285,468,CAMTA2_O94983_AD_285-468,67,10,10,238
1,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,840,1011,NCOA1_Q15788_AD_840-1011,47,10,10,190
4,../output/caitlin_experiment/variant fastas/NC...,NCOA1,Q15788,1241,1385,NCOA1_Q15788_AD_1241-1385,48,10,10,187
10,../output/caitlin_experiment/variant fastas/ME...,MEIS2,O14770,340,477,MEIS2_O14770_AD_340-477,67,10,10,186
8,../output/caitlin_experiment/variant fastas/PA...,PAX6,P26367,271,422,PAX6_P26367_AD_271-422,42,10,10,163
0,../output/caitlin_experiment/variant fastas/ER...,ERG,P11308,118,261,ERG_P11308_AD_118-261,37,10,10,135
2,../output/caitlin_experiment/variant fastas/CA...,CAMTA2,O94983,472,581,CAMTA2_O94983_AD_472-581,41,10,10,127
6,../output/caitlin_experiment/variant fastas/IK...,IKZF1,Q13422,284,365,IKZF1_Q13422_AD_284-365,36,10,10,113
7,../output/caitlin_experiment/variant fastas/NK...,NKX2-2,O95096,220,273,NKX2-2_O95096_AD_220-273,35,10,10,86


### Summary

In [29]:
summary = ADs[["name", "patient", "gnomAD", "am_top", "am_bottom"]]
summary

Unnamed: 0,name,patient,gnomAD,am_top,am_bottom
3,OTX1_P32242_AD_172-354,67,244,10,10
5,CAMTA2_O94983_AD_285-468,67,238,10,10
1,NCOA1_Q15788_AD_840-1011,47,190,10,10
4,NCOA1_Q15788_AD_1241-1385,48,187,10,10
10,MEIS2_O14770_AD_340-477,67,186,10,10
8,PAX6_P26367_AD_271-422,42,163,10,10
0,ERG_P11308_AD_118-261,37,135,10,10
2,CAMTA2_O94983_AD_472-581,41,127,10,10
6,IKZF1_Q13422_AD_284-365,36,113,10,10
7,NKX2-2_O95096_AD_220-273,35,86,10,10


In [30]:
summary["variant_sum"] = summary["patient"] + summary["gnomAD"] + 20
summary = summary.sort_values(by = "variant_sum", ascending = False)
summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary["variant_sum"] = summary["patient"] + summary["gnomAD"] + 20


Unnamed: 0,name,patient,gnomAD,am_top,am_bottom,variant_sum
3,OTX1_P32242_AD_172-354,67,244,10,10,331
5,CAMTA2_O94983_AD_285-468,67,238,10,10,325
10,MEIS2_O14770_AD_340-477,67,186,10,10,273
1,NCOA1_Q15788_AD_840-1011,47,190,10,10,257
4,NCOA1_Q15788_AD_1241-1385,48,187,10,10,255
8,PAX6_P26367_AD_271-422,42,163,10,10,225
0,ERG_P11308_AD_118-261,37,135,10,10,192
2,CAMTA2_O94983_AD_472-581,41,127,10,10,188
6,IKZF1_Q13422_AD_284-365,36,113,10,10,169
7,NKX2-2_O95096_AD_220-273,35,86,10,10,141
