In [3]:
import pandas as pd
import glob

## Gathering variants
Gathering information on other variant types:
- AlphaMissense 10 most + least pathogenic variants per AD
- All gnomAD variants in AD
- Here: **All Clinvar variants in AD**

In [7]:
clinvar_gene_filepaths= glob.glob("../data/clinvar_by_gene/*")
clinvar_gene_filepaths

['../data/clinvar_by_gene/CAMTA2.txt',
 '../data/clinvar_by_gene/NKX2-2.txt',
 '../data/clinvar_by_gene/NR4A2.txt',
 '../data/clinvar_by_gene/PITX1.txt',
 '../data/clinvar_by_gene/ERG.txt',
 '../data/clinvar_by_gene/NCOA1.txt',
 '../data/clinvar_by_gene/IKZF1.txt',
 '../data/clinvar_by_gene/MEIS2.txt',
 '../data/clinvar_by_gene/PAX5.txt',
 '../data/clinvar_by_gene/PAX6.txt']

In [12]:
genes = pd.DataFrame({"filepath" : clinvar_gene_filepaths})
genes["gene"] = genes["filepath"].str.split("/").str[-1].str.split(".txt").str[0]
genes

Unnamed: 0,filepath,gene
0,../data/clinvar_by_gene/CAMTA2.txt,CAMTA2
1,../data/clinvar_by_gene/NKX2-2.txt,NKX2-2
2,../data/clinvar_by_gene/NR4A2.txt,NR4A2
3,../data/clinvar_by_gene/PITX1.txt,PITX1
4,../data/clinvar_by_gene/ERG.txt,ERG
5,../data/clinvar_by_gene/NCOA1.txt,NCOA1
6,../data/clinvar_by_gene/IKZF1.txt,IKZF1
7,../data/clinvar_by_gene/MEIS2.txt,MEIS2
8,../data/clinvar_by_gene/PAX5.txt,PAX5
9,../data/clinvar_by_gene/PAX6.txt,PAX6


Will use the protein variant column to merge with our ADs.

Plan:

For each AD:
- Keep variants where the protein change location falls within the AD.

In [17]:
# Filter to known ADs from these genes
known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv")
ADs_on_genes = known_ADs[known_ADs["Gene"].isin(genes["gene"])]
ADs_on_genes = ADs_on_genes.reset_index(drop = True)
ADs_on_genes

Unnamed: 0,Gene,Start,End,uniprotID,Matching Isoforms,Canonical Transcript ID,orig_uniprotID,Reference,TileType,ProteinRegionSeq,Notes
0,CAMTA2,285,468,O94983,"[['O94983', 'O94983-2'], ['O94983', 'O94983-2']]",nan / ENST00000348066,O94983 / O94983,"PMID: 16678093, Soto / DelRosso et al.",TF,KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...,
1,CAMTA2,472,581,O94983,"[['O94983', 'O94983-2']]",ENST00000348066,O94983,DelRosso et al.,TF,SPAPLEPSSRVGRGEALFGGPVGASELEPFSLSSFPDLMGELISDE...,
2,ERG,433,479,P11308,[['P11308-3']],,P11308,"PMID: 9681824, Soto",TF,PHPPALPVTSSSFFAAPNPYWNSPTGGIYPNTRLPTSHMPSHLGTYY,
3,ERG,118,261,P11308,"[['P11308-1', 'P11308-3', 'P11308-5', 'P11308-...",nan / ENST00000288319,P11308 / P11308,"PMID: 14603248, Soto / DelRosso et al.",TF,MTTNERRVIVPADPTLWSTDHVRQWLEWAVKEYGLPDVNILLFQNI...,
4,IKZF1,284,365,Q13422,[['Q13422']],,Q13422,"PMID: 8895580, Soto",TF,GDKGLSDTPYDSSASYEKENEMMKSHVMDQAINNAINYLGAESLRP...,
5,MEIS2,340,470,O14770-4,[['O14770-4']],,O14770-4,"PMID: 20553494, Soto",TF,DQSNRAVSQGAAYSPEGQPMGSFVLDGQQHMGIRPAGLQSMPGDYV...,
6,MEIS2,340,477,O14770,[['O14770']],,O14770,"activation_regions.txt, GSL",TF,DQSNRAGFLLDPSVSQGAAYSPEGQPMGSFVLDGQQHMGIRPAGLQ...,
7,NCOA1,1,93,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3']]",,Q15788,"PMID: 9575154, Soto",TF,MSGLGDSSSDPANPDSHKRKGSPCDTLASSTEKRRREQENKYLEEL...,
8,NCOA1,1241,1385,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3']]",,Q15788,"PMID: 9427757, Soto",TF,GEANFAPSLSPGSSMVPMPIPPPQSSLLQQTPPASGYQSPDMKAWQ...,
9,NCOA1,840,1011,Q15788,"[['Q15788', 'Q15788-2', 'Q15788-3'], ['Q15788-...",nan / ENST00000288599,Q15788 / Q15788,"PMID: 9427757, 9575154, Soto / DelRosso et al.",TF,VTSVTIKSEILPASLQSATARPTSRLNRLPELELEAIDNQFGQPGT...,


In [19]:
# For each AD, fetch the relevant ClinVar database

index = 0

In [21]:
row = ADs_on_genes.iloc[index]
row

Gene                                                                  CAMTA2
Start                                                                    285
End                                                                      468
uniprotID                                                             O94983
Matching Isoforms           [['O94983', 'O94983-2'], ['O94983', 'O94983-2']]
Canonical Transcript ID                                nan / ENST00000348066
orig_uniprotID                                               O94983 / O94983
Reference                             PMID: 16678093, Soto / DelRosso et al.
TileType                                                                  TF
ProteinRegionSeq           KAHTSPSSSSSSSSSGFAEPLEIRPSPPTSRGGSSRGGTAILLLTG...
Notes                                                                    NaN
Name: 0, dtype: object

In [26]:
clinvar_filepath = genes[genes["gene"] == row["Gene"]]["filepath"].iloc[0]
clinvar_filepath

'../data/clinvar_by_gene/CAMTA2.txt'

In [39]:
gene_vars = pd.read_csv(clinvar_filepath, sep = "\t")
gene_vars["prot_pos"] = gene_vars["Protein change"].str[1:-1]
# gene_vars["prot_pos"] = gene_vars["prot_pos"].astype(int)
gene_vars

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,prot_pos
0,NM_015099.4(CAMTA2):c.*12C>T,CAMTA2|LOC126862471,P1230S,not specified,VCV003136935,17,4872039,17,4968744,3136935,...,"Oct 13, 2023","criteria provided, single submitter",,,,,,,,1230
1,NM_015099.4(CAMTA2):c.*10C>T,CAMTA2|LOC126862471,P1229L,not specified,VCV003263096,17,4872041,17,4968746,3263096,...,"May 1, 2024","criteria provided, single submitter",,,,,,,,1229
2,NM_015099.4(CAMTA2):c.3594G>C (p.Pro1198=),CAMTA2|LOC126862471,G1221R,not specified,VCV003136934,17,4872066,17,4968771,3136934,...,"Oct 27, 2022","criteria provided, single submitter",,,,,,,,1221
3,NM_015099.4(CAMTA2):c.3582G>A (p.Gly1194=),CAMTA2|LOC126862471,A1217T,not specified,VCV002281097,17,4872078,17,4968783,2281097,...,"Mar 31, 2022","criteria provided, single submitter",,,,,,,,1217
4,NM_015099.4(CAMTA2):c.3567C>T (p.Asn1189=),CAMTA2|LOC126862471,P1212S,not specified,VCV002247091,17,4872093,17,4968798,2247091,...,"Aug 30, 2021","criteria provided, single submitter",,,,,,,,1212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,NM_015099.4(CAMTA2):c.-32C>T,CAMTA2,P13L,not specified,VCV002307915,17,4889549,17,4986254,2307915,...,"Aug 17, 2022","criteria provided, single submitter",,,,,,,,13
81,NM_015099.4(CAMTA2):c.-39C>G,CAMTA2,L11V,not specified,VCV003136933,17,4889556,17,4986261,3136933,...,"Mar 7, 2024","criteria provided, single submitter",,,,,,,,11
82,NM_015099.4(CAMTA2):c.-42C>A,CAMTA2,P10T,not specified,VCV002290795,17,4889559,17,4986264,2290795,...,"May 25, 2022","criteria provided, single submitter",,,,,,,,10
83,NM_015099.4(CAMTA2):c.-45C>T,CAMTA2,R9W,not specified,VCV003136930,17,4889562,17,4986267,3136930,...,"Jan 4, 2022","criteria provided, single submitter",,,,,,,,9


In [38]:
test = [print(_) for _ in gene_vars["Protein change"]]
test

P1230S
P1229L
G1221R
A1217T
P1212S
R1179G, R1180G, P1202R, R1175G
R1083Q, R1085Q, R1082Q, R1106Q
R1064Q, R1065Q, R1067Q, R1088Q
E1044K, E1046K, E1067K, E1043K
P1004S, P1028S, P1007S, P1005S
E1005K, E1002K, E1026K, E1003K
P1024A, P1000A, P1001A, P1003A
E963K, E964K, E966K, E987K
V959M, V962M, V960M, V983M
R952L, R955L, R976L, R953L
E947K, E948K, E950K, E971K
M941V, M938V, M939V, M962V
K904N, K927N, K906N, K903N
A899G, A902G, A900G, A923G
G884R, G905R, G881R, G882R
D880G, D877G, D901G, D878G
S872F, S874F, S895F, S871F
V852F, V851F, V854F, V875F
V852I, V875I, V851I, V854I
S851P, S853P, S850P, S874P
Q813E, Q812E, Q815E, Q836E
R806L, R827L, R803L, R804L
R806C, R803C, R827C, R804C
A782T, A805T, A784T, A781T
R777P, R800P, R779P, R776P
R776H, R779H, R800H, R777H
A771V, A772V, A774V, A795V
N751T, N774T, N753T, N750T
E751K, E728K, E727K, E730K
S712N, S713N, S715N, S736N
R710W, R733W, R709W, R712W
H705R, H707R, H704R, H728R
R702H, R704H, R701H, R725H
R693H, R717H, R694H, R696H
V685D, V686D, V688D

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [42]:
gene_vars.iloc[5]

Name                                           NM_015099.4(CAMTA2):c.3538C>G (p.Arg1180Gly)
Gene(s)                                                                 CAMTA2|LOC126862471
Protein change                                               R1179G, R1180G, P1202R, R1175G
Condition(s)                                                                  not specified
Accession                                                                      VCV002331054
GRCh37Chromosome                                                                         17
GRCh37Location                                                                      4872209
GRCh38Chromosome                                                                         17
GRCh38Location                                                                      4968914
VariationID                                                                         2331054
AlleleID(s)                                                                     

In [44]:
# Issue- multiple protein changes listed
# Idea - make bed file of clinvar variants instead. Then use soto analysis functions

In [45]:
all_clinvar_vars = []

for filepath in clinvar_gene_filepaths:
    df = pd.read_csv(filepath, sep = "\t")
    all_clinvar_vars.append(df)

all_clinvar_vars = pd.concat(all_clinvar_vars)
all_clinvar_vars

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
0,NM_015099.4(CAMTA2):c.*12C>T,CAMTA2|LOC126862471,P1230S,not specified,VCV003136935,17,4872039,17,4968744,3136935,...,Uncertain significance,"Oct 13, 2023","criteria provided, single submitter",,,,,,,
1,NM_015099.4(CAMTA2):c.*10C>T,CAMTA2|LOC126862471,P1229L,not specified,VCV003263096,17,4872041,17,4968746,3263096,...,Uncertain significance,"May 1, 2024","criteria provided, single submitter",,,,,,,
2,NM_015099.4(CAMTA2):c.3594G>C (p.Pro1198=),CAMTA2|LOC126862471,G1221R,not specified,VCV003136934,17,4872066,17,4968771,3136934,...,Likely benign,"Oct 27, 2022","criteria provided, single submitter",,,,,,,
3,NM_015099.4(CAMTA2):c.3582G>A (p.Gly1194=),CAMTA2|LOC126862471,A1217T,not specified,VCV002281097,17,4872078,17,4968783,2281097,...,Uncertain significance,"Mar 31, 2022","criteria provided, single submitter",,,,,,,
4,NM_015099.4(CAMTA2):c.3567C>T (p.Asn1189=),CAMTA2|LOC126862471,P1212S,not specified,VCV002247091,17,4872093,17,4968798,2247091,...,Uncertain significance,"Aug 30, 2021","criteria provided, single submitter",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,NM_001368894.2(PAX6):c.3G>T (p.Met1Ile),PAX6,"M1I, A35S",Aniridia 1,VCV000492990,11,31827957,11,31806409,492990,...,Pathogenic,"Aug 2, 2017","criteria provided, single submitter",,,,,,,
226,NM_001368894.2(PAX6):c.3G>A (p.Met1Ile),PAX6,"M1I, A35T",Aniridia 1|Irido-corneo-trabecular dysgenesis,VCV000460461,11,31827957,11,31806409,460461,...,Pathogenic,"Jul 13, 2021","criteria provided, single submitter",,,,,,,
227,NM_001368894.2(PAX6):c.1A>C (p.Met1Leu),PAX6,"M1L, H34P",Aniridia 1|Irido-corneo-trabecular dysgenesis,VCV000430972,11,31827959,11,31806411,430972,...,Pathogenic,"Nov 27, 2017","criteria provided, single submitter",,,,,,,
228,NM_001368894.2(PAX6):c.1A>G (p.Met1Val),PAX6,"M1V, H34R",Irido-corneo-trabecular dysgenesis,VCV000430971,11,31827959,11,31806411,430971,...,Pathogenic,"Jul 28, 2023","criteria provided, single submitter",,,,,,,


In [51]:
pax6_vars = all_clinvar_vars[all_clinvar_vars["Name"].str.contains("PAX6")]
pax6_vars

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
0,NM_001368894.2(PAX6):c.*272T>G,ELP4|PAX6,"L328V, L411V, L464V, L479V, L478V",Autosomal dominant keratitis|Anophthalmia-micr...,VCV000877565,11,31811210,11,31789662,877565,...,Uncertain significance,"Jan 13, 2018","criteria provided, single submitter",,,,,,,
1,NM_001368894.2(PAX6):c.*247T>A,ELP4|PAX6,"F402L, F319L, F455L, F469L, F470L",Autosomal dominant keratitis|Anophthalmia-micr...,VCV000877566,11,31811235,11,31789687,877566,...,Uncertain significance,"Jan 12, 2018","criteria provided, single submitter",,,,,,,
2,NM_001368894.2(PAX6):c.*207G>A,ELP4|PAX6,"G442E, G456E, G457E, G306E, G389E",Autosomal dominant keratitis|11p partial monos...,VCV000304353,11,31811275,11,31789727,304353,...,Uncertain significance,"Jan 12, 2018","criteria provided, single submitter",,,,,,,
3,NM_001368894.2(PAX6):c.*107G>C,ELP4|PAX6,"E423Q, E409Q, E424Q, E273Q, E356Q",carboxymethyl-dextran-A2-gadolinium-DOTA|Conge...,VCV000304355,11,31811375,11,31789827,304355,...,Uncertain significance,"Jan 13, 2018","criteria provided, single submitter",,,,,,,
4,NM_001368894.2(PAX6):c.*90A>C,ELP4|PAX6,"K350T, K267T, K403T, K417T, K418T",Autosomal dominant keratitis|Anophthalmia-micr...,VCV000879183,11,31811392,11,31789844,879183,...,Uncertain significance,"Apr 27, 2017","criteria provided, single submitter",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,NM_001368894.2(PAX6):c.3G>T (p.Met1Ile),PAX6,"M1I, A35S",Aniridia 1,VCV000492990,11,31827957,11,31806409,492990,...,Pathogenic,"Aug 2, 2017","criteria provided, single submitter",,,,,,,
226,NM_001368894.2(PAX6):c.3G>A (p.Met1Ile),PAX6,"M1I, A35T",Aniridia 1|Irido-corneo-trabecular dysgenesis,VCV000460461,11,31827957,11,31806409,460461,...,Pathogenic,"Jul 13, 2021","criteria provided, single submitter",,,,,,,
227,NM_001368894.2(PAX6):c.1A>C (p.Met1Leu),PAX6,"M1L, H34P",Aniridia 1|Irido-corneo-trabecular dysgenesis,VCV000430972,11,31827959,11,31806411,430972,...,Pathogenic,"Nov 27, 2017","criteria provided, single submitter",,,,,,,
228,NM_001368894.2(PAX6):c.1A>G (p.Met1Val),PAX6,"M1V, H34R",Irido-corneo-trabecular dysgenesis,VCV000430971,11,31827959,11,31806411,430971,...,Pathogenic,"Jul 28, 2023","criteria provided, single submitter",,,,,,,


In [60]:
pax6_vars[pax6_vars["Name"].str.contains("271")]

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,...,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
59,NM_001368894.2(PAX6):c.812G>C (p.Trp271Ser),PAX6,"W121S, W190S, W257S, W282S, W56S, W204S, W338S...",Aniridia 1|Irido-corneo-trabecular dysgenesis,VCV002953082,11,31815346,11,31793798,2953082,...,Uncertain significance,"Oct 19, 2023","criteria provided, single submitter",,,,,,,


In [62]:
known_ADs[known_ADs["Gene"] == "PAX6"]["ProteinRegionSeq"].iloc[0]

'LRNQRRQASNTPSHIPISSSFSTSVYQPIPQPTTPVSSFTSGSMLGRTDTALTNTYSALPPMPSFTMANNLPMQPPVPSQTSSYSCMLPTSPSVNGRSYDTYTPPHMQTHMNSQPMGTSGTTSTGLISPGVSVPVQVPGSEPDMSQYWPRLQ'

In [50]:
all_clinvar_vars["Protein change"].str.contains(",")

0      False
1      False
2      False
3      False
4      False
       ...  
225     True
226     True
227     True
228     True
229    False
Name: Protein change, Length: 861, dtype: object

In [46]:
all_clinvar_vars[a

ValueError: Cannot mask with non-boolean array containing NA / NaN values