In [2]:
import pandas as pd
import numpy as np
import itertools
import os

In [3]:
uniprotID_ENST_mapping = pd.read_csv("../data/SFARI_TFs_with_ENST.csv")
uniprotID_ENST_mapping = uniprotID_ENST_mapping[["uniprotID", "ENST"]]
uniprotID_ENST_mapping["ENST"] = uniprotID_ENST_mapping["ENST"].str.split(".").str[0]
uniprotID_ENST_mapping_dict= dict(zip(uniprotID_ENST_mapping["uniprotID"], uniprotID_ENST_mapping["ENST"]))
uniprotID_ENST_mapping_dict['O60479'] = 'ENST00000434704'

In [4]:
d_domains = "../soto_analysis/outputs/mutations/domains_bed_format/"
uniprotIDs =[f for f in os.listdir(d_domains) if os.path.isfile(os.path.join(d_domains, f))]

In [36]:
def candidate_check(uniprotID):
    # Helper functions
    # Checks if range contains the AD
    def range_is_within(dom_start, dom_end, exon_start, exon_end):
        return (dom_start >= exon_start) and (dom_end <= exon_end)


    # Returns ranges from list
    # https://stackoverflow.com/questions/4628333/converting-a-list-of-integers-into-range-in-python
    def ranges(i):
        def ranges_helper(i):
            for a, b in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):
                b = list(b)
                yield b[0][1], b[-1][1]

        return list(ranges_helper(i))

    # Lengths of ranges
    def range_lengths(ranges):
        lengths = []

        for exon_range in ranges:
            start_index = exon_range[0]
            end_index = exon_range[1]

            lengths.append(sum(TF_cds.iloc[start_index:end_index + 1]["exon_len"]))

        return lengths

    # Whether all ranges are divisible by 3
    def ranges_divis_3(ranges):
        range_lengths_full = range_lengths(ranges)
        return all([_%3 == 0 for _ in range_lengths_full])
    
    
    ENST = uniprotID_ENST_mapping_dict[uniprotID]
    
    # TF CDS coordinates
    TF_cds = pd.read_csv("../soto_analysis/outputs/mutations/cds_bed_format/" + ENST, sep = "\t", header = None)
    TF_cds["exon_len"] = (TF_cds[2] - TF_cds[1])
    TF_cds = TF_cds.sort_values(by = 2, ascending = True).reset_index(drop = True)

    # AD CDS coordinates
    AD_cds = pd.read_csv("../soto_analysis/outputs/mutations/domains_bed_format/" + uniprotID, sep = "\t", header = None)
    AD_cds = AD_cds[AD_cds[3] == "AD"]

    contains_AD_status = []

    # Checking whether each exon contains an AD region
    for i in TF_cds.index:
        contains_AD = False
        for j in AD_cds.index:
            dom_start = AD_cds[1].loc[j]
            dom_end = AD_cds[2].loc[j]
            exon_start = TF_cds[1].loc[i]
            exon_end = TF_cds[2].loc[i]

            contains_AD = contains_AD or range_is_within(dom_start,dom_end,exon_start,exon_end)

        contains_AD_status.append(contains_AD)      
    TF_cds["has_AD"] = contains_AD_status
    
    if TF_cds["has_AD"].iloc[0] == True:
        print(uniprotID + " is not a candidate.")
        print("First exon has AD.")
        print("-----")
        return False
    
    if TF_cds["has_AD"].iloc[-1] == True:
        print(uniprotID + " is not a candidate.")
        print("Last exon has AD.")
        print("-----")
        return False

    # Sets of indexes of exons with ADs or no ADs
    AD_exons = set(TF_cds[TF_cds["has_AD"] == True].index)
    all_exons = set(np.arange(len(TF_cds)))
    no_AD_exons = all_exons - AD_exons

    # Turning exon index sets into ranges
    no_AD_exon_ranges = ranges(no_AD_exons)
    AD_exon_ranges = ranges(AD_exons)
    
    
    # Check whether all surrounding exon ranges and the included exons are divisible by 3
    result = ranges_divis_3(no_AD_exon_ranges) and ranges_divis_3(AD_exon_ranges)
    
    if not result:
        print(uniprotID + " is not a candidate.")
    else:
        print(uniprotID + " IS A CANDIDATE!")
        print(ENST)
        
        
    if not ranges_divis_3(no_AD_exon_ranges):
        print("At least one exon range between AD exons not divisible by 3.")
        
    if not ranges_divis_3(AD_exon_ranges):
        print("At least one range of AD exons not divisible by 3.")
    
    display(TF_cds)
    print("-----")
    
    return result




In [37]:
results = []

for uniprotID in uniprotIDs:
    results.append(candidate_check(uniprotID))

Q14938 is not a candidate.
Last exon has AD.
-----
O14770 is not a candidate.
First exon has AD.
-----
P10827 is not a candidate.
First exon has AD.
-----
P11308 is not a candidate.
First exon has AD.
-----
Q92731 is not a candidate.
First exon has AD.
-----
P35398 is not a candidate.
At least one exon range between AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,15,60497454,60497619,ENSG00000069667,ENST00000335670,-,165,False
1,15,60499891,60500004,ENSG00000069667,ENST00000335670,-,113,False
2,15,60500958,60501069,ENSG00000069667,ENST00000335670,-,111,False
3,15,60502759,60502867,ENSG00000069667,ENST00000335670,-,108,True
4,15,60503534,60503667,ENSG00000069667,ENST00000335670,-,133,True
5,15,60505507,60505629,ENSG00000069667,ENST00000335670,-,122,True
6,15,60511225,60511621,ENSG00000069667,ENST00000335670,-,396,True
7,15,60514615,60514757,ENSG00000069667,ENST00000335670,-,142,False
8,15,60531765,60531851,ENSG00000069667,ENST00000335670,-,86,False
9,15,60678656,60678686,ENSG00000069667,ENST00000335670,-,30,False


-----
Q6N021 is not a candidate.
Last exon has AD.
-----
Q14872 is not a candidate.
First exon has AD.
-----
Q02548 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,9,36840559,36840636,ENSG00000196092,ENST00000358127,-,77,False
1,9,36846842,36846929,ENSG00000196092,ENST00000358127,-,87,True
2,9,36882003,36882105,ENSG00000196092,ENST00000358127,-,102,True
3,9,36923354,36923484,ENSG00000196092,ENST00000358127,-,130,True
4,9,36966548,36966724,ENSG00000196092,ENST00000358127,-,176,False
5,9,37002647,37002776,ENSG00000196092,ENST00000358127,-,129,False
6,9,37006472,37006537,ENSG00000196092,ENST00000358127,-,65,False
7,9,37014996,37015194,ENSG00000196092,ENST00000358127,-,198,False
8,9,37020635,37020801,ENSG00000196092,ENST00000358127,-,166,False
9,9,37033985,37034031,ENSG00000196092,ENST00000358127,-,46,False


-----
Q86V15 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,1,10638941,10640059,ENSG00000130940,ENST00000377022,-,1118,False
1,1,10642858,10643000,ENSG00000130940,ENST00000377022,-,142,False
2,1,10643159,10643311,ENSG00000130940,ENST00000377022,-,152,False
3,1,10644916,10645088,ENSG00000130940,ENST00000377022,-,172,False
4,1,10646127,10646326,ENSG00000130940,ENST00000377022,-,199,False
5,1,10647800,10648139,ENSG00000130940,ENST00000377022,-,339,False
6,1,10649069,10649192,ENSG00000130940,ENST00000377022,-,123,False
7,1,10649282,10649437,ENSG00000130940,ENST00000377022,-,155,False
8,1,10650691,10650755,ENSG00000130940,ENST00000377022,-,64,False
9,1,10650940,10651076,ENSG00000130940,ENST00000377022,-,136,False


-----
Q9UGU0 is not a candidate.
Last exon has AD.
-----
P25490 is not a candidate.
First exon has AD.
-----
Q9NQB0 is not a candidate.
First exon has AD.
-----
P26367 is not a candidate.
First exon has AD.
-----
Q03164 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,11,118436512,118436944,ENSG00000118058,ENST00000389506,+,432,False
1,11,118468774,118468844,ENSG00000118058,ENST00000389506,+,70,False
2,11,118471661,118474315,ENSG00000118058,ENST00000389506,+,2654,False
3,11,118476804,118476982,ENSG00000118058,ENST00000389506,+,178,False
4,11,118477966,118478201,ENSG00000118058,ENST00000389506,+,235,False
5,11,118480173,118480238,ENSG00000118058,ENST00000389506,+,65,False
6,11,118481714,118482092,ENSG00000118058,ENST00000389506,+,378,False
7,11,118482421,118482495,ENSG00000118058,ENST00000389506,+,74,False
8,11,118484182,118484314,ENSG00000118058,ENST00000389506,+,132,False
9,11,118484861,118484975,ENSG00000118058,ENST00000389506,+,114,False


-----
Q6P1N0 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,19,13906441,13906501,ENSG00000132024,ENST00000318003,+,60,False
1,19,13909822,13909958,ENSG00000132024,ENST00000318003,+,136,True
2,19,13912322,13912438,ENSG00000132024,ENST00000318003,+,116,False
3,19,13912527,13912593,ENSG00000132024,ENST00000318003,+,66,False
4,19,13913167,13913302,ENSG00000132024,ENST00000318003,+,135,False
5,19,13913403,13913638,ENSG00000132024,ENST00000318003,+,235,False
6,19,13918069,13918194,ENSG00000132024,ENST00000318003,+,125,False
7,19,13918503,13918576,ENSG00000132024,ENST00000318003,+,73,False
8,19,13918745,13918817,ENSG00000132024,ENST00000318003,+,72,False
9,19,13918911,13919042,ENSG00000132024,ENST00000318003,+,131,False


-----
Q9Y4A8 is not a candidate.
Last exon has AD.
-----
O60479 is not a candidate.
First exon has AD.
-----
Q12857 is not a candidate.
First exon has AD.
-----
P78337 is not a candidate.
First exon has AD.
-----
Q96QS3 is not a candidate.
First exon has AD.
-----
P19532 is not a candidate.
First exon has AD.
-----
Q9H4W6 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,10,129837928,129837960,ENSG00000108001,ENST00000355311,-,32,False
1,10,129840244,129840442,ENSG00000108001,ENST00000355311,-,198,False
2,10,129840843,129841032,ENSG00000108001,ENST00000355311,-,189,False
3,10,129842115,129842293,ENSG00000108001,ENST00000355311,-,178,False
4,10,129843136,129843202,ENSG00000108001,ENST00000355311,-,66,False
5,10,129848391,129848480,ENSG00000108001,ENST00000355311,-,89,True
6,10,129867140,129867267,ENSG00000108001,ENST00000355311,-,127,True
7,10,129867781,129867912,ENSG00000108001,ENST00000355311,-,131,True
8,10,129873451,129873596,ENSG00000108001,ENST00000355311,-,145,False
9,10,129877767,129877849,ENSG00000108001,ENST00000355311,-,82,False


-----
Q9BXK1 is not a candidate.
First exon has AD.
-----
P11473 is not a candidate.
First exon has AD.
-----
P20393 is not a candidate.
Last exon has AD.
-----
P15884 is not a candidate.
Last exon has AD.
-----
Q5T1R4 is not a candidate.
Last exon has AD.
-----
P32242 is not a candidate.
Last exon has AD.
-----
Q6ZRS2 is not a candidate.
Last exon has AD.
-----
O95096 is not a candidate.
First exon has AD.
-----
Q15788 is not a candidate.
First exon has AD.
-----
O75840 is not a candidate.
Last exon has AD.
-----
Q13485 is not a candidate.
First exon has AD.
-----
Q8NBF1 is not a candidate.
First exon has AD.
-----
Q13422 is not a candidate.
Last exon has AD.
-----
Q9Y458 is not a candidate.
Last exon has AD.
-----
O94983 is not a candidate.
At least one exon range between AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,17,4968755,4968819,ENSG00000108509,ENST00000348066,-,64,False
1,17,4968906,4968981,ENSG00000108509,ENST00000348066,-,75,False
2,17,4969149,4969337,ENSG00000108509,ENST00000348066,-,188,False
3,17,4969499,4969520,ENSG00000108509,ENST00000348066,-,21,False
4,17,4969629,4969701,ENSG00000108509,ENST00000348066,-,72,False
5,17,4969901,4970085,ENSG00000108509,ENST00000348066,-,184,False
6,17,4970339,4970536,ENSG00000108509,ENST00000348066,-,197,False
7,17,4972231,4972536,ENSG00000108509,ENST00000348066,-,305,False
8,17,4972768,4972991,ENSG00000108509,ENST00000348066,-,223,False
9,17,4973174,4973253,ENSG00000108509,ENST00000348066,-,79,False


-----
Q9HBZ2 is not a candidate.
Last exon has AD.
-----
P35548 is not a candidate.
Last exon has AD.
-----
Q9UGL1 IS A CANDIDATE!
ENST00000367265


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,1,202729035,202729173,ENSG00000117139,ENST00000367265,-,138,False
1,1,202729706,202730027,ENSG00000117139,ENST00000367265,-,321,False
2,1,202730908,202731063,ENSG00000117139,ENST00000367265,-,155,True
3,1,202731827,202731939,ENSG00000117139,ENST00000367265,-,112,True
4,1,202733400,202733886,ENSG00000117139,ENST00000367265,-,486,True
5,1,202735428,202735587,ENSG00000117139,ENST00000367265,-,159,False
6,1,202736212,202736392,ENSG00000117139,ENST00000367265,-,180,False
7,1,202740673,202740812,ENSG00000117139,ENST00000367265,-,139,False
8,1,202741366,202741722,ENSG00000117139,ENST00000367265,-,356,False
9,1,202742390,202742505,ENSG00000117139,ENST00000367265,-,115,False


-----
P43354 is not a candidate.
First exon has AD.
-----
Q06413 is not a candidate.
First exon has AD.
-----
Q06889 is not a candidate.
First exon has AD.
-----
Q9UL68 is not a candidate.
At least one exon range between AD exons not divisible by 3.
At least one range of AD exons not divisible by 3.


Unnamed: 0,0,1,2,3,4,5,exon_len,has_AD
0,2,1791866,1792007,ENSG00000186487,ENST00000428368,-,141,False
1,2,1792320,1792464,ENSG00000186487,ENST00000428368,-,144,False
2,2,1801695,1801799,ENSG00000186487,ENST00000428368,-,104,False
3,2,1809075,1809167,ENSG00000186487,ENST00000428368,-,92,False
4,2,1839148,1839370,ENSG00000186487,ENST00000428368,-,222,False
5,2,1840759,1840843,ENSG00000186487,ENST00000428368,-,84,False
6,2,1851640,1851703,ENSG00000186487,ENST00000428368,-,63,False
7,2,1886538,1886607,ENSG00000186487,ENST00000428368,-,69,False
8,2,1887487,1887609,ENSG00000186487,ENST00000428368,-,122,False
9,2,1889240,1889477,ENSG00000186487,ENST00000428368,-,237,False


-----


In [38]:
result_status = pd.DataFrame({"uniprotID" : uniprotIDs,
             "candidate" : results})
result_status

Unnamed: 0,uniprotID,candidate
0,Q14938,False
1,O14770,False
2,P10827,False
3,P11308,False
4,Q92731,False
5,P35398,False
6,Q6N021,False
7,Q14872,False
8,Q02548,False
9,Q86V15,False


In [39]:
potential_candidates = result_status[result_status["candidate"]]
potential_candidates

Unnamed: 0,uniprotID,candidate
40,Q9UGL1,True


In [40]:
SFARI_TFs = pd.read_csv("../data/SFARI_TFs_with_ENST.csv")
pd.merge(SFARI_TFs[["gene-symbol", "uniprotID"]], potential_candidates, on = "uniprotID")

Unnamed: 0,gene-symbol,uniprotID,candidate
0,KDM5B,Q9UGL1,True


In [41]:
# KDM5B - all variant, AD vs DBD

In [43]:
Q9UGL1_domains = pd.read_csv("../soto_analysis/outputs/mutations/domains_bed_format/Q9UGL1", sep = "\t", header = None)
Q9UGL1_ADs = Q9UGL1_domains[Q9UGL1_domains[3] == "AD"]
Q9UGL1_ADs

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,202730911,202731063,AD,ENSG00000117139,.,-,ENST00000367265
1,1,202731827,202731939,AD,ENSG00000117139,.,-,ENST00000367265
2,1,202733400,202733406,AD,ENSG00000117139,.,-,ENST00000367265


In [None]:
# In exons 25, 24, and 23

In [44]:
# MEIS2 instead?

In [46]:
37101311 - 36889204

212107