In [62]:
import re
import pandas as pd
from Bio import SeqIO
import hashlib

In [63]:
class SimpleCluster:
    """Wrapper around a sequence of strings whose hash is derived from MD5.

    Attributes:
        x: The sequence of strings supplied at construction. It is stored
           as-is (no copy is made).
    """

    def __init__(self, x):
        self.x = x

    def __hash__(self):
        # Members are concatenated with no separator before being digested.
        joined = "".join(self.x)
        digest_hex = hashlib.md5(joined.encode()).hexdigest()
        # The hex digest is read as one base-16 integer; the built-in hash()
        # then reduces it to the interpreter's hash width.
        return int(digest_hex, 16)


def get_hash(plist):
    """Return the built-in ``hash()`` of ``plist`` wrapped in a SimpleCluster.

    Args:
        plist: Sequence of strings; passed unchanged to ``SimpleCluster``.

    Returns:
        int: ``hash(SimpleCluster(plist))``.
    """
    return hash(SimpleCluster(plist))

In [66]:
# Containers filled while reading the cluster CSV:
# `clusters` keeps every cluster in file order; `cluster_dict` indexes them by hash.
clusters = []
cluster_dict = {}

In [67]:
# Start from empty containers inside this cell so that re-running it does not
# append every cluster to `clusters` a second time.
clusters = []
cluster_dict = {}
with open("Pdam.recipe_clusters.csv", "r") as cluster_fi:
    for line in cluster_fi:
        # One cluster per line: comma-separated member ids.
        c = line.strip().split(",")
        clusters.append(c)
        # Keyed by get_hash(c) so a cluster can be looked up by its hash.
        cluster_dict[get_hash(c)] = c

In [13]:
# Number of clusters read from the CSV (one per line).
len(clusters)

582

In [15]:
# Directory holding the BLAST table and the peptide FASTA read below.
# NOTE(review): absolute, user-specific path; the notebook only runs where this
# location is mounted. Consider making it configurable.
loc_map_dir = "/afs/csail.mit.edu/u/s/samsl/Work/databases/coral/Pdam/pdam_loc_mapping"

In [70]:
# Read the headerless, tab-separated BLAST table (columns are labelled by
# position), order rows by column 0, keep one row per (column 0, column 1)
# pair, and index the frame by column 0.
blast_results = (
    pd.read_csv(f"{loc_map_dir}/blast_results.tsv", sep="\t", header=None)
    .sort_values(0)
    .drop_duplicates([0, 1])
    .set_index(0)
)

In [123]:
# Keep only rows whose column 10 is below 1e-5.
# NOTE(review): column 10 is presumably the BLAST e-value; confirm the table layout.
blast_results = blast_results.loc[blast_results[10] < 1e-5]

In [124]:
# Number of rows left after the column-10 threshold.
len(blast_results)

21894

In [125]:
# Preview the first rows of the filtered table.
blast_results.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
pdam_00000001-RA,XP_027056588.1,100.0,360,0,0,1,360,31,390,0.0,737.0
pdam_00000002-RA,XP_027056458.1,93.44,747,14,2,1,712,13,759,0.0,1417.0
pdam_00000003-RA,XP_027056664.1,100.0,333,0,0,1,333,1,333,0.0,701.0
pdam_00000005-RA,XP_027056460.1,71.429,63,18,0,1,63,170,232,2.5500000000000003e-27,102.0
pdam_00000006-RA,XP_027056665.1,88.066,243,29,0,1,243,1,243,2.1200000000000003e-159,446.0


In [126]:
# Load every record of the peptide FASTA into a dict keyed by record id.
# The record descriptions are parsed further down to pull out gene ids.
pep_fasta = SeqIO.to_dict(
    SeqIO.parse(
        f"{loc_map_dir}/Pocillopora_damicornis_gca003704095v1.ASM370409v1.pep.all.fa",
        "fasta",
    )
)

In [127]:
# Captures the "LOC<digits>" token that follows "gene:" and precedes a space.
REGEXP = ".*gene:(LOC\\d+) .*"
# Compiled once; reused for every FASTA description.
regcomp = re.compile(REGEXP)

In [128]:
# Map each peptide record id to the LOC gene id found in its FASTA description.
xp_loc_map = {}
for k, v in pep_fasta.items():
    # search() returns None when the description carries no "gene:LOC<digits> "
    # token. Test for that explicitly rather than catching AttributeError, which
    # would also hide unrelated attribute mistakes inside the loop.
    hit = regcomp.search(v.description)
    if hit is None:
        continue
    locid = hit.group(1)
    xp_loc_map[k] = locid

In [129]:
# Number of peptide records for which a LOC id was extracted.
len(xp_loc_map)

25170

In [130]:
# Spot check: column 1 of the row indexed by this query id.
blast_results.loc["pdam_00000005-RA", 1]

'XP_027056460.1'

In [131]:
def cluster_name_map(c, blast_map, loc_map):
    """Translate each id in a cluster to a gene id via a BLAST table and a dict.

    Args:
        c: Iterable of query ids (index labels of ``blast_map``).
        blast_map: DataFrame indexed by query id whose column ``1`` holds the
            id of the matched record.
        loc_map: Dict from matched record id to gene id.

    Returns:
        list: One entry per element of ``c``, in order. An id with no row in
        ``blast_map``, or whose matches are all absent from ``loc_map``, yields
        the string ``"Failed to Map"``.
    """
    mapped_names = []
    for pdam_id in c:
        try:
            # Use the tables passed in, not same-named notebook globals.
            hits = blast_map.loc[pdam_id, 1]
        except (KeyError, IndexError):
            mapped_names.append("Failed to Map")
            continue
        # A query id with several rows comes back as a Series, which cannot be
        # used as a dict key; normalise both cases to a list of candidates.
        candidates = hits.tolist() if isinstance(hits, pd.Series) else [hits]
        # Take the first candidate, in table order, that the dict knows about.
        locid = next(
            (loc_map[xpid] for xpid in candidates if xpid in loc_map),
            "Failed to Map",
        )
        mapped_names.append(locid)
    return mapped_names

In [132]:
# Hash of the cluster to inspect; must be a key of cluster_dict (as produced by get_hash).
chash = 1332063120138743063
# Alternative cluster hash, currently disabled.
# chash = 143188377126489644
selected_cluster = cluster_dict[chash]

In [133]:
# Show the member ids of the selected cluster.
selected_cluster

['pdam_00002129-RA',
 'pdam_00001718-RA',
 'pdam_00014456-RA',
 'pdam_00009498-RA',
 'pdam_00009962-RA',
 'pdam_00018634-RA',
 'pdam_00008012-RA',
 'pdam_00019148-RA',
 'pdam_00013926-RA',
 'pdam_00005755-RA',
 'pdam_00007804-RA',
 'pdam_00019311-RA',
 'pdam_00005793-RA',
 'pdam_00013319-RA']

In [134]:
# Number of members in the selected cluster.
len(selected_cluster)

14

In [135]:
# Print the mapped gene id for each member of the selected cluster, one per line
# and in cluster order. The loop index was unused, so enumerate() is dropped.
for locid in cluster_name_map(selected_cluster, blast_results, xp_loc_map):
    print(locid)

LOC113669986
LOC113664316
LOC113668089
LOC113682998
LOC113680209
LOC113665596
LOC113668798
LOC113673347
LOC113666247
LOC113684269
LOC113676398
LOC113670542
LOC113684250
LOC113665980
