In [116]:
data_dir = "/home/cadel/data/deduce/Bej04/synteny"

In [138]:
import os
import pandas as pd
from functools import reduce
import numpy as np

def load_uces():
    beds = []
    genome_names = []
    for f in os.listdir(data_dir):
        if f.endswith(".bed"):
            path = os.path.join(data_dir, f)
            name = f.replace(".bed", "")
            genome_names.append(name)
            df = pd.read_csv(path, sep='\t', names=["contig", "start", "end", "uce_id", "identity"])
            df["genome"] = np.repeat(name, len(df))
            beds.append(df)
    
    return reduce(lambda acc, new: acc.append(new), beds), genome_names
    
uces, genome_names = load_uces()
uces['id'] = uces.index
uces = uces.reset_index()

In [131]:
from typing import NamedTuple, Dict, Tuple, List
from collections import defaultdict

combined_uces = defaultdict(lambda: defaultdict(list))

for u in uces.to_dict('records'):
    combined_uces[u["id"]][u["genome"]].append((u["contig"], u["start"]))

In [86]:
import networkx as nx

def filter_appearances(apps, contig_filter):
    return {g: [p[1] for p in apps[g] if p[0] == ctg] for g, ctg in contig_filter if len([p[1] for p in apps[g] if p[0] == ctg]) != 0}



In [145]:
import itertools
from collections import defaultdict
combined_uces
all_contigs = set([(genome_name, contig[0]) for genome_apps in combined_uces.values() for genome_name, contigs in genome_apps.items() for contig in contigs])

contigs_by_genome = []
for k, g in itertools.groupby(sorted(all_contigs, key=lambda x: x[0]), key=lambda x: x[0]):
    contigs_by_genome.append(list(g))
    
contigs_by_genome

[[('bej04_200bp_100_minimap_7f14d17-hg16', 'chr18'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr17'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr2'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr4'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr8'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chrX'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr19'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr13'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr15'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr10'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr14'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr12'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr11'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr5'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr16'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr20'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr1'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr3'),
  ('bej04_200bp_100_minimap_7f14d17-hg16', 'chr22'),


In [180]:
from typing import NamedTuple, Tuple, Any, List

genome_instances = []
for g, u in itertools.groupby(sorted(uces.to_dict('records'), key=lambda x: x["genome"]), key=lambda x: x["genome"]):
    genome_instances.append(list(u))

Node = NamedTuple("Node", [("instances", Tuple[Any]), ("contigs", List[str])])

possible_nodes = [Node(n, [x["contig"] for x in n]) for n in itertools.product(*genome_instances)]

# list(itertools.product(*genome_instances))[:5]
# sorted(itertools.product(*genome_instances), key=lambda x: [y["contig"] for y in x])
    

In [184]:
sorted_possible_nodes = list(sorted(possible_nodes, key=lambda x: x.contigs))

KeyboardInterrupt: 

In [166]:
def find_syntenic_blocks_fast(uce_df, min_genomes = 3):
    
    uce_graph = nx.DiGraph()

    for instance_group in itertools.product(unassigned_uces.items(), unassigned_uces.items()):
        id_1, unfiltered_appearances_1 = u1
        id_2, unfiltered_appearances_2 = u2

        if id_1 == id_2:
            continue

        appearances_1 = filter_appearances(unfiltered_appearances_1, contig_filter)
        appearances_2 = filter_appearances(unfiltered_appearances_2, contig_filter)

        if len(appearances_1) < min_genomes or len(appearances_2) < min_genomes:
            continue

        uce_graph.add_node(id_1)
        uce_graph.add_node(id_2)

        relationships = set()

        # Compare appearances
        # Add edge if at least threshold relationships

        #existing_relationships = uce_graph.get_edge_data(id_1, id_2, default=set())
#combined_uces

s = 0
min_genomes = 3
for contig_filter in list(itertools.product(*contigs_by_genome))[:5]:
    #print(contig_filter)
    relevant_contigs = {g:c for g,c in contig_filter}
    relevant_instances = uces[uces["genome"].map(relevant_contigs).eq(uces["contig"])]
    s += len(relevant_instances)
    genome_instances = []
    for g, u in itertools.groupby(sorted(relevant_instances.to_dict('records'), key=lambda x: x["genome"]), key=lambda x: x["genome"]):
        genome_instances.append(list(u))

    possible_nodes = sorted(itertools.product(*genome_instances), key=lambda x: [y["contig"] for y in x])
    
    uce_graph = nx.DiGraph()
    
        
    
print(len(uces))
print(s)

rc = {'bej04_200bp_100_minimap_7f14d17-hg16': 'chr1', 'bej04_200bp_100_minimap_7f14d17-mm3': 'chr1', 'bej04_200bp_100_minimap_7f14d17-rn3': 'chr14'}
uces[uces["genome"].map(rc).eq(uces["contig"])]

[({'index': 1, 'contig': 'chr18', 'start': 28605195, 'end': 28605434, 'uce_id': 1, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-hg16', 'id': 1}, {'index': 4, 'contig': 'chr11', 'start': 88826144, 'end': 88826433, 'uce_id': 4, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-mm3', 'id': 4}, {'index': 2, 'contig': 'chr14', 'start': 63603227, 'end': 63603522, 'uce_id': 2, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-rn3', 'id': 2}), ({'index': 1, 'contig': 'chr18', 'start': 28605195, 'end': 28605434, 'uce_id': 1, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-hg16', 'id': 1}, {'index': 4, 'contig': 'chr11', 'start': 88826144, 'end': 88826433, 'uce_id': 4, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-mm3', 'id': 4}, {'index': 11, 'contig': 'chr14', 'start': 105107089, 'end': 105107415, 'uce_id': 11, 'identity': 100.0, 'genome': 'bej04_200bp_100_minimap_7f14d17-rn3', 'id': 11}), ({'index': 1, 'contig': 'chr18', '

Unnamed: 0,index,contig,start,end,uce_id,identity,genome,id
103,103,chr1,42406758,42406975,103,100.0,bej04_200bp_100_minimap_7f14d17-mm3,103
144,144,chr1,179011895,179012098,144,100.0,bej04_200bp_100_minimap_7f14d17-mm3,144
156,156,chr1,6770436,6770904,156,100.0,bej04_200bp_100_minimap_7f14d17-mm3,156
192,192,chr1,169639290,169639646,192,100.0,bej04_200bp_100_minimap_7f14d17-mm3,192
223,223,chr1,169002251,169002498,223,100.0,bej04_200bp_100_minimap_7f14d17-mm3,223
...,...,...,...,...,...,...,...,...
1319,330,chr14,105438064,105438284,327,100.0,bej04_200bp_100_minimap_7f14d17-rn3,330
1324,335,chr14,78026475,78026813,332,100.0,bej04_200bp_100_minimap_7f14d17-rn3,335
1352,363,chr14,105859017,105859219,359,100.0,bej04_200bp_100_minimap_7f14d17-rn3,363
1387,398,chr14,106814887,106815119,394,100.0,bej04_200bp_100_minimap_7f14d17-rn3,398


In [126]:
import math

def find_syntenic_blocks(uces, min_genomes = 3):
    unassigned_uces = uces.copy()
    
    its = len(list(itertools.product(*contigs_by_genome)))
    step = its // 10
    
    i = 0
    for contig_filter in itertools.product(*contigs_by_genome):
        if i % step == 0:
            print(f"{i}/{its}")
        i += 1
        uce_graph = nx.DiGraph()

        for u1, u2 in itertools.product(unassigned_uces.items(), unassigned_uces.items()):
            id_1, unfiltered_appearances_1 = u1
            id_2, unfiltered_appearances_2 = u2

            if id_1 == id_2:
                continue

            appearances_1 = filter_appearances(unfiltered_appearances_1, contig_filter)
            appearances_2 = filter_appearances(unfiltered_appearances_2, contig_filter)

            if len(appearances_1) < min_genomes or len(appearances_2) < min_genomes:
                continue

            uce_graph.add_node(id_1)
            uce_graph.add_node(id_2)

            relationships = set()

            # Compare appearances
            # Add edge if at least threshold relationships

            #existing_relationships = uce_graph.get_edge_data(id_1, id_2, default=set())

%time find_syntenic_blocks(combined_uces)

0/10580


KeyboardInterrupt: 

In [50]:
combined_uces[0]
filter_appearances(combined_uces[0], (("bej04_200bp_100_minimap_7f14d17-mm3", "chr3"),("bej04_200bp_100_minimap_7f14d17-hg16", "chr9"),("bej04_200bp_100_minimap_7f14d17-rn3", "chr3")))

{'bej04_200bp_100_minimap_7f14d17-hg16': [123897915],
 'bej04_200bp_100_minimap_7f14d17-rn3': [13289743]}

In [130]:
from typing import List, Any

def contigs_in_order(p, n, genome, forward_strand):
    return set(y for x, y in itertools.product(p.get(genome, []), n.get(genome, [])) if x[0] == y[0] and (x[1] > y[1] if forward_strand else y[1] > x[1]))
    
def is_valid_extension(path, next_apps, forward_strand):
    # Run through the path in reverse to hopefully exit ASAP
    valid_contigs = next_apps.copy()
    
    for _, apps in reversed(path):
        for genome in apps:
            ok_ctgs = contigs_in_order(valid_contigs, apps, genome, forward_strand)
            #print(valid_contigs, apps, genome, forward_strand, ok_ctgs)
            valid_contigs[genome] = ok_ctgs
        
        valid_genomes = 0
        for genome in apps:
            if len(valid_contigs[genome]) > 0:
                valid_genomes += 1
        
        if valid_genomes < 3:
            return False
    
    return True
            

def dfs_extend(path: List[Any], forward_strand) -> List[Any]:
    print(".", end="")
    
    last_id, last_apps = path[-1]
    new_paths = []
    
    for next_id, next_apps in combined_uces.items():
        if next_id == last_id:
            continue
        
        # Compare appearances
        if is_valid_extension(path, next_apps, forward_strand):
            new_paths.extend(dfs_extend(path + [(next_id, next_apps)], forward_strand)) 
    
    return new_paths
        

def dfs_block(root_id):
    viable_blocks = dfs_extend([(root_id, combined_uces[root_id])], True) # + dfs_extend([(root_id, combined_uces[root_id])], True)
    return viable_blocks

%time dfs_block(33)
#combined_uces[0]


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

KeyboardInterrupt: 

In [121]:
contigs_in_order({'bej04_200bp_100_minimap_7f14d17-mm3': set(), 'bej04_200bp_100_minimap_7f14d17-hg16': [('chr18', 28605195)], 'bej04_200bp_100_minimap_7f14d17-rn3': [('chr18', 13310057)]},
                 {'bej04_200bp_100_minimap_7f14d17-mm3': [('chr2', 34739150)], 'bej04_200bp_100_minimap_7f14d17-hg16': [('chr18', 123897915)], 'bej04_200bp_100_minimap_7f14d17-rn3': [('chr3', 13289743)]},
                "bej04_200bp_100_minimap_7f14d17-hg16", False)

print(combined_uces[33])
print(combined_uces[34])


defaultdict(<class 'list'>, {'bej04_200bp_100_minimap_7f14d17-mm3': [('chr2', 58428541)], 'bej04_200bp_100_minimap_7f14d17-hg16': [('chr2', 157862654)], 'bej04_200bp_100_minimap_7f14d17-rn3': [('chr3', 39248658)]})
defaultdict(<class 'list'>, {'bej04_200bp_100_minimap_7f14d17-mm3': [('chr2', 62475852)], 'bej04_200bp_100_minimap_7f14d17-hg16': [('chr2', 162297585)], 'bej04_200bp_100_minimap_7f14d17-rn3': [('chr3', 43374007)]})
