In [2]:
# Directory that load_uces() / load_anchors() scan for "*.bed" files.
# NOTE(review): this is a machine-specific absolute path; it must be edited
# before the notebook can run anywhere else.
data_dir = "/home/cadel/projects/thesis/UNSWThesis/data/synteny"

# Load anchors from BED files

In [14]:
import os
import csv
from collections import namedtuple

# One row of a BED file, as filled in by load_anchors():
#   anchor_id - 0-based row index within its BED file (not unique across files)
#   genome    - BED file name with ".bed" removed
#   contig    - column 0, kept as a string
#   start/end - columns 1 and 2, parsed as int
#   uce_id    - column 3, kept as a string
#   identity  - column 4, parsed as float
Anchor = namedtuple('Anchor', 'anchor_id genome contig start end uce_id identity')

def load_uces():
    """Load every ``*.bed`` file in ``data_dir`` into per-genome anchor lists.

    Each file is read as tab-separated text; every row becomes an ``Anchor``
    whose ``anchor_id`` is the row's 0-based index within that file and whose
    ``genome`` is the file name with ".bed" removed (the same naming that
    ``load_anchors`` uses).

    Returns:
        dict mapping genome name -> list of ``Anchor`` records, in file order.

    Raises:
        IndexError / ValueError: if a row has fewer than five columns or a
            non-numeric start, end or identity column.
    """
    uces_by_genome = {}
    for fname in os.listdir(data_dir):
        if not fname.endswith(".bed"):
            continue

        path = os.path.join(data_dir, fname)
        name = fname.replace(".bed", "")

        with open(path, "r") as f:
            # Anchor has seven fields; the genome name must be passed
            # explicitly because it is not a column of the BED file.
            records = [
                Anchor(i, name, line[0], int(line[1]), int(line[2]), line[3], float(line[4]))
                for i, line in enumerate(csv.reader(f, delimiter="\t"))
            ]

        uces_by_genome[name] = records
    return uces_by_genome

def load_anchors():
    """Read all ``*.bed`` files in ``data_dir`` into one flat anchor list.

    Returns:
        (anchors, genome_names) where ``anchors`` holds one ``Anchor`` per BED
        row (files concatenated in ``os.listdir`` order) and ``genome_names``
        lists the file names with ".bed" removed, in that same order.
    """
    all_anchors = []
    seen_genomes = []

    bed_files = [entry for entry in os.listdir(data_dir) if entry.endswith(".bed")]
    for bed_file in bed_files:
        genome = bed_file.replace(".bed", "")
        bed_path = os.path.join(data_dir, bed_file)

        with open(bed_path, "r") as handle:
            rows = csv.reader(handle, delimiter="\t")
            # anchor_id restarts at 0 for every file: it is the row index.
            file_anchors = []
            for row_index, row in enumerate(rows):
                file_anchors.append(
                    Anchor(row_index, genome, row[0], int(row[1]), int(row[2]), row[3], float(row[4]))
                )

        all_anchors += file_anchors
        seen_genomes.append(genome)

    return all_anchors, seen_genomes

# Flat list of every anchor plus the genome order; later cells index
# per-genome lists and contig filters by position in genome_names.
anchors, genome_names = load_anchors()

In [31]:
# Group anchors by UCE ID, then emit one UCEAnchor per combination of contigs (one contig per genome)

import itertools

# One UCE restricted to a single choice of contig per genome:
#   uce_id        - the shared UCE identifier
#   anchors       - the UCE's anchors that lie on the chosen contigs
#   contig_filter - tuple with one contig per genome, in genome_names order
#                   (None for a genome in which the UCE has no anchor)
UCEAnchor = namedtuple("UCEAnchor", "uce_id anchors contig_filter")

def apply_contigs_filter(uce_anchors, contigs_filter, genome_names):
    """Keep only the anchors that sit on the contig selected for their genome.

    ``contigs_filter`` and ``genome_names`` are parallel sequences: entry *i*
    of the filter is the contig allowed for genome *i*.

    Returns:
        A new list with the surviving anchors, in their original order.

    Raises:
        KeyError: if an anchor's genome is not covered by the pairing.
    """
    allowed_contig = dict(zip(genome_names, contigs_filter))

    kept = []
    for anchor in uce_anchors:
        if allowed_contig[anchor.genome] == anchor.contig:
            kept.append(anchor)
    return kept

def _by_uce_id(anchor):
    """Sort/group key: the anchor's UCE identifier."""
    return anchor.uce_id


def _by_genome_rank(anchor):
    """Sort/group key: (position of the genome in genome_names, genome name)."""
    return (genome_names.index(anchor.genome), anchor.genome)


uces = []
for uce_id, same_uce in itertools.groupby(sorted(anchors, key=_by_uce_id), key=_by_uce_id):
    uce_anchors = list(same_uce)

    # One set of candidate contigs per genome; {None} stands for a genome in
    # which this UCE has no anchor, so the product below still yields a tuple
    # with one slot per genome.
    relevant_contigs = [{None} for _ in genome_names]
    same_genome_groups = itertools.groupby(sorted(uce_anchors, key=_by_genome_rank), key=_by_genome_rank)
    for (genome_i, _genome_name), same_genome in same_genome_groups:
        relevant_contigs[genome_i] = {anchor.contig for anchor in same_genome}

    # Every way of picking one contig per genome becomes its own UCEAnchor.
    uces.extend(
        UCEAnchor(uce_id, apply_contigs_filter(uce_anchors, contig_filter, genome_names), contig_filter)
        for contig_filter in itertools.product(*relevant_contigs)
    )

uces[:5]

[UCEAnchor(uce_id='0', anchors=[Anchor(anchor_id=0, genome='ste08_5_of_5_1c77a9f-mm8', contig='chr12', start=106530308, end=106530526, uce_id='0', identity=100.0), Anchor(anchor_id=0, genome='ste08_5_of_5_1c77a9f-canFam2', contig='chr8', start=68656375, end=68656593, uce_id='0', identity=100.0), Anchor(anchor_id=0, genome='ste08_5_of_5_1c77a9f-bosTau2', contig='chr5', start=75219010, end=75219228, uce_id='0', identity=100.0), Anchor(anchor_id=0, genome='ste08_5_of_5_1c77a9f-rn4', contig='chr6', start=130304090, end=130304308, uce_id='0', identity=100.0), Anchor(anchor_id=0, genome='ste08_5_of_5_1c77a9f-hg18', contig='chr14', start=96501120, end=96501338, uce_id='0', identity=100.0)], contig_filter=('chr12', 'chr8', 'chr5', 'chr6', 'chr14')),
 UCEAnchor(uce_id='1', anchors=[Anchor(anchor_id=1, genome='ste08_5_of_5_1c77a9f-mm8', contig='chr6', start=15152718, end=15152843, uce_id='1', identity=100.0), Anchor(anchor_id=1, genome='ste08_5_of_5_1c77a9f-canFam2', contig='chr14', start=565815

In [145]:
import networkx as nx
import numpy as np
len(uces)

def anchors_to_tuple(uce, genome_names):
    """Collapse a UCE's anchors into one start position per genome.

    Returns:
        ``(uce.uce_id, positions)`` where ``positions[i]`` is the ``start`` of
        the first anchor (in ``uce.anchors`` order) belonging to
        ``genome_names[i]``, or ``None`` when that genome has no anchor.

    Raises:
        ValueError: if an anchor's genome is not in ``genome_names``.
    """
    # Remember only the first start seen for each genome.
    first_start = {}
    for anchor in uce.anchors:
        first_start.setdefault(anchor.genome, anchor.start)

    positions = [None] * len(genome_names)
    for genome in sorted(first_start):
        slot = genome_names.index(genome)
        positions[slot] = first_start[genome]
    return uce.uce_id, positions

def reduce_until_acyclic(synteny_map, max_genomes):
    """Remove edges from ``synteny_map`` in place until it becomes a DAG.

    Only edges whose ``support`` attribute is below ``max_genomes`` are
    candidates for removal. After each removal the graph is tested for
    acyclicity and returned as soon as the test passes.

    Args:
        synteny_map: directed graph whose edges carry a ``support`` attribute.
            It is modified in place.
        max_genomes: edges with ``support >= max_genomes`` are never removed.

    Returns:
        The same ``synteny_map`` object, now acyclic.

    Raises:
        Exception: ("Could not reduce any more") when the graph has no edges,
            or when every removable edge has been deleted and the graph is
            still cyclic.
    """
    # Snapshot the candidates before mutating the graph, ordered by ascending
    # support.
    removable = sorted(
        (edge for edge in synteny_map.edges(data=True) if edge[2].get("support") < max_genomes),
        key=lambda edge: edge[2].get("support"),
    )
    #       key=lambda x: (x[2].get("support"), -x[2].get("average_distance"))))

    # Stop as soon as the candidates run out. Previously the loop carried on
    # with an undefined or already-removed edge and failed with an unrelated
    # error instead of the exception below.
    while synteny_map.number_of_edges() > 0 and removable:
        # NOTE(review): pop() takes from the END of the ascending list, so the
        # highest-support candidate is removed first. Kept as-is to preserve
        # results; confirm this is intended rather than lowest-support-first.
        source, target, _attrs = removable.pop()
        synteny_map.remove_edge(source, target)

        if nx.algorithms.dag.is_directed_acyclic_graph(synteny_map):
            return synteny_map

    raise Exception("Could not reduce any more")

def find_paths_dfs(g_unred, max_genomes):
    """Enumerate every source-to-sink path of the transitively reduced graph.

    If ``g_unred`` is not acyclic it is first passed to
    ``reduce_until_acyclic`` (which removes edges from it in place). The
    result is then transitively reduced and searched depth-first.

    Args:
        g_unred: directed graph to search.
        max_genomes: forwarded to ``reduce_until_acyclic``.

    Returns:
        List of paths, each a list of nodes running from a node with no
        incoming edges to a node with no outgoing edges. An isolated node
        yields the single-node path ``[node]``.
    """
    if not nx.algorithms.dag.is_directed_acyclic_graph(g_unred):
        g_unred = reduce_until_acyclic(g_unred, max_genomes)
    g = nx.algorithms.transitive_reduction(g_unred)

    # node -> all paths from that node to a sink. A node reachable through
    # several predecessors is expanded once instead of once per predecessor.
    paths_from = {}

    def dfs(node):
        known = paths_from.get(node)
        if known is None:
            if g.out_degree(node) == 0:
                # Leaf
                known = [[node]]
            else:
                # `[node] + path` builds a new list, so cached suffixes are
                # never shared with (or mutated through) a returned path.
                known = [[node] + path for neighbor in g.neighbors(node) for path in dfs(neighbor)]
            paths_from[node] = known
        return known

    paths = []
    for source, in_degree in g.in_degree():
        if in_degree == 0:
            paths.extend(dfs(source))
    return paths


def build_synteny_map(uces, genome_names, min_genomes=5, min_length=5):
    """Build ordered UCE blocks for every contig combination.

    UCEs are grouped by ``contig_filter``. Within a group each UCE becomes a
    graph node, and a directed edge ``a -> b`` is added when ``b`` starts
    after ``a`` in at least ``min_genomes`` genomes. Each edge stores that
    count as ``support`` and the mean positional gap over the supporting
    genomes as ``average_distance``. The source-to-sink paths of each group's
    graph (via ``find_paths_dfs``) with at least ``min_length`` nodes are
    collected.

    Args:
        uces: iterable of objects with ``uce_id``, ``anchors`` and
            ``contig_filter`` attributes.
        genome_names: genome order used to index per-genome positions.
        min_genomes: minimum number of genomes that must agree on the order of
            two UCEs for an edge to be created.
        min_length: minimum number of UCEs in a reported block.

    Returns:
        List of blocks, each a list of UCE ids.

    NOTE(review): positions are ``None`` for genomes in which a UCE has no
    anchor, and contig filters may contain ``None``. The sorts and the
    ``n > p`` comparison below would raise ``TypeError`` on such values under
    Python 3 - confirm the input always covers every genome.
    """
    blocks = []
    uces_by_contigs = sorted(uces, key=lambda x: x.contig_filter)

    for contig_filter, uce_group_i in itertools.groupby(uces_by_contigs, key=lambda x: x.contig_filter):
        g = nx.DiGraph()

        nodes = [anchors_to_tuple(uce, genome_names) for uce in uce_group_i]

        # Only the first (n - min_genomes + 1) genomes are used as reference
        # orderings. Presumably this suffices because any pair supported by
        # min_genomes genomes is supported by at least one of them.
        for ref_i in range(len(genome_names) - min_genomes + 1):
            sorted_nodes = sorted(nodes, key=lambda x: x[1][ref_i])

            # combinations() yields each (earlier, later) pair in the same
            # order as the nested index loops it replaces.
            for (earlier_id, earlier_pos), (later_id, later_pos) in itertools.combinations(sorted_nodes, 2):
                gaps = [n - p for n, p in zip(later_pos, earlier_pos) if n > p]
                relationship_support = len(gaps)

                if relationship_support >= min_genomes:
                    # The mean is taken only for pairs that become edges; it
                    # used to run on every pair, including empty gap lists.
                    # add_edge also creates both endpoint nodes.
                    g.add_edge(
                        earlier_id,
                        later_id,
                        support=relationship_support,
                        average_distance=np.mean(gaps),
                    )

        blocks.extend(p for p in find_paths_dfs(g, len(genome_names)) if len(p) >= min_length)

    return blocks

# Show the genome order; per-genome positions and contig filters follow it.
print(genome_names)
# min_genomes=3 overrides the default of 5: two UCEs are linked when at least
# three genomes agree on their relative order.
m = build_synteny_map(uces, genome_names, min_genomes=3)

['ste08_5_of_5_1c77a9f-mm8', 'ste08_5_of_5_1c77a9f-canFam2', 'ste08_5_of_5_1c77a9f-bosTau2', 'ste08_5_of_5_1c77a9f-rn4', 'ste08_5_of_5_1c77a9f-hg18']


In [142]:
def minimise_blocks(blocks):
    """Greedily pick non-overlapping blocks, longest first.

    Blocks are visited in order of decreasing length (ties keep their input
    order). A block is kept only if none of its UCE ids appears in a block
    that was already kept.

    Returns:
        List of the kept blocks, in the order they were accepted.
    """
    claimed_ids = set()
    kept = []

    longest_first = sorted(blocks, key=len, reverse=True)
    for candidate in longest_first:
        if claimed_ids.isdisjoint(candidate):
            kept.append(candidate)
            claimed_ids.update(candidate)
    return kept
# Compute the non-overlapping selection once and reuse it for both the count
# and the listing.
minimal_blocks = minimise_blocks(m)

print(len(m))
print(len(minimal_blocks))

# One line per block: its UCE ids in path order, comma-separated.
for x in minimal_blocks:
    print(",".join(x))

4176
129
302,2135,1805,194,2791,1947,1797,1183,2330,974,2729,1196,1360,37,650,2016,195,382,436,281,1340,2385,215,2224,1237,2633,1057,326,1428,2435,1392,2802,2486,2285,1076,406,1071,2715,617,2667,132,2744,519,940,1129,216,2341,1929,1091,1378,1182,2098,1565,1435,600,2420,118,1351,1037,1283,2743,1644,580,2602,2147,2216,1789,2193,2089,1819,473,2645,2781,147,1948,1430,359,1112,1029,65
350,1104,991,1020,503,674,1895,1613,2008,2179,466,1589,81,939,2415,2125,2644,871,1403,992,780,1348,212,76,1230,1872,1177,487,840,751,640,556,2039,622,2164,1860,689,213,1655,89,1320,817,2069,1099,1056,2821,644,1676,1473,2105,1342,848,2686,58,335,2022,730,1784,847,2499,71,125,628,1434,1546,830,1471,1528,2421,966,2365,1991,1766,1414,1255
927,4,1977,2526,1442,1338,1846,75,988,1679,632,1950,449,2104,621,1446,1794,2613,1235,1770,1329,1374,94,2327,2346,2680,60,994,303,2636,1356,1569,750,828,2087,1544,517,1493,1835,186,656,1169,928,592,383,2211,552,1968,1264,32,2606,2472,1273,1103,1808,776,742,1697,842,577,474,2252,15