In [1]:
import sys
sys.path.append('..')
import edlib
import numpy as np
from collections import Counter, defaultdict
import operator
from string import ascii_uppercase


from lrd_parser import LRD_Report
from utils.bio import hamming_distance, identity_shift, OverlapAlignment, compress_homopolymer
import networkx as nx

import matplotlib
%matplotlib inline 
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from ndex2.nice_cx_network import NiceCXNetwork
import ndex2.client as nc
import ndex2

%load_ext autoreload
%autoreload 2

# Ref unique k-mers

In [2]:
# Manual CEN6 assembly: a tab-separated file whose last two columns are read
# below as the integer start and end of each unit.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
man_assembly_fn = "/Poppy/abzikadze/centroFlye/centroFlye_repo/data/D6Z1/CEN6_ManVERSION3.tsv"

units = []
with open(man_assembly_fn) as f:
    for line in f:
        line = line.strip().split('\t')
        st, en = int(line[-2]), int(line[-1])
        # NOTE(review): ends below 17 are shifted down by one; the reason for the
        # magic number 17 is not visible here — TODO confirm and document.
        if en < 17:
            en -= 1
        units.append((st, en))
        
def units2monomers(units):
    """Spell out a list of units as one string of uppercase letters.

    Args:
        units: iterable of ``(start, end)`` pairs. Both values are indices into
            ``string.ascii_uppercase`` and the end index is inclusive.

    Returns:
        The letters of every unit, concatenated in input order.
    """
    return ''.join(ascii_uppercase[first:last + 1] for first, last in units)

# Reference monomer string spelled out from the manual assembly units.
ref_monomers = units2monomers(units)

# k-mer length, counted in characters of the monomer string; reused by the cells below.
k = 200
# Every length-k window of the reference, in order of start position.
ref_kmers = [ref_monomers[i:i+k] for i in range(len(ref_monomers) - k + 1)]

# From here on ref_kmers maps each k-mer to its multiplicity in the reference.
ref_kmers = Counter(ref_kmers)
set_ref_kmers = set(ref_kmers.keys())
# k-mers that occur exactly once in the reference.
ref_unique_kmers = set(kmer for kmer, cnt in ref_kmers.items() if cnt == 1)

In [3]:
# Number of k-mers that occur exactly once in the reference.
len(ref_unique_kmers)

8802

In [4]:
def get_kmer_pos(string, k):
    """Index every length-k window of ``string`` by its start positions.

    Args:
        string: sequence to scan.
        k: window length.

    Returns:
        A ``defaultdict(list)`` mapping each k-mer to the ascending list of
        0-based offsets at which it starts. Empty when ``string`` is shorter
        than ``k``.
    """
    n_windows = len(string) - k + 1
    positions = defaultdict(list)
    for start in range(n_windows):
        window = string[start:start + k]
        positions[window].append(start)
    return positions

# Start positions of every reference k-mer, keyed by the k-mer itself.
ref_pos = get_kmer_pos(string=ref_monomers, k=k)

# Recruitment of unique k-mers

In [None]:
# Inputs for LRD_Report: a decomposition table and a monomer FASTA file.
# NOTE(review): absolute, machine-specific paths — consider a configurable data dir.
lrd_report_fn = '/Poppy/abzikadze/centroFlye/centroFlye_repo/experiments/20191023/lrd_d6z1_rel3_Karen/decomposition.tsv'
monomers_fn = "/Poppy/abzikadze/tandem_flye/data/human/isolated_centromeres/extracted_HORs/CEN6/monomers/inferred_monomers_single.fa"

# The cells below use lrd_report.records, a mapping from read id to a record with a .string attribute.
lrd_report = LRD_Report(lrd_report_fn=lrd_report_fn, monomers_fn=monomers_fn)

In [None]:
def get_rare_mono_kmers(lrd_report, k, min_mult, max_mult):
    """Collect k-mers whose multiplicity across all reads lies in a given range.

    Every length-k window of each record's ``string`` is counted; windows that
    contain the character ``'='`` are ignored.

    Args:
        lrd_report: object whose ``records`` attribute maps a read id to a
            record exposing a ``string`` attribute.
        k: window length.
        min_mult: smallest multiplicity (inclusive) for a k-mer to be kept.
        max_mult: largest multiplicity (inclusive) for a k-mer to be kept.

    Returns:
        A pair ``(rare_kmers, reads2rare_kmers)``: ``rare_kmers`` maps each kept
        k-mer to its multiplicity, and ``reads2rare_kmers`` is a
        ``defaultdict(list)`` mapping a read id to the sorted start offsets of
        the kept k-mers within that read.
    """
    # One (read id, offset) entry per occurrence, so the number of entries
    # recorded for a k-mer is exactly its multiplicity.
    occurrences = defaultdict(list)
    for read_id, rec in lrd_report.records.items():
        seq = rec.string
        for start in range(len(seq) - k + 1):
            window = seq[start:start + k]
            if '=' in window:
                continue
            occurrences[window].append((read_id, start))

    rare_kmers = {}
    reads2rare_kmers = defaultdict(list)
    for window, hits in occurrences.items():
        mult = len(hits)
        if mult < min_mult or mult > max_mult:
            continue
        rare_kmers[window] = mult
        for read_id, start in hits:
            reads2rare_kmers[read_id].append(start)

    for starts in reads2rare_kmers.values():
        starts.sort()
    return rare_kmers, reads2rare_kmers


In [None]:
# Inclusive multiplicity window (across all reads) for a k-mer to count as rare.
rare_min_mult, rare_max_mult = 5, 30

rare_kmers, reads2rare_kmers = get_rare_mono_kmers(lrd_report, k=k, min_mult=rare_min_mult, max_mult=rare_max_mult)

In [None]:
set_rare_kmers = set(rare_kmers.keys())
# (reference-unique k-mers, rare k-mers that are reference-unique,
#  rare k-mers absent from the reference, all rare k-mers)
len(ref_unique_kmers), len(set_rare_kmers & ref_unique_kmers), len(set_rare_kmers - set_ref_kmers), len(set_rare_kmers)

(8802, 5961, 2862, 9785)

TODO: plot how all of the counts above depend on the parameters.

In [None]:
def get_dist_graph(lrd_report, reads2rare_kmers, k, verbose=True):
    """Count, for every ordered pair of rare k-mers in a read, their offset difference.

    For each read, every ordered pair of coordinates ``(c1, c2)`` (including
    ``c1 == c2`` and both orders of each pair) contributes one observation of
    the distance ``c2 - c1`` between the k-mers starting at those coordinates.

    Args:
        lrd_report: object whose ``records`` attribute maps a read id to a
            record exposing a ``string`` attribute.
        reads2rare_kmers: mapping from read id to the start offsets of the
            k-mers to pair up within that read.
        k: k-mer length.
        verbose: when true (the default), print each read id as it is processed.

    Returns:
        A nested ``defaultdict`` such that ``dist_graph[kmer1][kmer2][d]`` is
        the number of times ``kmer2`` was seen starting ``d`` characters after
        ``kmer1`` within a read (``d`` may be zero or negative).
    """
    dist_graph = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    for r_id, coords in reads2rare_kmers.items():
        if verbose:
            print(r_id)
        seq = lrd_report.records[r_id].string
        # Slice every k-mer once per read; the pair loop below is quadratic in
        # the number of coordinates and would otherwise re-slice both k-mers
        # for every pair.
        located_kmers = [(c, seq[c:c+k]) for c in coords]
        for c1, kmer1 in located_kmers:
            from_kmer1 = dist_graph[kmer1]
            for c2, kmer2 in located_kmers:
                from_kmer1[kmer2][c2-c1] += 1
    return dist_graph

In [None]:
# Pairwise offset histogram between rare k-mers that co-occur in a read.
# The call prints one read id per processed read.
dist_graph = get_dist_graph(lrd_report=lrd_report,
                            reads2rare_kmers=reads2rare_kmers,
                            k=k)

00739941-c14c-40bd-9fad-fabff753795b
35e23dc0-4219-4951-b491-be041f325bf3
42affd5a-e784-44ae-8899-d978b5a8ee10
6e54ff5e-7a55-4dad-bae3-004d255dddbd
7e9301cd-6184-44f7-9db4-849ac08d33a5
8e85dd90-8045-4df1-ba7b-b6b576deaa4d
9f11e0b2-3f1f-46b0-bcfb-72fd5eaff950
a7a6eb6b-2225-4396-bb6b-2c0809993922
ab79a298-1f1a-44da-a174-6a52ae8abfcd
ca0eeaaf-2aee-4e3e-9134-acca35d636ad
db403540-296f-4a52-b04b-489c9f8116ef
f93d3935-7068-41ed-84ad-7b53b5460df5
8f633f5b-eb70-4ab0-94d2-77c708561492
e1e9d70e-c20b-47fe-89b3-8777b85ccc43
df9e6b8f-e94b-4bac-a69e-9e98b1769c42
df436222-50d1-4975-861e-58ab06750067
c0e1dce1-3466-415a-bde7-a6e371a3db86
871dbdd1-62eb-4c4b-b2fa-000febf9ba16
31700b15-5b3d-4d8c-b047-7836cb183613
42da64e4-e647-4426-b497-cda32cd3ca28
5da9ab31-d608-4581-b5fd-37acce77300e
ea920754-ed98-43fc-aab4-10bc923ba16e
747e0d59-fbd1-4378-bc82-77287e4bc114
e271f539-e586-4ca0-b153-01c72400eaa3
1b4bd09e-07f3-4455-b041-c4599b6fbe3d
7a11e524-31c6-47b5-98bc-e2bb4551972f
71fc9912-72f2-4f3e-95a0-a0311ebd0387
1

In [None]:
def get_unique_mono_kmers(dist_graph, min_coverage, min_dist, threshold=0.8):
    """Select k-mers that sit at a consistent distance from some other k-mer.

    A pair ``(kmer1, kmer2)`` qualifies when all of the following hold:

    * its most frequent distance is at least ``min_dist``;
    * the pair was observed at least ``min_coverage`` times in total;
    * the most frequent distance accounts for at least ``threshold`` of all
      observations of the pair.

    Both k-mers of every qualifying pair are returned.

    Args:
        dist_graph: nested mapping ``kmer1 -> kmer2 -> distance -> count``.
        min_coverage: minimum total number of observations of a pair.
        min_dist: minimum value of the pair's most frequent distance.
        threshold: minimum fraction of observations at the most frequent
            distance.

    Returns:
        The set of k-mers that take part in at least one qualifying pair.
    """
    unique_kmers = set()
    for kmer1, neighbours in dist_graph.items():
        for kmer2, freqs in neighbours.items():
            # A pair may carry no observations (an empty histogram, or only
            # zero counts left behind by defaultdict lookups). There is no
            # dominant distance then, so skip it instead of comparing None
            # with min_dist.
            if not freqs:
                continue
            # max() returns the first maximal item, matching a manual scan
            # that only replaces the best on a strictly greater count.
            argmax_freq, max_freq = max(freqs.items(), key=lambda item: item[1])
            if max_freq <= 0:
                continue
            if argmax_freq < min_dist:
                continue
            sum_freq = sum(freqs.values())
            if sum_freq >= min_coverage and max_freq / sum_freq >= threshold:
                unique_kmers.add(kmer1)
                unique_kmers.add(kmer2)
    return unique_kmers

In [None]:
# min_dist=k: a pair only counts when its most frequent offset is at least one k-mer length.
unique_kmers = get_unique_mono_kmers(dist_graph, min_coverage=10, min_dist=k)

In [None]:
# Number of k-mers kept by the distance-consistency filter.
len(unique_kmers)

In [None]:
# Number of rare k-mers the filter started from, for comparison.
len(rare_kmers)

In [None]:
# (reference-unique k-mers, selected k-mers that are reference-unique,
#  selected k-mers absent from the reference, all selected k-mers)
len(ref_unique_kmers), len(unique_kmers & ref_unique_kmers), len(unique_kmers - set_ref_kmers), len(unique_kmers)

In [None]:
# Reference multiplicity of every rare k-mer; ref_kmers is a Counter, so
# k-mers absent from the reference get 0.
# NOTE(review): despite its name, ref_freq_unique_kmers holds the *rare* k-mers
# here; the same name is reused for unique_kmers in a later cell.
ref_freq_unique_kmers = {kmer: ref_kmers[kmer] for kmer in rare_kmers}
ref_freq_unique_kmers_cnt = Counter(ref_freq_unique_kmers.values())
# x: multiplicity in the reference; bar height: number of k-mers with that multiplicity.
plt.bar(ref_freq_unique_kmers_cnt.keys(), height = ref_freq_unique_kmers_cnt.values())

In [None]:
# The counts behind the bar chart of the preceding cell.
ref_freq_unique_kmers_cnt

In [None]:
# Reference multiplicity of every selected (unique) k-mer; ref_kmers is a
# Counter, so k-mers absent from the reference get 0.
ref_freq_unique_kmers = {kmer: ref_kmers[kmer] for kmer in unique_kmers}
ref_freq_unique_kmers_cnt = Counter(ref_freq_unique_kmers.values())
# x: multiplicity in the reference; bar height: number of k-mers with that multiplicity.
plt.bar(ref_freq_unique_kmers_cnt.keys(), height = ref_freq_unique_kmers_cnt.values())

In [None]:
# The counts behind the bar chart of the preceding cell.
ref_freq_unique_kmers_cnt

In [None]:
# Reference start positions of every selected k-mer that occurs more than once
# in the reference. Flattened in a single pass: concatenating the per-k-mer
# lists with sum(..., []) copies the accumulated list once per k-mer.
repetitive_pos = [pos
                  for kmer in unique_kmers if ref_kmers[kmer] > 1
                  for pos in ref_pos[kmer]]

In [None]:
# One unit-height bar at each reference position collected in repetitive_pos.
plt.bar(repetitive_pos, height=1)