### Imports

In [None]:
%load_ext autoreload
%autoreload 2
 
import sys, os

# Make the directory two levels above this file importable.
try:
    __file__
except NameError:
    # `__file__` is not defined when this code runs as a notebook cell, so
    # fall back to a path inside the current working directory. Only
    # NameError is handled so that unrelated failures are not hidden.
    __file__ = os.path.join(os.getcwd(),'make_aggregate_files.ipynb')
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../..')))
    
import numpy as np
import dreem 
import dreem.util as util
import pandas as pd
import os

### Create test files for ```test_set_1.py```


In [None]:
%reload_ext autoreload

# --- Parameters of the synthetic sample ---------------------------------
sample_name = 'test_set_1'
number_of_constructs = 2
length = 100
number_of_reads = [10] * number_of_constructs

# Per construct: the first four reads are mutated at position 25 only,
# every remaining read at positions 50 and 75.
mutations = [[[25]] * 4 + [[50, 75]] * (n - 4) for n in number_of_reads]
# No read carries an insertion or a deletion.
insertions = [[[]] * n for n in number_of_reads]
deletions = [[[]] * n for n in number_of_reads]

# `util.create_sequence` is called once per construct; the resulting
# sequence is repeated for each of that construct's reads.
reads = [[util.create_sequence(length)] * n for n in number_of_reads]

constructs = [f'construct_{i}' for i in range(number_of_constructs)]
barcode_start = 10
barcodes = util.generate_barcodes(8, number_of_constructs, 3)

# Every construct uses the same five sections, named "<start>_<end>".
sections_start = [[0, 0, 25, 50, 75]] * number_of_constructs
sections_end = [[100, 25, 50, 75, 99]] * number_of_constructs
sections = [
    [f'{ss}_{se}' for ss, se in zip(starts, ends)]
    for starts, ends in zip(sections_start, sections_end)
]

# --- Build the profile and write the input/output files ------------------
sample_profile = util.make_sample_profile(
    constructs,
    reads,
    number_of_reads,
    mutations,
    insertions,
    deletions,
    sections=sections,
    section_start=sections_start,
    section_end=sections_end,
    barcodes=barcodes,
    barcode_start=barcode_start,
)
test_files_dir = os.path.abspath(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), '../..', 'test', 'test_files')
)

inputs = ['bitvector', 'samples_csv', 'library']
outputs = ['output']
util.generate_files(sample_profile, 'aggregate', inputs, outputs, test_files_dir, sample_name)


In [None]:
import json

def count_bases(positions, l):
    """Count how often each index in ``[0, l)`` occurs in ``positions``.

    Args:
        positions: iterable of integer indices.
        l: length of the returned list.

    Returns:
        A list of ``l`` ints; element ``i`` is the number of times ``i``
        appears in ``positions``. Indices outside ``[0, l)`` are ignored.
    """
    out = np.zeros(l, dtype=int)
    for p in positions:
        # The lower bound matters: a negative index would otherwise wrap
        # around and be counted at the end of the array.
        if 0 <= p < l:
            out[p] += 1
    return out.tolist()


def count_mut_indel(positions, l, ss, se):
    """Count, per read, the events falling on each base of a section.

    Args:
        positions: one iterable of integer positions per read, expressed
            in the coordinates of the full sequence.
        l: unused; kept so existing callers keep working.
        ss: first position of the section (inclusive).
        se: end position of the section (exclusive).

    Returns:
        One list per read, each of length ``se - ss``. Element ``i`` is the
        number of that read's positions equal to ``ss + i``.
    """
    width = se - ss
    # Positions are shifted by `ss` so that they index into the
    # section-sized array; without the shift every section that does not
    # start at 0 would lose its counts.
    return [
        count_bases([a - ss for a in read_positions if ss <= a < se], width)
        for read_positions in positions
    ]

def count_mut_mod(ref, reads, base):
    """Count, per reference position, the reads that changed ``base``.

    Args:
        ref: reference sequence (indexable sequence of bases).
        reads: iterable of read sequences aligned to ``ref``.
        base: the reference base of interest, e.g. ``'A'``.

    Returns:
        A list of ``len(ref)`` ints. Element ``i`` is the number of reads
        whose base at ``i`` differs from ``base``, counted only where
        ``ref[i] == base``; every other position stays 0. Positions beyond
        the end of a read shorter than ``ref`` are not counted for that
        read.
    """
    out = np.zeros(len(ref), dtype=int)
    for read in reads:
        for i, (ref_base, read_base) in enumerate(zip(ref, read)):
            if ref_base == base and read_base != base:
                out[i] += 1
    return out.tolist()


def generate_output_files(file, sample_profile, library, samples, clusters = None):
    """Write the predicted aggregate output of a sample profile as JSON.

    Args:
        file: path of the JSON file to write.
        sample_profile: mapping from construct name to a dict providing the
            keys 'number_of_reads', 'barcodes', 'barcode_start',
            'reference', 'reads', 'sections', 'section_start',
            'section_end', 'mutations', 'deletions' and 'insertions'.
        library: path to a CSV file with a 'some_random_attribute' column;
            the value of its first row is copied to every construct.
        samples: path to a CSV file; its first row seeds the output dict.
        clusters: must be None; any other value is not supported.

    Raises:
        NotImplementedError: if ``clusters`` is not None.
    """
    if clusters is not None:
        raise NotImplementedError

    library = pd.read_csv(library)
    samples = pd.read_csv(samples)

    # The first samples row supplies the top-level attributes of the output.
    out = samples.to_dict('records')[0]
    out['construct'] = {}

    for construct, v in sample_profile.items():
        entry = {
            'num_reads': v['number_of_reads'],
            # Every generated read is reported as aligned.
            'num_aligned': v['number_of_reads'],
            'barcode': v['barcodes'],
            'barcode_start': v['barcode_start'],
            'some_random_attribute': library['some_random_attribute'].values[0],
            'sequence': v['reference'],
        }
        out['construct'][construct] = entry
        ref_length = len(v['reference'])

        for s, ss, se in zip(v['sections'], v['section_start'], v['section_end']):
            section = {}
            entry[s] = section

            section['section_start'] = ss
            section['section_end'] = se
            # One count per read: mutations located inside [ss, se).
            section['num_of_mutations'] = [
                len([a for a in read_mutations if ss <= a < se])
                for read_mutations in v['mutations']
            ]
            section['mut_bases'] = count_mut_indel(v['mutations'], ref_length, ss, se)
            section['del_bases'] = count_mut_indel(v['deletions'], ref_length, ss, se)
            section['ins_bases'] = count_mut_indel(v['insertions'], ref_length, ss, se)
            # Each base of the section is covered by all reads.
            section['cov_bases'] = [v['number_of_reads']] * (se - ss)
            rates = np.array(section['mut_bases']) / np.array(section['cov_bases'])
            section['mut_rates'] = rates.tolist()

            section_ref = v['reference'][ss:se]
            section_reads = [seq[ss:se] for seq in v['reads']]
            for base in 'ACGT':
                section[f'mod_bases_{base}'] = count_mut_mod(section_ref, section_reads, base)

    with open(file, 'w') as f:
        json.dump(out, f, indent=4)

# Write the predicted aggregate output for this sample, reading the
# library and samples CSV files from the sample's input directory.
generate_output_files(
    os.path.join(test_files_dir, 'predicted_output', 'aggregate', f'{sample_name}.json'),
    sample_profile,
    os.path.join(test_files_dir, 'input', 'aggregate', sample_name, 'library.csv'),
    os.path.join(test_files_dir, 'input', 'aggregate', sample_name, 'samples.csv'),
)


