In [86]:
# Commands used to generate the datasets converted here (the memory request can probably be lowered)
#  bsub -R "rusage[mem=250000]" -W 24:00 
#./a.out --gene-len 100 --gene-len2 2000 -G 100 -P 10000 -n 3 -r 2 --geometric-p .5 -m .1 -r 4 -f data1
#./a.out --gene-len 100 --gene-len2 2000 -G 3000 -P 55000 -n 5 -r 3 --geometric-p .34 -m .1 -r 4 -f data16
#./a.out --gene-len 100 --gene-len2 2000 -G 6000 -P 55000 -n 5 -r 3 --geometric-p .34 -m .1 -r 4 -f data17
#./a.out --gene-len 100 --gene-len2 2000 -G 12000 -P 55000 -n 5 -r 3 --geometric-p .34 -m .1 -r 4 -f data18

import numpy as np
import glob
from attrdict import AttrDict
import utility

# Name of the dataset to convert; it is also the stem of the .npz written at the end.
file_name = 'data1'
# Despite its name, `proj_dir` holds the fasta data directory under the project root.
proj_dir = ''.join((utility.proj_dir(), '/data/fasta/'))
# Directory of this particular dataset.
datadir = ''.join((proj_dir, file_name))

# read maf file
def _fasta_order(path):
    """Sort key for fasta paths: the number embedded in the file name, then the name."""
    name = path.replace('\\', '/').rsplit('/', 1)[-1]
    digits = ''.join(ch for ch in name if ch.isdigit())
    return (int(digits) if digits else -1, name)

# glob returns paths in arbitrary, filesystem-dependent order, but position i in
# `fasta_files` is later paired with `genes[i]`. Sort numerically so the pairing is
# deterministic.
# NOTE(review): this assumes the number in seqs<N>.fa is the sequence index used
# in MSA.maf -- TODO confirm against the generator.
fasta_files = sorted(glob.glob(datadir+'/seqs*.fa'), key=_fasta_order)
N = len(fasta_files)
# genes[k] collects one [g, s, l] triple per MAF record of sequence k.
genes = [[] for i in range(N)]
with open(datadir + '/MSA.maf','r') as maf_file:
    for line in maf_file:
        # A line containing '===' ends the section parsed here; reaching the end of
        # the file without one also ends parsing instead of raising IndexError.
        if '===' in line:
            break
        # Skip blank lines and lines starting with 's' (non-numeric lines).
        if not line.strip() or line[0]=='s':
            continue
        # Four integers per record: sequence index, then the three values stored
        # as [g, s, l]; they are later used as val[s:s+l] = g.
        seq, g, s, l = [int(i) for i in line.split()]
        genes[seq].append([g, s, l])
    
# read fasta files and fill the values
# seqs[i] is the sequence string of fasta_files[i]; vals[i] is a uint32 array of the
# same length holding, per position, the first field of the MAF record covering it
# (0 where no record applies).
seqs = []
vals = []

for i in range(N):
    # `with` closes each file; the original leaked one handle per sequence.
    with open(fasta_files[i]) as f:
        f.readline()  # discard the first line (presumably the '>' header -- TODO confirm)
        # Only the second line is read, i.e. the sequence is assumed to be on one line.
        # rstrip('\n') instead of [:-1] so the last base is kept when the file has
        # no trailing newline.
        seq = f.readline().rstrip('\n')
    L = len(seq)
    val = np.zeros(L, dtype=np.uint32)
    for gene in genes[i]:
        gc, s, l = gene
        # Mark the l positions starting at s with the record's first field.
        val[s:s+l] = gc
    seqs.append(seq)
    vals.append(val)
    
    
    
# read options file
# Reset first so that a missing entry is reported below instead of raising a bare
# NameError or silently reusing values left in the kernel by an earlier run.
num_seqs = gene_lens = Dict = None

# `with` closes the file; the original never closed it.
with open(datadir+'/options.txt') as opts_files:
    lines = opts_files.readlines()

for line in lines:
    if 'num_seqs' in line:
        # First token is the label, the remaining tokens are integers.
        num_seqs = [int(i) for i in line.split()[1:]]
    elif 'gene_lens' in line:
        gene_lens = [int(i) for i in line.split()[1:]]
    elif 'seq_options' in line:
        # Strip braces and commas, then drop the first two tokens (presumably the
        # 'seq_options' label and a separator -- TODO confirm); what remains is
        # read as alternating key/value tokens.
        opts = line.replace(',', ' ').replace('}','').replace('{','').split()[2:]
        Dict = dict()
        # Pair tokens (key, value); a trailing unpaired token is ignored, as before.
        for key, val in zip(opts[0::2], opts[1::2]):
            if key in ['mutation_rate', 'geometric_p', 'reverse_p']:
                Dict[key] = float(val)
            elif key == 'file_name':
                # Stored under a different key than the one in the options file.
                Dict['save_directory'] = val
            else:
                Dict[key] = int(val)

missing = [name for name, found in (('num_seqs', num_seqs),
                                    ('gene_lens', gene_lens),
                                    ('seq_options', Dict)) if found is None]
if missing:
    raise ValueError('options.txt is missing: ' + ', '.join(missing))

Dict['num_seqs'] = num_seqs
Dict['gene_lens'] = gene_lens
options = AttrDict(Dict)

        
# save the values
# `vals` holds one array per sequence and the sequences may differ in length.
# NumPy >= 1.24 refuses to build an array from such a ragged list implicitly, so in
# that case pack the arrays into an explicit 1-D object array. When all lengths
# match, `vals` is passed through unchanged and the saved file is the same as before.
if len({len(v) for v in vals}) > 1:
    vals_to_save = np.empty(len(vals), dtype=object)
    for idx, arr in enumerate(vals):
        vals_to_save[idx] = arr
else:
    vals_to_save = vals
# `options` is a plain dict and is stored as a pickled object, so the file has to be
# read back with np.load(..., allow_pickle=True).
np.savez(utility.proj_dir() + '/data/'+file_name+'.npz', seqs = seqs, vals = vals_to_save, num_seqs = num_seqs, gene_lens = gene_lens, options = Dict )

In [83]:
# Dump the AttrDict's instance dictionary; as the recorded output shows, this
# includes its internal bookkeeping fields as well as the parsed option values.
print(vars(options))

{'_default_factory': None, '_pass_key': False, '_recursive': True, '_mapping': {'mutation_rate': 0.1, 'gene_len': 100, 'num_genes': 5, 'padding': 200, 'num_seq': 5, 'repeat': 1, 'reverse_p': 0.0, 'gene_len2': 100, 'num_seq2': 5, 'geometric_p': 0.4, 'colored': 0, 'print_values': 0, 'print_seqs': 0, 'save_directory': 'tmp', 'num_seqs': [5], 'gene_lens': [100, 100, 100, 100, 100]}, 'mutation_rate': 0.1, 'gene_len': 100, 'num_genes': 5, 'padding': 200, 'num_seq': 5, 'repeat': 1, 'reverse_p': 0.0, 'gene_len2': 100, 'num_seq2': 5, 'geometric_p': 0.4, 'colored': 0, 'print_values': 0, 'print_seqs': 0, 'save_directory': 'tmp', 'num_seqs': [5], 'gene_lens': [100, 100, 100, 100, 100]}


{'mutation_rate': 0.1, 'gene_len': 100, 'num_genes': 6000, 'padding': 15000, 'num_seq': 5, 'repeat': 1, 'geometric_p': 0.33, 'reverse_p': 0, 'gene_len2': 1000, 'num_seq2': 5, 'colored': False, 'print_values': False, 'print_seqs': False, 'save_directory': '/cluster/work/grlab/share/databases/genomes/synthetic/data/seqs12'}


{'colored': False,
 'gene_len': 100,
 'gene_len2': 1000,
 'geometric_p': 0.33,
 'mutation_rate': 0.1,
 'num_genes': 6000,
 'num_seq': 5,
 'num_seq2': 5,
 'padding': 15000,
 'print_seqs': False,
 'print_values': False,
 'repeat': 1,
 'reverse_p': 0,
 'save_directory': '/cluster/work/grlab/share/databases/genomes/synthetic/data/seqs12'}

{'_default_factory': None,
 '_mapping': {'colored': 0,
  'gene_len': 100,
  'gene_len2': 100,
  'gene_lens': [100, 100, 100, 100, 100],
  'geometric_p': 0.4,
  'mutation_rate': 0.1,
  'num_genes': 5,
  'num_seq': 5,
  'num_seq2': 5,
  'num_seqs': [5],
  'padding': 200,
  'print_seqs': 0,
  'print_values': 0,
  'repeat': 1,
  'reverse_p': 0.0,
  'save_directory': 'tmp'},
 '_pass_key': False,
 '_recursive': True,
 'colored': 0,
 'gene_len': 100,
 'gene_len2': 100,
 'gene_lens': [100, 100, 100, 100, 100],
 'geometric_p': 0.4,
 'mutation_rate': 0.1,
 'num_genes': 5,
 'num_seq': 5,
 'num_seq2': 5,
 'num_seqs': [5],
 'padding': 200,
 'print_seqs': 0,
 'print_values': 0,
 'repeat': 1,
 'reverse_p': 0.0,
 'save_directory': 'tmp'}