## Preprocessing

In [21]:
import pandas as pd
import json
import rouskinhf

def read_bpseq(filename):
    """Parse a BPSEQ file into its sequence and its list of base pairs.

    Only lines whose first character is a digit are treated as data; comment
    lines (starting with '#') and any other header or blank lines are skipped.
    Each data line must split into exactly three whitespace-separated fields:
    position, base, partner position.

    Args:
        filename: Path of the BPSEQ file to read.

    Returns:
        A ``(sequence, pairs)`` tuple. ``sequence`` is the bases concatenated in
        file order. ``pairs`` holds one ``[position, partner]`` list (0-based)
        per data line whose partner field is not ``'0'``; a file that lists a
        pairing on both partners' lines therefore yields both ``[i, j]`` and
        ``[j, i]``.

    Raises:
        ValueError: If a data line does not have exactly three fields, or a
            position cannot be converted to an integer.
    """
    nucleotides = []
    paired = []
    with open(filename) as handle:
        for record in handle:
            # Comments and non-numeric header lines carry no base information.
            if record.startswith('#') or not record[0].isdigit():
                continue
            position, base, partner = record.split()
            nucleotides.append(base)
            if partner == '0':
                # A partner of 0 marks an unpaired base.
                continue
            # File positions are 1-based; shift both ends to 0-based indexing.
            paired.append([int(position) - 1, int(partner) - 1])
    return ''.join(nucleotides), paired

# Sanity check: parse one file and display the (sequence, pairs) tuple it yields.
read_bpseq('/Users/yvesmartin/data/CRW/16S/a/A.fulgidus/d.16.a.A.fulgidus.bpseq.txt')

('AUUCUGGUUGAUCCUGCCAGAGGCCGCUGCUAUCCGGCUGGGACUAAGCCAUGCGAGUCAAGGGGCUUGUAUCCCUUCGGGGAUGCAAGCACCGGCGGACGGCUCAGUAACACGUGGACAACCUGCCCUCGGGUGGGGGAUAACCCCGGGAAACUGGGGCUAAUCCCCCAUAGGGGAUGGGUACUGGAAUGUCCCAUCUCCGAAAGCGCUUAGCGCCCGAGGAUGGGUCUGCGGCGGAUUAGGUUGUUGGUGGGGUAACGGCCCACCAAGCCGAAGAUCCGUACGGGCCAUGAGAGUGGGAGCCCGGAGAUGGACCCUGAGACACGGGUCCAGGCCCUACGGGGCGCAGCAGGCGCGAAACCUCCGCAAUGCGGGAAACCGCGACGGGGUCAGCCGGAGUGCUCGCGCAUCGCGCGGGCUGUCGGGGUGCCUAAAAAGCACCCCACAGCAAGGGCCGGGCAAGGCCGGUGGCAGCCGCCGCGGUAAUACCGGCGGCCCGAGUGGCGGCCACUUUUAUUGGGCCUAAAGCGUCCGUAGCCGGGCUGGUAAGUCCUCCGGGAAAUCUGGCGGCUUAACCGUCAGACUGCCGGAGGAUACUGCCAGCCUAGGGACCGGGAGAGGCCGGGGGUAUUCCCGGAGUAGGGGUGAAAUCCUGUAAUCCCGGGAGGACCACCUGUGGCGAAGGCGCCCGGCUGGAACGGGUCCGACGGUGAGGGACGAAGGCCAGGGGAGCGAACCGGAUUAGAUACCCGGGUAGUCCUGGCUGUAAACGAUGCGGACUAGGUGUCACCGAAGCUACGAGCUUCGGUGGUGCCGGAGGGAAGCCGUUAAGUCCGCCGCCUGGGGAGUACGGCCGCAAGGCUGAAACUUAAAGGAAUUGGCGGGGGAGCACUACAACGGGUGGAGCCUGCGGUUUAAUUGGAUUCAACGCCGGGAAGCUUACCGGGGGAGACAGCGGGAUGAAGGUCGGGCUGAAGACCUUACCAGACUAGCUGAGA

In [25]:
import os

source_root = '/Users/yvesmartin/data/CRW'
# Longest suffix first, so the matched suffix is always the full extension.
BPSEQ_SUFFIXES = ('.bpseq.txt', '.bpseq')

# Walk the source tree and parse every BPSEQ file into a
# [sequence, structure, family] record keyed by the file name without its
# extension.
# NOTE: records are keyed by file name only, so two files with the same name in
# different directories overwrite each other.
records = {}
for root, dirs, files in os.walk(source_root):
    for name in files:
        suffix = next((s for s in BPSEQ_SUFFIXES if name.endswith(s)), None)
        if suffix is None:
            continue
        # Strip whichever suffix matched; previously only '.bpseq.txt' was
        # removed, so '.bpseq' files kept their extension in the index.
        reference = name.removesuffix(suffix)
        # The family is the third-from-last component of the file's directory,
        # e.g. '16S' for '.../CRW/16S/a/A.fulgidus'.
        family = root.split('/')[-3]
        records[reference] = list(read_bpseq(os.path.join(root, name))) + [family]
data = pd.DataFrame.from_dict(records, orient='index', columns=['sequence', 'structure', 'family'])

In [28]:
# Keep only sequences longer than 10 bases, reporting how many are discarded.
lengths = data['sequence'].str.len()
n_short = len(data[lengths <= 10])
print("We drop {}/{} sequences that are too short".format(n_short, len(data)))
data = data[lengths > 10]

We drop 0/1978 sequences that are too short


In [30]:
# Export the filtered table to 'crw.json' as a {name: {column: value}} mapping.
# NOTE(review): the exact on-disk layout is whatever rouskinhf.dump_json writes — confirm against the library.
rouskinhf.dump_json(data.to_dict(orient='index'), 'crw.json')

In [32]:
def json_to_fasta(data, path):
    """Write sequences to a FASTA file.

    Each record is written as a ``>{family}___{name}`` header line followed by
    the sequence on the next line. Records are separated by a newline and no
    trailing newline is written.

    Args:
        data: Either a DataFrame indexed by reference name with 'family' and
            'sequence' columns, or the equivalent mapping
            ``{name: {'family': ..., 'sequence': ...}}`` as returned by
            ``DataFrame.to_dict(orient='index')``.
        path: Destination file; it is overwritten if it already exists.
    """
    # Accept an already-converted dict as well as a DataFrame: calling
    # .to_dict unconditionally raised AttributeError when handed a dict.
    if isinstance(data, pd.DataFrame):
        records = data.to_dict(orient='index')
    else:
        records = data
    with open(path, 'w') as f:
        f.write('\n'.join(
            '>{}___{}\n{}'.format(entry['family'], name, entry['sequence'])
            for name, entry in records.items()
        ))

# Pass the DataFrame itself: json_to_fasta converts it to a dict internally, so
# converting it here first handed the function a dict with no .to_dict method.
json_to_fasta(data, 'crw.fasta')