In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from machina.generate_models import TMalignModel


def _process(args):
    """Generate TM-align based models for a batch of jobs.

    Args:
        args: Iterable of indexable 4-item jobs. Items 0 and 1 are forwarded
            as the first two arguments of
            ``TMalignModel.generate_protein_model`` and item 3 as the fourth.
            Item 2 is ignored: the FASTA index opened here is passed in its
            place.
    """
    aligns_list = SeqIO.index('data/train/scop40_structural_alignment.fasta', 'fasta')
    try:
        for a in args:
            # A fresh model object is built per job, as in the original code.
            TMalignModel().generate_protein_model(a[0], a[1], aligns_list, a[3])
    finally:
        # SeqIO.index keeps the FASTA file open for random access; release the
        # handle even when one of the jobs raises.
        aligns_list.close()


def main():
    """Generate TM-align models for every test query using a process pool.

    For each query id in ``data/test/scop40.npy`` the SCOP 'sf' ascendant of
    the query domain is looked up, and one pool task is submitted that pairs
    the query with every entry stored under that node's sunid in the pickled
    ``hie`` mapping. Results are written below ``data/.tmalign_aln/<query>``
    by the worker function.

    Raises:
        RuntimeError: If at least one worker task raised. The error is raised
            only after all tasks have finished and is chained from the first
            failure found.
    """
    scop_root = Scop(dir_path=Path('data/scop'), version='1.75')
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with a trusted, locally generated scop40_hie.pkl.
    with Path('data/train/scop40_hie.pkl').open('rb') as f:
        hie = pickle.load(f)
    test_data = np.load('data/test/scop40.npy')
    jobs = []  # (query, future) pairs, kept so failures can be attributed
    with ProcessPoolExecutor() as executor:
        for query in test_data:
            query_sf = scop_root.getDomainBySid(query).getAscendent('sf').sunid
            args = [(query, member, None, f'data/.tmalign_aln/{query}') for member in hie[query_sf]]
            jobs.append((query, executor.submit(_process, args)))
        wait([future for _, future in jobs])
    # wait() only blocks; it never re-raises. Inspect every future so that a
    # crashed worker is reported instead of being silently dropped.
    failures = [(query, future.exception()) for query, future in jobs
                if future.exception() is not None]
    if failures:
        first_query, first_error = failures[0]
        raise RuntimeError(
            f'{len(failures)} of {len(jobs)} TM-align tasks failed; '
            f'first failing query: {first_query}'
        ) from first_error


# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell starts the TM-align run immediately.
if __name__ == '__main__':
    main()

In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from machina.generate_models import SWModel


def _process(args):
    """Generate Smith-Waterman based models for a batch of jobs.

    Args:
        args: Iterable of indexable 4-item jobs. Items 0 and 1 are forwarded
            as the first two arguments of
            ``SWModel.generate_protein_model`` and item 3 as the fourth.
            Item 2 is ignored: the FASTA index opened here is passed in its
            place.
    """
    aligns_list = SeqIO.index('data/train/scop40_structural_alignment.fasta', 'fasta')
    try:
        for a in args:
            # NOTE(review): -11 / -1 are presumably the gap-open / gap-extend
            # penalties (the caller writes to an 'open11_extend1' directory);
            # confirm against SWModel.
            SWModel(-11, -1).generate_protein_model(a[0], a[1], aligns_list, a[3])
    finally:
        # SeqIO.index keeps the FASTA file open for random access; release the
        # handle even when one of the jobs raises.
        aligns_list.close()


def main():
    """Generate Smith-Waterman models for every test query, sequentially.

    Each query id from ``data/test/scop40.npy`` is paired with every entry
    stored in the pickled ``hie`` mapping under the sunid of the query's SCOP
    'sf' ascendant, and the resulting batch is handed to ``_process``.
    Output goes below ``data/.sw_aln/open11_extend1/<query>``.
    """
    scop = Scop(dir_path=Path('data/scop'), version='1.75')
    with Path('data/train/scop40_hie.pkl').open('rb') as handle:
        members_by_sf = pickle.load(handle)
    queries = np.load('data/test/scop40.npy')
    for query in tqdm(queries):
        sf_id = scop.getDomainBySid(query).getAscendent('sf').sunid
        # NOTE(review): _process re-indexes the alignment FASTA on every call,
        # i.e. once per query in this loop.
        jobs = [
            (query, member, None, f'data/.sw_aln/open11_extend1/{query}')
            for member in members_by_sf[sf_id]
        ]
        _process(jobs)


# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell starts the Smith-Waterman run.
if __name__ == '__main__':
    main()

In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from machina.generate_models import BLASTModel


def _process(args):
    """Run DELTA-BLAST model generation for each job in ``args``.

    Args:
        args: Iterable of indexable jobs; items 0-3 of each job are passed,
            in order, to ``BLASTModel('deltablast').generate_protein_model``.
    """
    for job in args:
        # A new BLASTModel is constructed for every job.
        model = BLASTModel('deltablast')
        model.generate_protein_model(job[0], job[1], job[2], job[3])


def main():
    """Generate DELTA-BLAST models for every test query, sequentially.

    Each query id from ``data/test/scop40.npy`` is paired with every entry
    stored in the pickled ``hie`` mapping under the sunid of the query's SCOP
    'sf' ascendant, and the resulting batch is handed to ``_process``.
    Output goes below ``data/.deltablast_aln/<query>``.
    """
    scop = Scop(dir_path=Path('data/scop'), version='1.75')
    with Path('data/train/scop40_hie.pkl').open('rb') as handle:
        members_by_sf = pickle.load(handle)
    queries = np.load('data/test/scop40.npy')
    for query in tqdm(queries):
        sf_id = scop.getDomainBySid(query).getAscendent('sf').sunid
        jobs = [
            (query, member, None, f'data/.deltablast_aln/{query}')
            for member in members_by_sf[sf_id]
        ]
        _process(jobs)


# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell starts the DELTA-BLAST run.
if __name__ == '__main__':
    main()

In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from machina.generate_models import BLASTModel


def _process(args):
    """Run DELTA-BLAST model generation for each job in ``args``.

    NOTE(review): this cell is byte-identical to the preceding DELTA-BLAST
    cell; confirm whether a different search program or output directory was
    intended, or whether the cell can be dropped.

    Args:
        args: Iterable of indexable jobs; items 0-3 of each job are passed,
            in order, to ``BLASTModel('deltablast').generate_protein_model``.
    """
    for entry in args:
        # A new BLASTModel is constructed for every job.
        blast_model = BLASTModel('deltablast')
        blast_model.generate_protein_model(entry[0], entry[1], entry[2], entry[3])


def main():
    """Build and run one DELTA-BLAST job batch per test query.

    Loads the SCOP 1.75 hierarchy, the pickled ``hie`` mapping and the test
    query ids, then for each query looks up the sunid of its 'sf' ascendant
    and passes one ``(query, member, None, output_dir)`` tuple per entry of
    ``hie[sunid]`` to ``_process``.
    """
    scop_tree = Scop(dir_path=Path('data/scop'), version='1.75')
    with Path('data/train/scop40_hie.pkl').open('rb') as pkl_file:
        hierarchy = pickle.load(pkl_file)
    query_ids = np.load('data/test/scop40.npy')
    for query in tqdm(query_ids):
        domain = scop_tree.getDomainBySid(query)
        sunid = domain.getAscendent('sf').sunid
        batch = []
        for member in hierarchy[sunid]:
            batch.append((query, member, None, f'data/.deltablast_aln/{query}'))
        _process(batch)


# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell runs main() right away.
if __name__ == '__main__':
    main()


In [3]:
from pathlib import Path
import pickle

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop
from machina.generate_models import HHSearchModel


def _process(args):
    """Run HHsearch model generation for each job in ``args``.

    Args:
        args: Iterable of indexable jobs; items 0-3 of each job are passed,
            in order, to ``HHSearchModel().generate_protein_model``.
    """
    for job in args:
        # A new HHSearchModel is constructed for every job.
        searcher = HHSearchModel()
        searcher.generate_protein_model(job[0], job[1], job[2], job[3])


def main():
    """Generate HHsearch models for every test query, sequentially.

    The superfamily -> domain mapping is rebuilt from the ``NAME`` lines of
    the HH-suite database file: the second whitespace-separated token is
    taken as the domain sid and the first three dot-separated components of
    the third token as the grouping key. Queries whose SCOP 'sf' sccs is not
    present in that mapping are printed as ``<query> (<sccs>)`` and skipped.
    Output goes below ``data/.hhsearch/<query>``.
    """
    members_by_sf = {}
    db_lines = Path('/data/DB/hhsuitedb/scop40_hhm.ffdata').read_text().splitlines()
    for line in db_lines:
        if not line.startswith('NAME '):
            continue
        fields = line.split()
        domain_sid = fields[1]
        # Keep only the first three sccs components, e.g. 'a.60.12' as in the
        # skipped-query output.
        sf_sccs = '.'.join(fields[2].split('.')[:3])
        members_by_sf.setdefault(sf_sccs, []).append(domain_sid)

    scop = Scop(dir_path=Path('data/scop'), version='1.75')
    queries = np.load('data/test/scop40.npy')
    for query in tqdm(queries):
        query_sf = scop.getDomainBySid(query).getAscendent('sf').sccs
        members = members_by_sf.get(query_sf)
        if members is None:
            # No database entry shares this key: report and move on.
            print(f'{query} ({query_sf})')
            continue
        jobs = [(query, member, None, f'data/.hhsearch/{query}') for member in members]
        _process(jobs)


# Entry point: in a notebook kernel (or when run as a script) __name__ is
# '__main__', so executing this cell starts the HHsearch run.
if __name__ == '__main__':
    main()

HBox(children=(IntProgress(value=0, max=2844), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14), HTML(value='')))

d2axtu1 (a.60.12)
d2axtd1 (f.26.1)

