In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from machina.generate_models import TMalignModel


def _process(args):
    """Build TM-align protein models for one batch of jobs.

    The FASTA index is opened inside this function; the caller in this cell
    submits it to a process pool, so each batch opens its own handle.

    Args:
        args: iterable of indexable jobs. Items 0, 1 and 3 of each job are
            forwarded to ``TMalignModel.generate_protein_model`` together
            with the alignment index; item 2 is not read.
    """
    aligns_list = SeqIO.index('data/train/scop40_structural_alignment.fasta', 'fasta')
    try:
        for a in args:
            TMalignModel().generate_protein_model(a[0], a[1], aligns_list, a[3])
    finally:
        # Release the indexed file handle even when a job raises, instead of
        # leaving it to garbage collection in a long-lived pool worker.
        aligns_list.close()


def main():
    """Generate TM-align protein models for every test query in parallel.

    For each query ID loaded from ``data/test/scop40.npy`` the sunid of the
    query's 'sf' ascendant is looked up, and one batch of jobs (one job per
    entry of ``hie`` under that sunid) is submitted to a process pool.

    Raises:
        Exception: the first exception raised by any submitted batch. It is
            re-raised only after every batch has finished running.
    """
    scop_root = Scop(dir_path=Path('data/scop'), version='1.75')
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted, locally generated file.
    with Path('data/train/scop40_hie.pkl').open('rb') as f:
        hie = pickle.load(f)
    test_data = np.load('data/test/scop40.npy')
    with ProcessPoolExecutor() as executor:
        futures = []
        for query in test_data:
            query_sf = scop_root.getDomainBySid(query).getAscendent('sf').sunid
            args = [(query, template, None, f'data/.tmalign_aln/{query}')
                    for template in hie[query_sf]]
            futures.append(executor.submit(_process, args))
        wait(futures)
        # wait() alone never surfaces worker errors; result() re-raises them
        # so a failed batch cannot go unnoticed.
        for future in futures:
            future.result()


if __name__ == '__main__':
    main()

In [None]:
from pathlib import Path
import pickle
from concurrent.futures import ProcessPoolExecutor, wait

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from machina.generate_models import SWModel


_ALIGNMENT_FASTA = 'data/train/scop40_structural_alignment.fasta'


def _process(args, aligns_list=None):
    """Build Smith-Waterman protein models for one batch of jobs.

    Args:
        args: iterable of indexable jobs. Items 0, 1 and 3 of each job are
            forwarded to ``SWModel.generate_protein_model`` together with the
            alignment index; item 2 is not read.
        aligns_list: optional, already opened ``SeqIO.index`` of the
            structural-alignment FASTA. When omitted, the index is opened
            here and closed again before returning.
    """
    owns_index = aligns_list is None
    if owns_index:
        aligns_list = SeqIO.index(_ALIGNMENT_FASTA, 'fasta')
    try:
        for a in args:
            # -11 / -1 are presumably the gap-open / gap-extend penalties
            # (the caller's output directory is named open11_extend1) - TODO confirm.
            SWModel(-11, -1).generate_protein_model(a[0], a[1], aligns_list, a[3])
    finally:
        # Only close a handle this call opened; a shared index belongs to the caller.
        if owns_index:
            aligns_list.close()


def main():
    """Generate Smith-Waterman protein models for every test query, sequentially.

    For each query ID loaded from ``data/test/scop40.npy`` the sunid of the
    query's 'sf' ascendant is looked up, and one job per entry of ``hie``
    under that sunid is processed.
    """
    scop_root = Scop(dir_path=Path('data/scop'), version='1.75')
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted, locally generated file.
    with Path('data/train/scop40_hie.pkl').open('rb') as f:
        hie = pickle.load(f)
    test_data = np.load('data/test/scop40.npy')
    # Index the FASTA once and share it, rather than re-scanning the whole
    # file for every query.
    aligns_list = SeqIO.index(_ALIGNMENT_FASTA, 'fasta')
    try:
        for query in tqdm(test_data):
            query_sf = scop_root.getDomainBySid(query).getAscendent('sf').sunid
            args = [(query, template, None, f'data/.sw_aln/open11_extend1/{query}')
                    for template in hie[query_sf]]
            _process(args, aligns_list)
    finally:
        aligns_list.close()


if __name__ == '__main__':
    main()

## {PSI,DELTA}-BLAST

In [2]:
from pathlib import Path
import sys

import numpy as np
from tqdm import tqdm_notebook as tqdm
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


# PSI-BLAST pairwise alignments: every test domain against the 'px'
# descendants of its 'sf' ascendant that are present in the SCOP40 FASTA.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    superfamily = scop_root.getDomainBySid(query).getAscendent('sf')
    # Collect all target sids first, then align, so lookups and BLAST runs
    # are not interleaved.
    domains = []
    for px in superfamily.getDescendents('px'):
        if px.sid in scop40:
            domains.append(px.sid)
    # NOTE(review): the query itself is not filtered out of `domains` here,
    # so a self-alignment is requested too - confirm this is intended.
    for domain in domains:
        aligner = BLASTModel('psiblast', '/data/DB/blastdb')
        aligner.generate_pairwise_alignment(
            query, domain, '../data/evaluation/psiblast', '../data/train/pssm')
# Author's recorded run time for this cell: 40 min

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys

import numpy as np
from tqdm import tqdm_notebook as tqdm
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


# DELTA-BLAST pairwise alignments: every test domain against the 'px'
# descendants of its 'sf' ascendant that are present in the SCOP40 FASTA.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    superfamily = scop_root.getDomainBySid(query).getAscendent('sf')
    # Collect all target sids first, then align, so lookups and BLAST runs
    # are not interleaved.
    domains = []
    for px in superfamily.getDescendents('px'):
        if px.sid in scop40:
            domains.append(px.sid)
    # NOTE(review): the query itself is not filtered out of `domains` here,
    # so a self-alignment is requested too - confirm this is intended.
    for domain in domains:
        aligner = BLASTModel('deltablast', '/data/DB/blastdb')
        aligner.generate_pairwise_alignment(
            query, domain, '../data/evaluation/deltablast', '../data/train/pssm')
# Author's recorded run time for this cell: 9 min

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


def _process(args):
    """Generate a PSI-BLAST-based protein model for every job in ``args``.

    Args:
        args: iterable of indexable jobs; items 0 through 4 of each job are
            forwarded positionally to ``BLASTModel.generate_protein_model``.
    """
    for job in args:
        # A fresh BLASTModel is constructed for each job.
        blast = BLASTModel('psiblast', '/data/DB/blastdb')
        positional = (job[0], job[1], job[2], job[3], job[4])
        blast.generate_protein_model(*positional)


# Build PSI-BLAST-based protein models: one thread-pool task per test domain,
# covering every other SCOP40 member under the same 'sf' ascendant.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job layout: (query, target sid, BLAST XML path, output dir, PDB-style dir).
        args = [(query, sid,
                 f'../data/evaluation/psiblast/{query}/{sid}.xml',
                 f'../data/evaluation/psiblast/{query}',
                 f'../data/train/pdbstyle-1.75/{sid[2:4]}') for sid in domains]
        futures.append(executor.submit(_process, args))
    # Iterate explicitly instead of building a throw-away list, and call
    # result() so an exception raised in a worker thread is re-raised here
    # rather than being silently dropped.
    for future in tqdm(as_completed(futures), total=len(futures)):  # 7 min
        future.result()



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






In [2]:
from pathlib import Path
import sys
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


def _process(args):
    """Generate a DELTA-BLAST-based protein model for every job in ``args``.

    Args:
        args: iterable of indexable jobs; items 0 through 4 of each job are
            forwarded positionally to ``BLASTModel.generate_protein_model``.
    """
    for job in args:
        # A fresh BLASTModel is constructed for each job.
        blast = BLASTModel('deltablast', '/data/DB/blastdb')
        positional = (job[0], job[1], job[2], job[3], job[4])
        blast.generate_protein_model(*positional)


# Build DELTA-BLAST-based protein models: one thread-pool task per test domain,
# covering every other SCOP40 member under the same 'sf' ascendant.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job layout: (query, target sid, BLAST XML path, output dir, PDB-style dir).
        args = [(query, sid,
                 f'../data/evaluation/deltablast/{query}/{sid}.xml',
                 f'../data/evaluation/deltablast/{query}',
                 f'../data/train/pdbstyle-1.75/{sid[2:4]}') for sid in domains]
        futures.append(executor.submit(_process, args))
    # Iterate explicitly instead of building a throw-away list, and call
    # result() so an exception raised in a worker thread is re-raised here
    # rather than being silently dropped.
    for future in tqdm(as_completed(futures), total=len(futures)):  # 7 min
        future.result()

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))






## HHsearch

In [1]:
from pathlib import Path
import sys

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import HHSearchModel


# HHsearch pairwise alignments: every test domain against the other SCOP40
# members found under the same 'sf' ascendant.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    superfamily = scop_root.getDomainBySid(query).getAscendent('sf')
    domains = []
    for px in superfamily.getDescendents('px'):
        if px.sid not in scop40:
            continue
        # The query is never aligned against itself.
        if px.sid != query:
            domains.append(px.sid)
    for domain in domains:
        searcher = HHSearchModel('/data/DB/hhsuitedb')
        searcher.generate_pairwise_alignment(
            query, domain, scop40[query], f'../data/evaluation/hhsearch/{query}')
# Author's recorded run time for this cell: 2 min or 12 hours

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import HHSearchModel


def _process(args):
    """Generate an HHsearch-based protein model for every job in ``args``.

    Args:
        args: iterable of indexable jobs; items 0 through 3 of each job are
            forwarded positionally to ``HHSearchModel.generate_protein_model``.
    """
    for job in args:
        # A fresh HHSearchModel is constructed for each job.
        searcher = HHSearchModel('/data/DB/hhsuitedb')
        positional = (job[0], job[1], job[2], job[3])
        searcher.generate_protein_model(*positional)


# Build HHsearch-based protein models, one query at a time, for every other
# SCOP40 member found under the query's 'sf' ascendant.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    superfamily = scop_root.getDomainBySid(query).getAscendent('sf')
    domains = []
    for px in superfamily.getDescendents('px'):
        if px.sid not in scop40:
            continue
        # The query is never modelled against itself.
        if px.sid != query:
            domains.append(px.sid)
    # Job layout: (query, target sid, output dir, PDB-style dir keyed by sid[2:4]).
    args = []
    for sid in domains:
        args.append((query, sid,
                     f'../data/evaluation/hhsearch/{query}',
                     f'../data/train/pdbstyle-1.75/{sid[2:4]}'))
    _process(args)
# Author's recorded run time for this cell: 9 min

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


