In [1]:
from pathlib import Path
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import TMalignModel


def _process(args):
    """Build TM-align models for one batch of jobs.

    Each item of ``args`` is indexed positionally: items 0, 1, 3 and 4 are
    forwarded to ``TMalignModel.generate_protein_model``. Item 2 is ignored;
    the structural-alignment index opened here is passed in its place.

    Handling is best-effort: an exception raised for one job is printed
    together with the first two items of that job, and the remaining jobs are
    still processed.
    """
    alignments = SeqIO.index('../data/train/scop40_structural_alignment.fasta', 'fasta')
    try:
        for a in args:
            try:
                TMalignModel().generate_protein_model(a[0], a[1], alignments, a[3], a[4])
            except Exception as e:
                # Batches run in parallel workers, so a bare message cannot be
                # traced back to its job; name the failing pair explicitly.
                pair = ' vs '.join(map(str, a[:2]))
                print(f'TMalign {pair}: {e!r}')
    finally:
        # Release the file handle held by the index once the batch is done.
        alignments.close()


# Submit one _process task per test domain. Each task covers every other
# domain ('px' node) under the query's 'sf' ascendant that is also present in
# the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ProcessPoolExecutor() as executor:
    futures = []
    query_by_future = {}
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job tuple: (query, domain sid, placeholder ignored by _process,
        # PDB-style directory, per-query output directory).
        args = [(query, sid, None, f'../data/train/pdbstyle-1.75/{sid[2:4]}', f'../data/evaluation/tmalign/{query}')
                for sid in domains]
        future = executor.submit(_process, args)
        futures.append(future)
        query_by_future[future] = query
    # Author's timing note for this cell: 15min.
    for future in tqdm(as_completed(futures), total=len(futures)):
        # Merely iterating over completed futures discards their outcome, so a
        # task that died (e.g. broken pool, pickling error) would go unnoticed.
        error = future.exception()
        if error is not None:
            print(f'{query_by_future[future]}: worker task failed: {error!r}', file=sys.stderr)

100%|██████████| 35/35 [15:29<00:00, 155.78s/it]


In [1]:
from pathlib import Path
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed

from tqdm import tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import SWModel


def _process(args):
    """Build Smith-Waterman models for one batch of jobs.

    Each item of ``args`` is indexed positionally: items 0, 1, 3 and 4 are
    forwarded to ``SWModel.generate_protein_model``. Item 2 is ignored; the
    structural-alignment index opened here is passed in its place.

    Any exception raised for a job propagates to the caller and aborts the
    rest of the batch.
    """
    aligns = SeqIO.index('../data/train/scop40_structural_alignment.fasta', 'fasta')
    try:
        for a in args:
            # NOTE(review): -11 / -1 are presumably the gap-open and
            # gap-extension penalties (the driver writes to 'sw_open11_ext1')
            # -- confirm against SWModel.
            SWModel(-11, -1).generate_protein_model(a[0], a[1], aligns, a[3], a[4])
    finally:
        # Release the file handle held by the index even when a job fails.
        aligns.close()


# Submit one _process task per test domain. Each task covers every other
# domain ('px' node) under the query's 'sf' ascendant that is also present in
# the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ProcessPoolExecutor() as executor:
    futures = []
    query_by_future = {}
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job tuple: (query, domain sid, placeholder ignored by _process,
        # PDB-style directory, per-query output directory).
        args = [(query, sid, None, f'../data/train/pdbstyle-1.75/{sid[2:4]}',
                 f'../data/evaluation/sw_open11_ext1/{query}') for sid in domains]
        future = executor.submit(_process, args)
        futures.append(future)
        query_by_future[future] = query
    for future in tqdm(as_completed(futures), total=len(futures)):
        # _process lets exceptions propagate, and they end up stored on the
        # future; without this check a failed query would pass silently.
        error = future.exception()
        if error is not None:
            print(f'{query_by_future[future]}: worker task failed: {error!r}', file=sys.stderr)

  0%|          | 0/35 [00:00<?, ?it/s]



  3%|▎         | 1/35 [00:05<03:15,  5.75s/it]

  6%|▌         | 2/35 [00:11<03:13,  5.87s/it]

  9%|▊         | 3/35 [00:13<02:23,  4.47s/it]

 14%|█▍        | 5/35 [00:13<01:37,  3.23s/it]

 17%|█▋        | 6/35 [00:14<01:07,  2.33s/it]

 20%|██        | 7/35 [00:15<00:57,  2.04s/it]

 23%|██▎       | 8/35 [00:18<01:06,  2.47s/it]

 26%|██▌       | 9/35 [00:20<00:55,  2.12s/it]

 29%|██▊       | 10/35 [00:21<00:44,  1.77s/it]

 31%|███▏      | 11/35 [00:29<01:28,  3.67s/it]

 34%|███▍      | 12/35 [00:34<01:38,  4.29s/it]

 37%|███▋      | 13/35 [00:35<01:08,  3.10s/it]

 40%|████      | 14/35 [00:43<01:34,  4.51s/it]

 43%|████▎     | 15/35 [00:43<01:06,  3.33s/it]

 46%|████▌     | 16/35 [00:49<01:17,  4.09s/it]

 49%|████▊     | 17/35 [00:50<00:54,  3.02s/it]

 51%|█████▏    | 18/35 [00:54<00:57,  3.41s/it]



 54%|█████▍    | 19/35 [01:01<01:13,  4.57s/it]

 57%|█████▋    | 20/35 [01:21<02:16,  9.09s/it]



 60%|██████    | 21/35 [01:29<02:03,  8.81s/it]

 63%|██████▎   | 22/35 [01:47<02:31, 11.64s/it]

 66%|██████▌   | 23/35 [01:48<01:42,  8.53s/it]

 69%|██████▊   | 24/35 [01:51<01:13,  6.68s/it]



 71%|███████▏  | 25/35 [02:05<01:30,  9.03s/it]

 74%|███████▍  | 26/35 [02:12<01:14,  8.29s/it]



 77%|███████▋  | 27/35 [02:41<01:56, 14.61s/it]

 80%|████████  | 28/35 [04:47<05:34, 47.82s/it]



 83%|████████▎ | 29/35 [05:01<03:46, 37.69s/it]

 86%|████████▌ | 30/35 [06:03<03:45, 45.19s/it]

 89%|████████▊ | 31/35 [06:10<02:15, 33.79s/it]

 91%|█████████▏| 32/35 [06:46<01:43, 34.38s/it]

 94%|█████████▍| 33/35 [07:38<01:19, 39.68s/it]

 97%|█████████▋| 34/35 [08:18<00:39, 39.76s/it]

100%|██████████| 35/35 [14:58<00:00, 147.78s/it]




## {PSI,DELTA}-BLAST

In [2]:
from pathlib import Path
import sys

import numpy as np
from tqdm import tqdm_notebook as tqdm
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


# For every test domain, run a psiblast pairwise alignment against each
# domain ('px' node) under the query's 'sf' ascendant that is present in the
# SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    px_nodes = scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
    # NOTE(review): unlike the model-building cells, the query itself is not
    # filtered out of this list -- confirm that is intended.
    domains = []
    for node in px_nodes:
        if node.sid in scop40:
            domains.append(node.sid)
    for domain in domains:
        # A new BLASTModel is constructed for each pair.
        aligner = BLASTModel('psiblast', '/data/DB/blastdb')
        # Author's timing note for this cell: 40 min.
        aligner.generate_pairwise_alignment(query, domain, '../data/evaluation/psiblast', '../data/train/pssm')

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys

import numpy as np
from tqdm import tqdm_notebook as tqdm
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


# For every test domain, run a deltablast pairwise alignment against each
# domain ('px' node) under the query's 'sf' ascendant that is present in the
# SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    px_nodes = scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
    # NOTE(review): unlike the model-building cells, the query itself is not
    # filtered out of this list -- confirm that is intended.
    domains = []
    for node in px_nodes:
        if node.sid in scop40:
            domains.append(node.sid)
    for domain in domains:
        # A new BLASTModel is constructed for each pair.
        aligner = BLASTModel('deltablast', '/data/DB/blastdb')
        # Author's timing note for this cell: 9min.
        aligner.generate_pairwise_alignment(query, domain, '../data/evaluation/deltablast', '../data/train/pssm')

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


def _process(args):
    for a in args:
        BLASTModel('psiblast', '/data/DB/blastdb').generate_protein_model(a[0], a[1], a[2], a[3], a[4])


# Submit one _process task per test domain to a thread pool. Each task covers
# every other domain ('px' node) under the query's 'sf' ascendant that is also
# present in the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    query_by_future = {}
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job tuple: (query, domain sid, alignment XML path,
        # per-query evaluation directory, PDB-style directory).
        args = [(query, sid,
                 f'../data/evaluation/psiblast/{query}/{sid}.xml',
                 f'../data/evaluation/psiblast/{query}',
                 f'../data/train/pdbstyle-1.75/{sid[2:4]}') for sid in domains]
        future = executor.submit(_process, args)
        futures.append(future)
        query_by_future[future] = query
    # Author's timing note for this cell: 7 min.
    for future in tqdm(as_completed(futures), total=len(futures)):
        # _process lets exceptions propagate, and they end up stored on the
        # future; without this check a failed query would pass silently.
        error = future.exception()
        if error is not None:
            print(f'{query_by_future[future]}: worker task failed: {error!r}', file=sys.stderr)



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






In [2]:
from pathlib import Path
import sys
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import BLASTModel


def _process(args):
    for a in args:
        BLASTModel('deltablast', '/data/DB/blastdb').generate_protein_model(a[0], a[1], a[2], a[3], a[4])


# Submit one _process task per test domain to a thread pool. Each task covers
# every other domain ('px' node) under the query's 'sf' ascendant that is also
# present in the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    query_by_future = {}
    for query in test_domain:
        domains = [px.sid for px in scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
                   if px.sid in scop40 and px.sid != query]
        # Job tuple: (query, domain sid, alignment XML path,
        # per-query evaluation directory, PDB-style directory).
        args = [(query, sid,
                 f'../data/evaluation/deltablast/{query}/{sid}.xml',
                 f'../data/evaluation/deltablast/{query}',
                 f'../data/train/pdbstyle-1.75/{sid[2:4]}') for sid in domains]
        future = executor.submit(_process, args)
        futures.append(future)
        query_by_future[future] = query
    # Author's timing note for this cell: 7 min.
    for future in tqdm(as_completed(futures), total=len(futures)):
        # _process lets exceptions propagate, and they end up stored on the
        # future; without this check a failed query would pass silently.
        error = future.exception()
        if error is not None:
            print(f'{query_by_future[future]}: worker task failed: {error!r}', file=sys.stderr)

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))






## HHsearch

In [1]:
from pathlib import Path
import sys

import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import HHSearchModel


# For every test domain, run an HHsearch pairwise alignment against each
# other domain ('px' node) under the query's 'sf' ascendant that is present
# in the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    px_nodes = scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
    domains = []
    for node in px_nodes:
        if node.sid in scop40 and node.sid != query:
            domains.append(node.sid)
    for domain in domains:
        # A new HHSearchModel is constructed, and the query record is looked
        # up in the index again, for each pair.
        searcher = HHSearchModel('/data/DB/hhsuitedb')
        # Author's timing note for this cell: 2 min or 12 hours.
        searcher.generate_pairwise_alignment(query, domain, scop40[query], f'../data/evaluation/hhsearch/{query}')

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [1]:
from pathlib import Path
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop
from Bio import SeqIO

sys.path.append(Path().resolve().parent.as_posix())
from machina.generate_models import HHSearchModel


def _process(args):
    for a in args:
        HHSearchModel('/data/DB/hhsuitedb').generate_protein_model(a[0], a[1], a[2], a[3])


# For every test domain, build HHsearch-based models sequentially against
# each other domain ('px' node) under the query's 'sf' ascendant that is
# present in the SCOP40 sequence file.
scop_root = Scop(dir_path='../data/train', version='1.75')
test_domain = np.load('../data/test_domain_ids.npy')
scop40 = SeqIO.index('../data/train/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa', 'fasta')
for query in tqdm(test_domain):
    px_nodes = scop_root.getDomainBySid(query).getAscendent('sf').getDescendents('px')
    domains = [node.sid for node in px_nodes if node.sid in scop40 and node.sid != query]
    # Job tuple: (query, domain sid, per-query evaluation directory,
    # PDB-style directory).
    args = []
    for sid in domains:
        args.append((query, sid, f'../data/evaluation/hhsearch/{query}', f'../data/train/pdbstyle-1.75/{sid[2:4]}'))
    # Author's timing note for this cell: 9 min.
    _process(args)

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))


