In [None]:
from pathlib import Path
import pickle
import random
from concurrent.futures import ProcessPoolExecutor, wait

from Bio.SCOP import Scop
import numpy as np

from machina.predict import predict_by_kmknc, predict_by_flann


def process():
    """Run the FLANN-based prediction over all (query, candidate) pairs.

    For every query listed in the SCOP40 test set, the query's 'sf'
    ascendant is looked up in the SCOP tree and paired with each entry
    stored in the pickled hierarchy under that ascendant's sunid. The
    resulting pair list is shuffled in place and passed, together with the
    training arrays, to ``predict_by_flann``.
    """
    scop = Scop(dir_path=Path('data/scop'), version='1.75')
    queries = np.load('data/test/scop40.npy')
    with Path('data/train/scop40_hie.pkl').open('rb') as handle:
        hierarchy = pickle.load(handle)

    # One (query, candidate) tuple per entry filed under the query's 'sf' sunid.
    pairs = []
    for query in queries:
        sf_sunid = scop.getDomainBySid(query).getAscendent('sf').sunid
        for candidate in hierarchy[sf_sunid]:
            pairs.append((query, candidate))
    random.shuffle(pairs)

    train_x = np.load('data/train/scop40_logscore_tmscore50_w5_randomsampling_ratio0.1_x.npy')
    train_y = np.load('data/train/scop40_logscore_tmscore50_w5_randomsampling_ratio0.1_y.npy')
    # Original author's note on this call: > 23GB
    predict_by_flann(train_x, train_y, 200, 'flann19_scop40_logscore_tmscore50_w5_randomsampling_ratio0.1_nn1000', 1000, pairs)


# Launch `workers` identical copies of process(), one per pool process.
workers = 4
with ProcessPoolExecutor(max_workers=workers) as executor:
    futures = [executor.submit(process) for _ in range(workers)]
    wait(futures)
    # wait() only blocks; it never raises. Calling result() re-raises any
    # exception from a worker (including BrokenProcessPool if a worker
    # process was killed), so failures are no longer silently discarded.
    for future in futures:
        future.result()


In [10]:
from pathlib import Path
from multiprocessing import Pool
import pickle
import random

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop

from machina.generate_alignment import generate_alignment

# Generate alignments for every test query against the entries filed under
# the query's SCOP 'sf' ascendant, caching one .npy result file per query.
test_data = np.load('data/test/scop40.npy')
scop_root = Scop(dir_path=Path('data/scop'), version='1.75')
# NOTE: pickle.load can execute arbitrary code; only load trusted files here.
with Path('data/train/scop40_hie.pkl').open('rb') as f:
    hie = pickle.load(f)
model_name = 'flann19_scop40_logscore_tmscore50_w5_randomsampling_ratio0.1_nn1000'
GAP_OPEN = -0.1
GAP_EXTEND = -0.01

# The output directory does not depend on the query, so create it once.
result_dir = Path('results')/model_name
result_dir.mkdir(exist_ok=True, parents=True)

# A single worker pool is reused for all queries instead of spawning and
# tearing down a fresh set of processes on every loop iteration.
with Pool() as pool:
    for query in test_data:
        result_file = result_dir/f'{query}_open{-GAP_OPEN}_extend{-GAP_EXTEND}.npy'
        if result_file.exists():
            # Already computed in an earlier run; skip.
            continue
        query_sf_sunid = scop_root.getDomainBySid(query).getAscendent('sf').sunid
        paths = [Path(f'data/prediction/{model_name}/{query}/{domain}.npy') for domain in hie[query_sf_sunid]]
        result = pool.starmap(generate_alignment, ((path, GAP_OPEN, GAP_EXTEND) for path in paths))
        np.save(result_file, np.array(result))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [None]:
from pathlib import Path
import pickle
import os
from concurrent.futures import ThreadPoolExecutor,wait

from tqdm import tqdm_notebook as tqdm
import numpy as np
from Bio.SCOP import Scop

from machina.generate_models import MachinaModel

# Run configuration: prediction model identifier and the gap penalties that
# are embedded (negated) in the result file and directory names below.
model_name = 'flann19_scop40_logscore_tmscore50_w5_randomsampling_ratio0.1_nn1000'
GAP_OPEN = -0.1
GAP_EXTEND = -0.01

# Inputs: test-set queries, the pickled hierarchy (indexed below by the sunid
# of a query's 'sf' ascendant), and the SCOP 1.75 tree used for that lookup.
test_data = np.load('data/test/scop40.npy')
with Path('data/train/scop40_hie.pkl').open('rb') as f:
    hie = pickle.load(f)
scop_root = Scop(dir_path=Path('data/scop'), version='1.75')

def _process(args):
    for a in args:
        MachinaModel().generate_protein_model(a[0], a[1], a[2], a[3])


# Submit one _process job list per test query to a thread pool. Each job
# carries the query, one entry from the query's 'sf' group, the query's saved
# alignment array, and the output directory for that query.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = []
    for query in test_data:
        query_sf = scop_root.getDomainBySid(query).getAscendent('sf').sunid
        aligns_list = np.load(f'results/{model_name}/{query}_open{-GAP_OPEN}_extend{-GAP_EXTEND}.npy')
        args = [(query, domain, aligns_list, f'results/{model_name}/open{-GAP_OPEN}extend{-GAP_EXTEND}/{query}') for domain in hie[query_sf]]
        futures.append(executor.submit(_process, args))
    wait(futures)
    # wait() never raises; result() re-raises any exception thrown inside
    # _process so failed queries are reported instead of silently dropped.
    for future in futures:
        future.result()