In [1]:
from IPython.display import display
from Bio import SeqIO


# Tally the records of the FASTA file by the first three dot-separated
# components of the second whitespace-separated field of each header
# (stored in `sccs`; presumably the SCOP classification string).
count = {}
for record in SeqIO.parse('data/hh_scop40.fasta', 'fasta'):
    sccs = record.description.split()[1]
    sf = '.'.join(sccs.split('.')[:3])
    count[sf] = count.get(sf, 0) + 1
count = list(count.items())

# Keep the 30 most populated groups. The sort is stable, so groups with
# equal counts stay in the order in which they first appear in the file.
test_data = sorted(count, key=lambda pair: pair[1], reverse=True)[:30]
display(test_data)

# Downstream code only needs the group identifiers, not the counts.
test_data = [group_id for group_id, _size in test_data]

[('d.58.7', 74),
 ('g.37.1', 59),
 ('a.4.1', 57),
 ('a.4.5', 55),
 ('b.1.2', 53),
 ('g.39.1', 47),
 ('b.1.1', 46),
 ('c.47.1', 44),
 ('b.36.1', 44),
 ('c.37.1', 37),
 ('d.15.1', 37),
 ('b.34.2', 35),
 ('b.40.4', 30),
 ('b.55.1', 28),
 ('c.94.1', 27),
 ('a.5.2', 24),
 ('c.66.1', 23),
 ('g.44.1', 23),
 ('c.108.1', 23),
 ('c.23.1', 22),
 ('a.39.1', 22),
 ('c.2.1', 20),
 ('d.51.1', 20),
 ('d.17.4', 18),
 ('c.93.1', 18),
 ('b.1.18', 17),
 ('g.3.6', 16),
 ('c.1.8', 16),
 ('d.108.1', 16),
 ('g.50.1', 15)]

In [2]:
import os
from time import sleep
from pathlib import Path

import docker
from tqdm import tqdm_notebook as tqdm
from Bio import SeqIO
import numpy as np


def get_hh_scop40_hie():
    """Group the records of ``data/hh_scop40.fasta`` by classification prefix.

    The grouping key is built from the second whitespace-separated field of
    each record's description: its first three dot-separated components,
    re-joined with dots (presumably the SCOP superfamily id).

    Returns:
        dict: maps each key to the list of records carrying it. Keys appear
        in order of first occurrence and each list keeps file order.

    Raises:
        IndexError: if a description has fewer than two whitespace-separated
        fields.
    """
    grouped = {}
    for rec in SeqIO.parse('data/hh_scop40.fasta', 'fasta'):
        header_fields = rec.description.split()
        key = '.'.join(header_fields[1].split('.')[:3])
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(rec)
    return grouped

client = docker.from_env()
hh_scop40_hie = get_hh_scop40_hie()


def run_hhsuite(command):
    """Run one command in a throwaway ``makisyu/hhsuite`` container.

    The container runs as the calling user's uid:gid, with the current
    working directory mounted read-write at ``/WORK`` and
    ``/data/DB/hhsuitedb`` mounted read-only at ``/DB``. It is removed once
    it has finished.

    Args:
        command (str): command line executed inside the container. The
            callers below pass paths relative to the current directory, so
            this presumably relies on the image's working directory being
            ``/WORK`` -- TODO confirm against the image.

    Raises:
        docker.errors.ContainerError: if the command exits with a non-zero
        status (behaviour of ``containers.run`` when not detached).
    """
    client.containers.run(
        'makisyu/hhsuite',
        command,
        remove=True, user=f'{os.getuid()}:{os.getgid()}',
        volumes={os.getcwd(): {'bind': '/WORK', 'mode': 'rw'}, '/data/DB/hhsuitedb': {'bind': '/DB', 'mode': 'ro'}}
    )
    # Short pause kept from the original code after every container run;
    # the reason is not visible here (possibly to let container removal
    # settle) -- NOTE(review): confirm whether it is still needed.
    sleep(1)


# SeqIO.write does not create missing directories, so make sure the
# per-domain working directory exists before the first write.
Path('data/hh_scop40').mkdir(parents=True, exist_ok=True)

# Stage 1: build an .a3m alignment for every domain of the selected groups.
# Domains that already have an .a3m on disk are skipped, so this loop can be
# re-run after an interruption without redoing finished work.
for sf_sccs in tqdm(test_data):
    for domain in hh_scop40_hie[sf_sccs]:
        if Path(f'data/hh_scop40/{domain.id}.a3m').exists():
            continue
        if not Path(f'data/hh_scop40/{domain.id}.fasta').exists():
            SeqIO.write(domain, f'data/hh_scop40/{domain.id}.fasta', 'fasta')
        run_hhsuite(
            f'hhblits -i data/hh_scop40/{domain.id}.fasta -d /DB/uniclust30_2018_08 -oa3m data/hh_scop40/{domain.id}.a3m -cpu {os.cpu_count()} -o /dev/null -n 3'
        )

# Stage 2: search every alignment against /DB/scop40 and collect the raw
# report text as results[group id][domain id].
hhr_path = Path('tmp.hhr')
results = {}
for sf_sccs in tqdm(test_data):
    r = {}
    for domain in hh_scop40_hie[sf_sccs]:
        # Every search writes to the same scratch file. Remove any leftover
        # report first so that a run which produces no output fails loudly
        # in read_text() instead of silently storing the previous domain's
        # report under this domain's id.
        if hhr_path.exists():
            hhr_path.unlink()
        run_hhsuite(
            f'hhsearch -p 0 -i data/hh_scop40/{domain.id}.a3m -d /DB/scop40 -cpu {os.cpu_count()} -o tmp.hhr'
        )
        r[domain.id] = hhr_path.read_text()
    results[sf_sccs] = r

# np.array(dict) yields a 0-d object array wrapping the dict; read it back
# with np.load(path, allow_pickle=True).item().
np.save('data/hhsearch_hh_scop40.npy', np.array(results))

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




HBox(children=(IntProgress(value=0, max=30), HTML(value='')))


