In [20]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [21]:
n_processes = 18 # number of processes to run on each node
memory = 184000 # memory requested, you probably want a big memory node, so leave as is

cluster = SLURMCluster(
    project='deepgreen',
    walltime='180', # 2 hours for workers, you can make this more if you need it
    job_mem=str(memory),
    job_cpu=36,
    interface='ib0',
    local_directory='/tmp/scratch/dask-worker-space',
    cores=18,
    processes=n_processes,
    memory='{}MB'.format(memory)
)

print(cluster.job_script())

#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -A deepgreen
#SBATCH -n 1
#SBATCH --cpus-per-task=36
#SBATCH --mem=184000
#SBATCH -t 180

/projects/rlmolecule/pstjohn/envs/tf2/bin/python -m distributed.cli.dask_worker tcp://10.148.8.87:44148 --nthreads 1 --nprocs 18 --memory-limit 10.22GB --name name --nanny --death-timeout 60 --local-directory /tmp/scratch/dask-worker-space --interface ib0



Perhaps you already have a cluster running?
Hosting the HTTP server on port 42307 instead
  http_address["port"], self.http_server.port


In [44]:
dask_client = Client(cluster)

In [46]:
n_nodes = 5 # set this to the number of nodes you would like to start as workers
cluster.scale(n_processes * n_nodes)

In [24]:
import pandas as pd
import dask.dataframe as dd

In [52]:
swissprot_data = pd.read_parquet('/projects/bpms/pstjohn/swissprot/parsed_swissprot.parquet')

In [53]:
uniref100 = dd.read_parquet('/scratch/pstjohn/uniparc/uniref100/*', engine='pyarrow')

In [54]:
uniref100 = dask_client.persist(uniref100)

In [56]:
merged = uniref100.drop(['NCBI taxonomy', 'length'], 1).merge(swissprot_data, how='right', left_on='Sequence', right_on='sequence')

In [57]:
merged = dask_client.persist(merged)

In [33]:
dask_client

0,1
Client  Scheduler: tcp://10.148.8.87:44148  Dashboard: http://10.148.8.87:42307/status,Cluster  Workers: 88  Cores: 88  Memory: 899.36 GB


In [60]:
merged_df = merged.compute()

In [62]:
swissprot_data

Unnamed: 0,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,Q6GZX4,AY548484,YP_031579.1,vg:2947773,IPR007031,PF04947,654924,256,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,
1,Q6GZX3,AY548484,YP_031580.1,vg:2947774,IPR004251,PF03003,654924,320,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,Host membrane
2,Q197F8,DQ643392,YP_654574.1,vg:4156251,,,345201,458,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,
3,Q197F7,DQ643392,YP_654575.1,vg:4156252,,,345201,156,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,
4,Q6GZX2,AY548484,YP_031581.1,vg:2947775,,,654924,438,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,
...,...,...,...,...,...,...,...,...,...,...
561563,Q9QXY1,AF157006,,,IPR005420,PF07653,10090,905,MEELTIWEQHTATLYKDPRRGFGIAVSGGHDRASGSVVVSDVVPGS...,Cell membrane\nCell junction\nNucleus
561564,P18750,,,,IPR013087,PF00096,8355,224,SNEKLFSCSVCGKCFALKTELTIHCRSHSGEKAFHCTECGKYFQHR...,Nucleus
561565,P18749,,,,IPR013087,PF00096,8355,453,TRLDGFICSKCGETFTVNSHLLTHLCGKHERIYSREKLYSCTECRR...,Nucleus
561566,P18751,M25866,,,IPR013087,PF00096,8355,898,MGMWEEASDTGMKGKKKDKNEEEEERGKKERMVNLTLEMIYLLTGE...,Nucleus


Unnamed: 0,UniRef100 ID,UniRef90 ID,UniRef50 ID,accession,EMBL,RefSeq,KEGG,InterPro,Pfam,NCBI Taxonomy,length,sequence,subcellularLocalization
0,UniRef100_Q9Q8J2,UniRef90_Q9Q8J2,UniRef50_P16712,Q9Q8J2,AF170726,NP_051822.1,vg:932054,IPR027417,PF04851,31530,478,MSVCSEIDYALYTELKKFLNSQPLFLFNADKNFVEVVPSSSFKFYI...,Virion
1,UniRef100_P14197,UniRef90_P14197,UniRef50_P14197,P14197,X16524,XP_643326.1,ddi:DDB_G0276031,IPR036322,PF00400,44689,478,MGSRLNPSSNMYIPMNGPRGGYYGMPSMGQLQHPLFNYQFPPGGFQ...,
2,UniRef100_A6VUT8,UniRef90_A6VUT8,UniRef50_Q65UI5,A6VUT8,CP000749,WP_012069002.1,mmw:Mmwyl1_1288,IPR011763,PF03255,400668,315,MNLDYLPFEQPIAELEQKIEELRLVGNDNELNISDEISRLEDKKIA...,Cytoplasm
3,UniRef100_A4QKB4,UniRef90_P56765,UniRef50_P56765,A4QKB4,AP009370,YP_001123295.1,,IPR011762,PF01039,50458,487,MEKSWFNLMFSKGELEYRGELSKAMDSFAPSEKTTISQDRFIYDMD...,Plastid
4,UniRef100_Q9SQR4,UniRef90_Q9SQR4,UniRef50_Q9SQR4,Q9SQR4,CP002686,NP_187048.1,ath:AT3G03980,IPR002347,,3702,270,MSTHSSISQPPLPLAGRVAIVTGSSRGIGRAIAIHLAELGARIVIN...,Plastid
...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,UniRef100_B5E231,UniRef90_Q97S73,UniRef50_Q97S73,C1CQ17,CP000921,WP_000046031.1,snt:SPT_0549,IPR009012,PF01025,487213,174,MAQDIKNEEVEEVQEEEVVETAEETTPEKSELDLANERADEFENKY...,Cytoplasm
280,UniRef100_B5E231,UniRef90_Q97S73,UniRef50_Q97S73,B5E231,CP001015,WP_000046031.1,spx:SPG_0467,IPR009012,PF01025,512566,174,MAQDIKNEEVEEVQEEEVVETAEETTPEKSELDLANERADEFENKY...,Cytoplasm
281,UniRef100_B5E231,UniRef90_Q97S73,UniRef50_Q97S73,C1C5N6,CP000918,WP_000046031.1,snm:SP70585_0572,IPR009012,PF01025,488221,174,MAQDIKNEEVEEVQEEEVVETAEETTPEKSELDLANERADEFENKY...,Cytoplasm
282,UniRef100_B5E231,UniRef90_Q97S73,UniRef50_Q97S73,B1IA51,CP000936,WP_000046031.1,spv:SPH_0623,IPR009012,PF01025,487214,174,MAQDIKNEEVEEVQEEEVVETAEETTPEKSELDLANERADEFENKY...,Cytoplasm


In [64]:
merged_df.drop('Sequence', 1).to_parquet('/projects/bpms/pstjohn/swissprot/parsed_swissprot_uniref_clusters.parquet')

In [66]:
cluster.close()

tornado.application - ERROR - Exception in callback <bound method Client._heartbeat of <Client: 'tcp://10.148.8.87:44148' processes=88 threads=88, memory=899.36 GB>>
Traceback (most recent call last):
  File "/projects/rlmolecule/pstjohn/envs/tf2/lib/python3.7/site-packages/tornado/ioloop.py", line 907, in _run
    return self.callback()
  File "/projects/rlmolecule/pstjohn/envs/tf2/lib/python3.7/site-packages/distributed/client.py", line 1165, in _heartbeat
    self.scheduler_comm.send({"op": "heartbeat-client"})
  File "/projects/rlmolecule/pstjohn/envs/tf2/lib/python3.7/site-packages/distributed/batched.py", line 117, in send
    raise CommClosedError
distributed.comm.core.CommClosedError
tornado.application - ERROR - Exception in callback <bound method Client._heartbeat of <Client: 'tcp://10.148.8.87:44148' processes=88 threads=88, memory=899.36 GB>>
Traceback (most recent call last):
  File "/projects/rlmolecule/pstjohn/envs/tf2/lib/python3.7/site-packages/tornado/ioloop.py", line