In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# needed to call snakemake from jupyter notebook
import nest_asyncio
nest_asyncio.apply()

In [3]:
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

from metagraph_workflows import cli
from metagraph_workflows import workflow_configs

## Building Index

In [4]:
# Scratch directory under which this notebook keeps its downloaded inputs and the workflow output
temp_dir = Path('/tmp/metagraph_workflows')
temp_dir.mkdir(parents=True, exist_ok=True)  # no error if the directory already exists

In [5]:
# Text file that will list the input sequence files, one path per line;
# it is written further down and then handed to the build workflow
sample_list_path = Path('/tmp/paths.txt')

In [6]:
# Download can be unreliable at times, so every file is attempted several times.
# If it keeps failing, try changing the protocol between "http" and "ftp".

protocol = "ftp"
max_download_attempts = 3
sample_download = [
    f"{protocol}://ftp.sra.ebi.ac.uk/vol1/fastq/SRR512/000/SRR5122830/SRR5122830_subreads.fastq.gz",
    f"{protocol}://ftp.sra.ebi.ac.uk/vol1/fastq/SRR512/006/SRR5122826/SRR5122826_subreads.fastq.gz"
]

input_seqs = temp_dir/'input_seqs'
input_seqs.mkdir(parents=True, exist_ok=True)

# Transfers are staged outside input_seqs and only moved there once complete, so an
# interrupted download can never be mistaken for a finished input file.
download_staging = temp_dir/'download_staging'
download_staging.mkdir(parents=True, exist_ok=True)

for url in sample_download:
    file_name = Path(urllib.parse.urlparse(url).path).name
    target = input_seqs/file_name
    if target.exists():
        continue  # already fetched by an earlier run

    staged = download_staging/file_name
    for attempt in range(1, max_download_attempts + 1):
        try:
            urllib.request.urlretrieve(url, staged)
        except OSError:
            # urllib's URLError and ContentTooShortError both derive from OSError
            if staged.exists():
                staged.unlink()  # drop the truncated file before retrying
            if attempt == max_download_attempts:
                raise  # out of attempts: surface the last error
        else:
            staged.replace(target)  # same file system, so this is a plain rename
            break

In [7]:
def create_sample_list_path(path_list, out_path):
    """Write the entries of ``path_list`` to the text file ``out_path``.

    Each entry is converted with ``str``. Entries are separated by a single
    newline; no newline is written after the last entry. An existing file at
    ``out_path`` is overwritten.
    """
    with open(out_path, "w") as list_file:
        entries = map(str, path_list)
        list_file.write('\n'.join(entries))

# Sorted so that the sample list is identical on every run and machine
# (directory globbing yields entries in file-system order).
create_sample_list_path(sorted(input_seqs.glob('*fastq*')), sample_list_path)

In [8]:
# Directory handed to the build workflow below as the place for its results
output_dir =  temp_dir /'output_dir'

In [9]:
# Check that the metagraph executable is available and which version it reports
# (only the first lines of the help text are shown)
!metagraph --help 2>&1 | head

#############################
### Welcome to MetaGraph! ###
#############################

Metagraph: comprehensive metagenome graph representation -- Version 0.1

Usage: metagraph <command> [command specific options]

Available commands:
	build		construct a graph object from input sequence


In [10]:
# Build the graph and its annotations from the files listed in sample_list_path,
# writing everything to output_dir. The trailing semicolon keeps the call's
# return value out of the cell output.
cli.run_build_workflow(
    output_dir,
    sample_list_path,
    k=5,
    threads=2,
    force=True,
    verbose=False,
    build_primary_graph=True,
    annotation_labels_source=workflow_configs.AnnotationLabelsSource.SEQUENCE_HEADERS,
);

Building DAG of jobs...
Using shell: /usr/local/bin/bash
Provided cores: 1 (use --cores to define parallelism)
Rules claiming more threads will be scaled down.
Job counts:
	count	jobs
	1	all
	1	annotate
	1	annotate_row_diff_brwt
	1	build_joint_graph
	1	build_joint_primary
	1	generate_column_list
	1	primarize_joint_graph
	1	relax_row_diff_brwt
	1	transform_rd_stage0
	1	transform_rd_stage1
	1	transform_rd_stage2
	11
	count	jobs
	1	all
	1	annotate
	1	annotate_row_diff_brwt
	1	build_joint_graph
	1	build_joint_primary
	1	generate_column_list
	1	primarize_joint_graph
	1	relax_row_diff_brwt
	1	transform_rd_stage0
	1	transform_rd_stage1
	1	transform_rd_stage2
	11

2021-10-19 14:37:27,489 - INFO: 
[Tue Oct 19 14:37:27 2021]
2021-10-19 14:37:27,494 - INFO: [Tue Oct 19 14:37:27 2021]
rule build_joint_graph:
    input: /tmp/paths.txt
    output: /tmp/metagraph_workflows/output_dir/graph_canonical.dbg
    log: /tmp/metagraph_workflows/output_dir/logs/build_joint_graph.log
    jobid: 3
    resources

2021-10-19 14:37:30,800 - INFO: rule transform_rd_stage1:
    input: /tmp/metagraph_workflows/output_dir/graph.dbg, /tmp/metagraph_workflows/output_dir/columns.txt, /tmp/metagraph_workflows/output_dir/rd_cols/vector.row_count
    output: /tmp/metagraph_workflows/output_dir/graph.dbg.pred, /tmp/metagraph_workflows/output_dir/graph.dbg.pred_boundary, /tmp/metagraph_workflows/output_dir/graph.dbg.rd_succ, /tmp/metagraph_workflows/output_dir/graph.dbg.succ, /tmp/metagraph_workflows/output_dir/graph.dbg.succ_boundary, /tmp/metagraph_workflows/output_dir/rd_cols/vectors.row_reduction
    log: /tmp/metagraph_workflows/output_dir/logs/transform_rd_stage1.log
    jobid: 9
    resources: mem_mb=4048

2021-10-19 14:37:30,809 - INFO: 
[Tue Oct 19 14:37:38 2021]
2021-10-19 14:37:38,427 - INFO: [Tue Oct 19 14:37:38 2021]
Finished job 9.
2021-10-19 14:37:38,430 - INFO: Finished job 9.
7 of 11 steps (64%) done
2021-10-19 14:37:38,431 - INFO: 7 of 11 steps (64%) done

2021-10-19 14:37:38,435 - INFO: 
[

## Querying Index

In [11]:
# List what the build workflow left in the output directory
!ls {output_dir}

[34mcolumns[m[m                             graph.dbg.succ
columns.txt                         graph.dbg.succ_boundary
graph.dbg                           graph.relax.row_diff_brwt.annodbg
graph.dbg.anchors                   graph.row_diff_brwt.annodbg
graph.dbg.pred                      graph.row_diff_brwt.annodbg.linkage
graph.dbg.pred_boundary             [34mlogs[m[m
graph.dbg.rd_succ                   [34mrd_cols[m[m


In [12]:
# Graph and annotation files from the build output; both are passed to the query server below
graph_path = output_dir / 'graph.dbg'
annotations = output_dir / 'graph.relax.row_diff_brwt.annodbg'

In [13]:
# Start a metagraph query server as a background process of this kernel.
import asyncio
import atexit

# The executable is launched directly instead of through a shell command line:
# the paths need no quoting, and `process` is the server itself rather than a
# shell wrapper, so killing it actually stops the server.
server_start = asyncio.create_subprocess_exec(
    "metagraph", "server_query",
    "-i", str(graph_path),
    "-a", str(annotations),
)
process = asyncio.run(server_start)

def kill_server():
    """Stop the server started by this cell; does nothing if it is already gone."""
    try:
        process.kill()
    except ProcessLookupError:
        # The server exited on its own; nothing is left to clean up.
        pass

atexit.register(kill_server) # shutting down server, when jupyter kernel exits

<function __main__.kill_server()>

In [14]:
# Python client for the metagraph server API; installation is described at
# https://metagraph.ethz.ch/static/docs/api.html#install-api
from metagraph import client
# Connect to the locally started server.
# NOTE(review): assumes the server listens on port 5555 — confirm against the server's default.
cl = client.GraphClient('localhost', '5555')

In [15]:
# DNA sequence used as the query below (presumably taken from one of the indexed reads — TODO confirm)
example_seq = 'ACCACCCAAGACTGTAATTGTTCCATCTACACAGGACATATCACAGGACACAGAATGGCTTGGGACATGATGATGAATTGGAGCCCCACCGCGACGCTGGTCCTCGCCCAACT'

In [16]:
# Query the index with the example sequence; we should find it back.
# The result is shown as a table with a k-mer count and a sample label per row.
cl.search([example_seq])

Unnamed: 0,kmer_count,sample,seq_description
0,109,SRR5122826.1,0
1,109,SRR5122826.2,0
2,109,SRR5122826.3,0
3,109,SRR5122826.4,0
4,109,SRR5122826.5,0
...,...,...,...
95,109,SRR5122826.108,0
96,109,SRR5122826.109,0
97,109,SRR5122826.111,0
98,109,SRR5122826.112,0
