## Benchmark for decoding MMTF

In [5]:
from pyspark.sql import SparkSession
from mmtfPyspark.filters import ContainsGroup
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.utils import DsspSecondaryStructure
from mmtfPyspark.utils import MmtfStructure

from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter

import gzip
import pandas as pd
import os
import time

## Setup the benchmark
Set the path to the MMTF Hadoop Sequence file. Here we retrieve it from the environment variable `MMTF_FULL`.

In [6]:
# Location of the full MMTF Hadoop Sequence File; this value is handed to
# every benchmark function in the notebook.
path = mmtfReader.get_mmtf_full_path()

Hadoop Sequence file path: MMTF_FULL=/Users/peter/MMTF_Files/full


Specify the list of core counts to benchmark. Each benchmark is run once per entry, using that many local Spark cores.

In [7]:
# Core counts to benchmark; each value N is run with the Spark master "local[N]".
cores = [4]

In [8]:
# Create the results directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

## MmtfChain decoding

In [29]:
def mmtf_chain(path, num_cores):
    """Read an MMTF Hadoop Sequence File and wrap every record in MmtfStructure.

    Each record value is gunzipped, unpacked with ``pd.read_msgpack`` and
    passed to ``MmtfStructure``; ``len(group_numbers)`` is then evaluated for
    every structure (presumably to force decoding of that field -- confirm
    against MmtfStructure).

    NOTE(review): chain extraction (``s.get_chains()``) was commented out in
    the original cell, so despite its name this function currently works on
    whole structures only.

    :param path: location of the sequence file, passed to ``sc.sequenceFile``
    :param num_cores: number of local cores, used as N in the ``local[N]`` master
    :return: number of records in the resulting RDD
    """
    spark = SparkSession.builder.master("local[{}]".format(num_cores)).appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] is the record value
        unpack = data.map(lambda d: pd.read_msgpack(d))
        structure = unpack.map(lambda u: MmtfStructure(u))
        acount = structure.map(lambda c: len(c.group_numbers))
        return acount.count()
    finally:
        # Always stop the session: a leaked session would be reused by the next
        # getOrCreate() call and keep its old "local[N]" setting.
        spark.stop()

In [30]:
# Time the MmtfStructure decoding once per configured core count.
# Rows are collected in a list and converted to a DataFrame in one step:
# DataFrame.append is deprecated (removed in pandas 2.0), copies the frame on
# every call, and leaves 'count' as float / 'cores' as object.
mmtf_chain_rows = []

for num_cores in cores:
    start = time.perf_counter()  # monotonic clock, immune to system clock changes
    count = mmtf_chain(path, num_cores)
    elapsed = time.perf_counter() - start
    print('mmtf_chain, cores:', num_cores, 'time:', elapsed, 'seconds')
    mmtf_chain_rows.append({'cores': num_cores, 'count': count, 'mmtf_chain': elapsed})

# Same (alphabetical) column order the original append(..., sort=True) produced.
df_mmtf_chain = pd.DataFrame(mmtf_chain_rows, columns=['cores', 'count', 'mmtf_chain'])

mmtf_chain, cores: 4 time: 396.53967809677124 seconds


In [31]:
# Persist the timings to the results directory, as the unzip / unpack
# benchmarks in this notebook do, then display them.
df_mmtf_chain.to_csv(os.path.join(results_dir, 'mmtf_chain.csv'), index=False)
df_mmtf_chain

Unnamed: 0,cores,count,mmtf_chain
0,4,140825.0,396.539678


In [1]:
import os
# import msgpack
import gzip
from mmtfPyspark.utils import MmtfStructure
from mmtfPyspark.utils import MmtfChain
from mmtf.api import default_api
# NOTE(review): this import rebinds the notebook-level name `path` (assigned
# from mmtfReader.get_mmtf_full_path() in the setup cell) to the os.path
# module. Benchmark cells that run after this one pass `path` to
# sc.sequenceFile and would receive the module instead -- rename one of the two.
from os import path, walk
from pyspark.sql import SparkSession
import urllib
import urllib.request as urllib2
import pandas as pd
import numpy as np

In [2]:
# Download a single MMTF entry, decode it and print summary attributes of
# chain 'A'.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'MmtfStructure' object has no attribute 'modelToGroupsIndices'";
# that error originates inside MmtfStructure and is not addressed here.
pdbId = "1J6T"
#pdbId = "1OHR"
url = default_api.get_url(pdbId, False)
request = urllib2.Request(url)
request.add_header('Accept-encoding', 'gzip')
# The context manager closes the HTTP connection even if reading fails.
with urllib2.urlopen(request) as response:
    payload = response.read()
    is_gzipped = response.info().get('Content-Encoding') == 'gzip'
# Only decompress when the server actually honoured the gzip request.
data = gzip.decompress(payload) if is_gzipped else payload
unpack = pd.read_msgpack(data)
structure = MmtfStructure(unpack)
chain = structure.get_chain('A')
print(chain.start, chain.end)
print(chain.chain_name)
print('atoms:', chain.num_atoms)
print('groups:', chain.num_groups)
print('chains:', chain.num_chains)
print('models:', chain.num_models)
# Label fixed: this line prints the elements, not the model count.
print('elements:', chain.elements)


AttributeError: 'MmtfStructure' object has no attribute 'modelToGroupsIndices'

## Read and Unzip Data in MMTF Hadoop Sequence File
This benchmark reads the MMTF Hadoop Sequence File and unzips the values.

In [None]:
def unzip(path, num_cores):
    """Read an MMTF Hadoop Sequence File and gunzip every record value.

    :param path: location of the sequence file, passed to ``sc.sequenceFile``
    :param num_cores: number of local cores, used as N in the ``local[N]`` master
    :return: number of records in the resulting RDD
    """
    spark = SparkSession.builder.master("local[{}]".format(num_cores)).appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] is the record value
        return data.count()
    finally:
        # Always stop the session: a leaked session would be reused by the next
        # getOrCreate() call and keep its old "local[N]" setting.
        spark.stop()

In [None]:
# Time the read + gunzip benchmark once per configured core count.
# Rows are collected in a list and converted to a DataFrame in one step:
# DataFrame.append is deprecated (removed in pandas 2.0), copies the frame on
# every call, and leaves 'count' as float / 'cores' as object.
unzip_rows = []

for num_cores in cores:
    start = time.perf_counter()  # monotonic clock, immune to system clock changes
    count = unzip(path, num_cores)
    elapsed = time.perf_counter() - start
    print('unzip, cores:', num_cores, 'time:', elapsed, 'seconds')
    unzip_rows.append({'cores': num_cores, 'count': count, 'unzip': elapsed})

# Same (alphabetical) column order the original append(..., sort=True) produced.
df_unzip = pd.DataFrame(unzip_rows, columns=['cores', 'count', 'unzip'])

In [None]:
# Write the unzip timings to the results directory (without the index column),
# then show the table as the cell output.
df_unzip.to_csv(
    os.path.join(results_dir, 'unzip.csv'),
    index=False,
)
df_unzip

## Unpack Data
This benchmark reads an MMTF Hadoop Sequence File, unzips the data, and decodes the data using the Pandas library.

In [None]:
def unpack_pd(path, num_cores):
    """Read an MMTF Hadoop Sequence File, gunzip and unpack it with pandas.

    Every record value is decompressed and then decoded with
    ``pd.read_msgpack``.

    :param path: location of the sequence file, passed to ``sc.sequenceFile``
    :param num_cores: number of local cores, used as N in the ``local[N]`` master
    :return: number of records in the resulting RDD
    """
    spark = SparkSession.builder.master("local[{}]".format(num_cores)).appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] is the record value
        unpack = data.map(lambda d: pd.read_msgpack(d))
        return unpack.count()
    finally:
        # Always stop the session: a leaked session would be reused by the next
        # getOrCreate() call and keep its old "local[N]" setting.
        spark.stop()

In [None]:
# Time the read + gunzip + pandas-unpack benchmark once per configured core count.
# Rows are collected in a list and converted to a DataFrame in one step:
# DataFrame.append is deprecated (removed in pandas 2.0), copies the frame on
# every call, and leaves 'count' as float / 'cores' as object.
unpack_pd_rows = []

for num_cores in cores:
    start = time.perf_counter()  # monotonic clock, immune to system clock changes
    count = unpack_pd(path, num_cores)
    elapsed = time.perf_counter() - start
    print('unpack_pd, cores:', num_cores, 'time:', elapsed, 'seconds')
    unpack_pd_rows.append({'cores': num_cores, 'count': count, 'unpack_pd': elapsed})

# Same (alphabetical) column order the original append(..., sort=True) produced.
df_unpack_pd = pd.DataFrame(unpack_pd_rows, columns=['cores', 'count', 'unpack_pd'])

In [None]:
# Write the pandas-unpack timings to the results directory (without the index
# column), then show the table as the cell output.
df_unpack_pd.to_csv(
    os.path.join(results_dir, 'unpack_pd.csv'),
    index=False,
)
df_unpack_pd

## Unpack Data using MsgPack
This benchmark reads an MMTF Hadoop Sequence File, unzips the data, and decodes the data using the msgpack library.

In [None]:
import msgpack

def unpack_msgpack(path, num_cores):
    """Read an MMTF Hadoop Sequence File, gunzip and unpack it with msgpack.

    Every record value is decompressed and then decoded with
    ``msgpack.unpackb(..., raw=False)``.

    :param path: location of the sequence file, passed to ``sc.sequenceFile``
    :param num_cores: number of local cores, used as N in the ``local[N]`` master
    :return: number of records in the resulting RDD
    """
    spark = SparkSession.builder.master("local[{}]".format(num_cores)).appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] is the record value
        unpack = data.map(lambda d: msgpack.unpackb(d, raw=False))
        return unpack.count()
    finally:
        # Always stop the session: a leaked session would be reused by the next
        # getOrCreate() call and keep its old "local[N]" setting.
        spark.stop()

In [None]:
# Time the read + gunzip + msgpack-unpack benchmark once per configured core count.
# Rows are collected in a list and converted to a DataFrame in one step:
# DataFrame.append is deprecated (removed in pandas 2.0), copies the frame on
# every call, and leaves 'count' as float / 'cores' as object.
unpack_msgpack_rows = []

for num_cores in cores:
    start = time.perf_counter()  # monotonic clock, immune to system clock changes
    count = unpack_msgpack(path, num_cores)
    elapsed = time.perf_counter() - start
    print('unpack_msgpack, cores:', num_cores, 'time:', elapsed, 'seconds')
    unpack_msgpack_rows.append({'cores': num_cores, 'count': count, 'unpack_msgpack': elapsed})

# Same (alphabetical) column order the original append(..., sort=True) produced.
df_unpack_msgpack = pd.DataFrame(unpack_msgpack_rows, columns=['cores', 'count', 'unpack_msgpack'])

In [None]:
# Write the msgpack-unpack timings to the results directory (without the index
# column), then show the table as the cell output.
df_unpack_msgpack.to_csv(
    os.path.join(results_dir, 'unpack_msgpack.csv'),
    index=False,
)
df_unpack_msgpack