## Benchmark for Reading and Datamining PDB Structures with mmtf-pyspark

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.filters import ContainsGroup
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter

import gzip
import pandas as pd
import os
import time

## Set up the benchmark
Set the path to the MMTF Hadoop Sequence file. Here we retrieve the value of the environment variable MMTF_FULL.

In [2]:
# Location of the MMTF Hadoop Sequence file used by every benchmark below
# (the cell output reports the MMTF_FULL setting it was resolved from).
path = mmtfReader.get_mmtf_full_path()

Hadoop Sequence file path: MMTF_FULL=/Users/peter/MMTF_Files/full


Specify a list with the number of cores

In [3]:
# Core counts to benchmark: each benchmark loop below runs once per entry,
# using the value to build the Spark master URL "local[<n>]".
cores = [4]

In [4]:
# create results directory; exist_ok=True makes this cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

## Read Benchmark
Benchmarks reading an MMTF Hadoop Sequence File

In [5]:
def read(path, num_cores):
    """Count the structures read from an MMTF Hadoop Sequence file.

    Parameters
    ----------
    path : str
        Location handed to ``mmtfReader.read_sequence_file``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the structures returned by the reader.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Read").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        return structures.count()
    finally:
        # Stop the session even when the read fails; a leaked session would be
        # picked up by getOrCreate() in the next benchmark with this core count.
        spark.stop()

In [6]:
# Time the read benchmark once per core count. Rows are collected in a list and
# the DataFrame is built once: DataFrame.append was removed in pandas 2.0 and
# also coerced the integer count to float.
read_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = read(path, num_cores)
    end = time.perf_counter()
    print('read, cores:', num_cores, 'time:', end-start, 'seconds')
    read_records.append({'cores': num_cores, 'count': count, 'read': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_read = pd.DataFrame(read_records, columns=['cores', 'count', 'read'])

read, cores: 4 time: 229.9848439693451 seconds


In [7]:
# Write the read timings to read.csv in the results directory, without the row index.
read_csv = os.path.join(results_dir, 'read.csv')
df_read.to_csv(read_csv, index=False)

In [8]:
# Display the collected read timings as the cell output.
df_read

Unnamed: 0,cores,count,read
0,4,140825.0,229.984844


## Interactions Benchmark
This benchmark finds all zinc interactions in PDB structures. Structures with multiple models, e.g., NMR structures, are excluded.

In [22]:
def interactions(path, num_cores):
    """Count zinc interactions in single-model structures of an MMTF Hadoop Sequence file.

    Parameters
    ----------
    path : str
        Location handed to ``mmtfReader.read_sequence_file``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the extracted ligand-polymer interactions.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Interactions").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        # keep single-model structures only, then those containing a ZN group
        structures = structures.filter(lambda s: s[1].num_models == 1)
        structures = structures.filter(ContainsGroup('ZN'))

        # NOTE(review): the leading boolean presumably selects include (True) vs.
        # exclude (False) for the listed elements - confirm against InteractionFilter.
        interaction_filter = InteractionFilter()
        interaction_filter.set_target_elements(False, ['C','H','P'])
        interaction_filter.set_query_elements(True, ['Zn'])
        interaction_filter.set_distance_cutoff(3.0)

        # named so the local does not shadow this function's own name
        zinc_interactions = InteractionExtractor().get_ligand_polymer_interactions(structures, interaction_filter)
        return zinc_interactions.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [23]:
# Time the zinc-interaction benchmark once per core count. Rows are collected in
# a list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and also coerced the integer count to float.
interactions_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = interactions(path, num_cores)
    end = time.perf_counter()
    print('interactions, cores:', num_cores, 'time:', end-start, 'seconds')
    interactions_records.append({'cores': num_cores, 'count': count, 'interactions': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_interactions = pd.DataFrame(interactions_records, columns=['cores', 'count', 'interactions'])

interactions, cores: 4 time: 311.6189091205597 seconds


In [24]:
# Write the interaction timings to interactions.csv in the results directory, without the row index.
interactions_csv = os.path.join(results_dir, 'interactions.csv')
df_interactions.to_csv(interactions_csv, index=False)

In [25]:
# Display the collected interaction timings as the cell output.
df_interactions

Unnamed: 0,cores,count,interactions
0,4,127196.0,311.618909


## Saltbridges Benchmark
This benchmark finds salt bridges in protein structures. Structures with multiple models, e.g., NMR structures, are excluded.

In [26]:
def saltbridges(path, num_cores):
    """Count salt-bridge interactions in single-model structures of an MMTF Hadoop Sequence file.

    Parameters
    ----------
    path : str
        Location handed to ``mmtfReader.read_sequence_file``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the extracted polymer interactions.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Saltbridges").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        # keep single-model structures only
        structures = structures.filter(lambda s: s[1].num_models == 1)

        # query side: ASP/GLU carboxylate oxygens; target side: ARG/LYS/HIS nitrogens
        salt_bridge = InteractionFilter(distanceCutoff=3.5)
        salt_bridge.set_query_groups(True, ['ASP', 'GLU'])
        salt_bridge.set_query_atom_names(True, ['OD1', 'OD2', 'OE1', 'OE2'])
        salt_bridge.set_target_groups(True, ['ARG', 'LYS', 'HIS'])
        salt_bridge.set_target_atom_names(True, ['NH1', 'NH2', 'NZ', 'ND1', 'NE2'])

        # named so the local does not shadow the notebook's interactions() function
        bridge_interactions = InteractionExtractor.get_polymer_interactions(structures, salt_bridge)
        return bridge_interactions.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [27]:
# Time the salt-bridge benchmark once per core count. Rows are collected in a
# list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and also coerced the integer count to float.
saltbridges_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = saltbridges(path, num_cores)
    end = time.perf_counter()
    print('saltbridges, cores:', num_cores, 'time:', end-start, 'seconds')
    saltbridges_records.append({'cores': num_cores, 'count': count, 'saltbridges': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_saltbridges = pd.DataFrame(saltbridges_records, columns=['cores', 'count', 'saltbridges'])

saltbridges, cores: 4 time: 1050.5137100219727 seconds


In [28]:
# Write the salt-bridge timings to saltbridges.csv (no row index), then display them.
saltbridges_csv = os.path.join(results_dir, 'saltbridges.csv')
df_saltbridges.to_csv(saltbridges_csv, index=False)
df_saltbridges

Unnamed: 0,cores,count,saltbridges
0,4,523868.0,1050.51371


## Read MMTF Hadoop Sequence File
This benchmark reads the raw MMTF Hadoop Sequence file.

In [16]:
def read_hadoop(path, num_cores):
    """Count the raw key/value records of a Hadoop Sequence file without decoding them.

    Parameters
    ----------
    path : str
        Location handed to ``SparkContext.sequenceFile``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the sequence-file RDD.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        # Hadoop Writable classes for the keys and values of the sequence file
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)
        return rdd.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [17]:
# Time the raw sequence-file read once per core count. Rows are collected in a
# list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and also coerced the integer count to float.
read_hadoop_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = read_hadoop(path, num_cores)
    end = time.perf_counter()
    print('read_hadoop, cores:', num_cores, 'time:', end-start, 'seconds')
    read_hadoop_records.append({'cores': num_cores, 'count': count, 'read_hadoop': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_read_hadoop = pd.DataFrame(read_hadoop_records, columns=['cores', 'count', 'read_hadoop'])

read_hadoop, cores: 4 time: 70.44136500358582 seconds


In [18]:
# Write the raw-read timings to read_hadoop.csv (no row index), then display them.
read_hadoop_csv = os.path.join(results_dir, 'read_hadoop.csv')
df_read_hadoop.to_csv(read_hadoop_csv, index=False)
df_read_hadoop

Unnamed: 0,cores,count,read_hadoop
0,4,140825.0,70.441365


## Read and Unzip Data in MMTF Hadoop Sequence File
This benchmark reads the MMTF Hadoop Sequence File and unzips the values.

In [19]:
def unzip(path, num_cores):
    """Read a Hadoop Sequence file and gzip-decompress every value, returning the record count.

    Parameters
    ----------
    path : str
        Location handed to ``SparkContext.sequenceFile``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the RDD of decompressed values.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # returns key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] are the values in the rdd
        return data.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [20]:
# Time the read-and-unzip benchmark once per core count. Rows are collected in a
# list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and also coerced the integer count to float.
unzip_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = unzip(path, num_cores)
    end = time.perf_counter()
    print('unzip, cores:', num_cores, 'time:', end-start, 'seconds')
    unzip_records.append({'cores': num_cores, 'count': count, 'unzip': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_unzip = pd.DataFrame(unzip_records, columns=['cores', 'count', 'unzip'])

unzip, cores: 4 time: 110.27356004714966 seconds


In [21]:
# Write the unzip timings to unzip.csv (no row index), then display them.
unzip_csv = os.path.join(results_dir, 'unzip.csv')
df_unzip.to_csv(unzip_csv, index=False)
df_unzip

Unnamed: 0,cores,count,unzip
0,4,140825.0,110.27356


## Unpack Data
This benchmark reads an MMTF Hadoop Sequence File, unzips the data, and decodes the data using the Pandas library.

In [22]:
def unpack_pd(path, num_cores):
    """Read a Hadoop Sequence file, gzip-decompress each value and decode it with pandas.

    NOTE: ``pd.read_msgpack`` was deprecated in pandas 0.25 and removed in
    pandas 1.0, so this benchmark only runs on older pandas versions. The call
    is kept because decoding with pandas is what this benchmark measures.

    Parameters
    ----------
    path : str
        Location handed to ``SparkContext.sequenceFile``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the RDD of decoded records.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # returns key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] are the values in the rdd
        unpack = data.map(lambda d: pd.read_msgpack(d))
        return unpack.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [23]:
# Time the pandas-unpack benchmark once per core count. Rows are collected in a
# list and the DataFrame is built once instead of growing it with
# DataFrame.append in the loop, which also coerced the integer count to float.
unpack_pd_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = unpack_pd(path, num_cores)
    end = time.perf_counter()
    print('unpack_pd, cores:', num_cores, 'time:', end-start, 'seconds')
    unpack_pd_records.append({'cores': num_cores, 'count': count, 'unpack_pd': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_unpack_pd = pd.DataFrame(unpack_pd_records, columns=['cores', 'count', 'unpack_pd'])

unpack_pd, cores: 4 time: 131.6141209602356 seconds


In [24]:
# Write the pandas-unpack timings to unpack_pd.csv (no row index), then display them.
unpack_pd_csv = os.path.join(results_dir, 'unpack_pd.csv')
df_unpack_pd.to_csv(unpack_pd_csv, index=False)
df_unpack_pd

Unnamed: 0,cores,count,unpack_pd
0,4,140825.0,131.614121


## Unpack Data using MsgPack
This benchmark reads an MMTF Hadoop Sequence File, unzips the data, and decodes the data using the msgpack library.

In [25]:
import msgpack

def unpack_msgpack(path, num_cores):
    """Read a Hadoop Sequence file, gzip-decompress each value and decode it with msgpack.

    Parameters
    ----------
    path : str
        Location handed to ``SparkContext.sequenceFile``.
    num_cores : int
        Number of local cores; used to build the Spark master ``local[<num_cores>]``.

    Returns
    -------
    int
        The value of ``count()`` on the RDD of decoded records.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Read").getOrCreate()
    try:
        sc = spark.sparkContext
        text = "org.apache.hadoop.io.Text"
        byteWritable = "org.apache.hadoop.io.BytesWritable"
        rdd = sc.sequenceFile(path, text, byteWritable)  # returns key/value tuples
        data = rdd.map(lambda t: gzip.decompress(t[1]))  # t[1] are the values in the rdd
        unpack = data.map(lambda d: msgpack.unpackb(d, raw=False))
        return unpack.count()
    finally:
        # Stop the session even on failure so the next benchmark starts a fresh one.
        spark.stop()

In [26]:
# Time the msgpack-unpack benchmark once per core count. Rows are collected in a
# list and the DataFrame is built once: DataFrame.append was removed in
# pandas 2.0 and also coerced the integer count to float.
unpack_msgpack_records = []

for num_cores in cores:
    # perf_counter is monotonic, so the elapsed time is immune to system clock adjustments
    start = time.perf_counter()
    count = unpack_msgpack(path, num_cores)
    end = time.perf_counter()
    print('unpack_msgpack, cores:', num_cores, 'time:', end-start, 'seconds')
    unpack_msgpack_records.append({'cores': num_cores, 'count': count, 'unpack_msgpack': end-start})

# Explicit column order matches the previous (alphabetically sorted) layout.
df_unpack_msgpack = pd.DataFrame(unpack_msgpack_records, columns=['cores', 'count', 'unpack_msgpack'])

unpack_msgpack, cores: 4 time: 436.3425590991974 seconds


In [27]:
# Write the msgpack-unpack timings to unpack_msgpack.csv (no row index), then display them.
unpack_msgpack_csv = os.path.join(results_dir, 'unpack_msgpack.csv')
df_unpack_msgpack.to_csv(unpack_msgpack_csv, index=False)
df_unpack_msgpack

Unnamed: 0,cores,count,unpack_msgpack
0,4,140825.0,436.342559
