## Benchmark for Reading and Datamining PDB Structures with mmtf-pyspark

In [None]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionExtractor, InteractionFilter

import pandas as pd
import os
import time

## Setup the benchmark
Set the path to the MMTF Hadoop Sequence File. Here the path is taken from the value of the environment variable `MMTF_FULL`.

In [None]:
# Location of the full MMTF Hadoop Sequence File used by every benchmark below.
path = mmtfReader.get_mmtf_full_path()

Specify the list of core counts to benchmark. Each benchmark is run once per entry.

In [None]:
# Core counts to benchmark; each value is used to build the Spark master
# string "local[N]" in the benchmark functions.
cores = [4]

In [None]:
# Create the directory that receives the benchmark CSV files.
# exist_ok=True makes this a single race-free call and keeps the cell
# safe to re-run when the directory is already there.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

## Read Benchmark
This benchmark times reading a 1% fraction (`fraction=0.01`) of an MMTF Hadoop Sequence File and counting the structures read.

In [None]:
def read(path, num_core):
    """Count the structures read from an MMTF Hadoop Sequence File.

    Args:
        path: Location of the MMTF Hadoop Sequence File.
        num_core: Number of local cores for the Spark master ("local[N]").

    Returns:
        The number of structures in the 1% fraction that was read.
    """
    # Use the parameter itself; the original read the notebook-global
    # loop variable `num_cores` and silently ignored this argument.
    spark = SparkSession.builder.master("local[" + str(num_core) + "]").appName("Read").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path, fraction=0.01)
        return structures.count()
    finally:
        # Stop the session even on failure, so a later getOrCreate() does
        # not reuse a session configured with a different core count.
        spark.stop()

In [None]:
# Time the read benchmark once per configured core count.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame each call.
read_rows = []

for num_cores in cores:
    start = time.time()
    count = read(path, num_cores)
    end = time.time()
    print('read, cores:', num_cores, 'time:', end-start, 'seconds')
    read_rows.append({'cores': num_cores, 'read': end - start, 'count': count})

# Column order matches the alphabetically sorted order that the former
# append(..., sort=True) produced, so the CSV layout is unchanged.
df_read = pd.DataFrame(read_rows, columns=['cores', 'count', 'read'])

In [None]:
# Persist the read timings as CSV (without the index column).
read_csv_file = os.path.join(results_dir, 'read.csv')
df_read.to_csv(read_csv_file, index=False)

In [None]:
# Show the read benchmark results as the cell output.
df_read

## Interactions Benchmark
This benchmark finds all zinc interactions in PDB structures. Structures with multiple models (e.g., NMR structures) are excluded.

In [None]:
def interactions(path, num_core):
    """Count zinc ligand-polymer interactions in a sample of PDB structures.

    Reads a 5% fraction of the MMTF Hadoop Sequence File, keeps only
    single-model structures, and counts the interactions that pass the
    filter configured below (query element Zn, 3.0 distance cutoff).

    Args:
        path: Location of the MMTF Hadoop Sequence File.
        num_core: Number of local cores for the Spark master ("local[N]").

    Returns:
        The number of interactions found.
    """
    # Use the parameter itself; the original read the notebook-global
    # loop variable `num_cores` and silently ignored this argument.
    spark = SparkSession.builder.master("local[" + str(num_core) + "]").appName("Interactions").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path, fraction=0.05)
        # Drop multi-model entries (e.g., NMR structures).
        structures = structures.filter(lambda s: s[1].num_models == 1)

        interaction_filter = InteractionFilter()
        # NOTE(review): the leading boolean presumably selects include (True)
        # vs. exclude (False) for the listed elements; confirm in the
        # InteractionFilter documentation.
        interaction_filter.set_target_elements(False, ['C','H','P'])
        interaction_filter.set_query_elements(True, ['Zn'])
        interaction_filter.set_distance_cutoff(3.0)

        # Named differently from the function to avoid shadowing it locally.
        zinc_interactions = InteractionExtractor().get_ligand_polymer_interactions(structures, interaction_filter)
        return zinc_interactions.count()
    finally:
        # Stop the session even on failure, so a later getOrCreate() does
        # not reuse a session configured with a different core count.
        spark.stop()

In [None]:
# Time the zinc-interactions benchmark once per configured core count.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame each call.
interaction_rows = []

for num_cores in cores:
    start = time.time()
    count = interactions(path, num_cores)
    end = time.time()
    print('interactions, cores:', num_cores, 'time:', end-start, 'seconds')
    interaction_rows.append({'cores': num_cores, 'interactions': end - start, 'count': count})

# Column order matches the alphabetically sorted order that the former
# append(..., sort=True) produced, so the CSV layout is unchanged.
df_interactions = pd.DataFrame(interaction_rows, columns=['cores', 'count', 'interactions'])

In [None]:
# Persist the interaction timings as CSV (without the index column).
interactions_csv_file = os.path.join(results_dir, 'interactions.csv')
df_interactions.to_csv(interactions_csv_file, index=False)

In [None]:
# Show the interactions benchmark results as the cell output.
df_interactions

## Saltbridges Benchmark
This benchmark finds salt bridges in protein structures. Structures with multiple models (e.g., NMR structures) are excluded.

In [None]:
def saltbridges(path, num_core):
    """Count salt bridges in a sample of PDB structures.

    Reads a 5% fraction of the MMTF Hadoop Sequence File, keeps only
    single-model structures, and counts polymer interactions between the
    ASP/GLU query atoms and the ARG/LYS/HIS target atoms listed below,
    using a 3.5 distance cutoff.

    Args:
        path: Location of the MMTF Hadoop Sequence File.
        num_core: Number of local cores for the Spark master ("local[N]").

    Returns:
        The number of interactions found.
    """
    # Use the parameter itself; the original read the notebook-global
    # loop variable `num_cores` and silently ignored this argument.
    spark = SparkSession.builder.master("local[" + str(num_core) + "]").appName("Saltbridges").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path, fraction=0.05)
        # Drop multi-model entries (e.g., NMR structures).
        structures = structures.filter(lambda s: s[1].num_models == 1)

        salt_bridge = InteractionFilter(distanceCutoff=3.5)
        # Query side: carboxylate oxygens of ASP and GLU.
        salt_bridge.set_query_groups(True, ['ASP', 'GLU'])
        salt_bridge.set_query_atom_names(True, ['OD1', 'OD2', 'OE1', 'OE2'])
        # Target side: side-chain nitrogens of ARG, LYS and HIS.
        salt_bridge.set_target_groups(True, ['ARG', 'LYS', 'HIS'])
        salt_bridge.set_target_atom_names(True, ['NH1', 'NH2', 'NZ', 'ND1', 'NE2'])

        # Call through an instance, as the zinc-interactions benchmark does;
        # this works whether the method is a static or an instance method.
        bridges = InteractionExtractor().get_polymer_interactions(structures, salt_bridge)
        return bridges.count()
    finally:
        # Stop the session even on failure, so a later getOrCreate() does
        # not reuse a session configured with a different core count.
        spark.stop()

In [None]:
# Time the salt-bridge benchmark once per configured core count.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame each call.
saltbridge_rows = []

for num_cores in cores:
    start = time.time()
    count = saltbridges(path, num_cores)
    end = time.time()
    print('saltbridges, cores:', num_cores, 'time:', end-start, 'seconds')
    saltbridge_rows.append({'cores': num_cores, 'saltbridges': end - start, 'count': count})

# Column order matches the alphabetically sorted order that the former
# append(..., sort=True) produced, so the CSV layout is unchanged.
df_saltbridges = pd.DataFrame(saltbridge_rows, columns=['cores', 'count', 'saltbridges'])

In [None]:
# Persist the salt-bridge timings as CSV (without the index column),
# then show the table as the cell output.
saltbridges_csv_file = os.path.join(results_dir, 'saltbridges.csv')
df_saltbridges.to_csv(saltbridges_csv_file, index=False)
df_saltbridges