## Benchmark for Reading and Datamining PDB Structures with mmtf-pyspark

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ContainsGroup
from mmtfPyspark.utils import ColumnarStructure
from mmtfPyspark.interactions import InteractionExtractorPd

import gzip
import pandas as pd
import numpy as np
import os
import time

## Set up the benchmark
Set the path to the MMTF Hadoop Sequence file. Here the path is read from the environment variable `MMTF_FULL`.

In [2]:
# Location of the full MMTF Hadoop Sequence file; the helper also prints the
# resolved MMTF_FULL value (see the cell output below).
path = mmtfReader.get_mmtf_full_path()

Hadoop Sequence file path: MMTF_FULL=/Users/peter/MMTF_Files/full


Specify the list of core counts to benchmark. Each benchmark below is run once for every entry in this list.

In [3]:
# Core counts to benchmark; each value is passed as num_cores to the
# benchmark functions, which use it to build the "local[N]" Spark master.
cores = [4]

In [4]:
# Create the results directory (and any missing parents).
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

## Interactions Benchmark
This benchmark finds all zinc interactions in PDB structures. Structures with multiple models, e.g., NMR structures, are excluded.

In [5]:
def interactions(path, num_cores):
    """Count zinc interactions in single-model structures.

    Starts a Spark session on a "local[num_cores]" master, reads the MMTF
    Hadoop Sequence file at ``path``, keeps only structures with exactly one
    model that contain a 'ZN' group, and counts the interactions found
    between zinc atoms and polymer atoms whose element is not C, H or P.

    Args:
        path: location of the MMTF Hadoop Sequence file.
        num_cores: number of cores used for the local Spark master.

    Returns:
        The value of ``count()`` on the extracted interactions.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Interactions").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        # Drop multi-model entries (e.g., NMR structures), then keep only
        # structures that contain a zinc group.
        structures = structures.filter(lambda s: s[1].num_models == 1)
        structures = structures.filter(ContainsGroup('ZN'))

        distance_cutoff = 3.0  # presumably in Angstrom -- TODO confirm
        query = "element == 'Zn'"
        target = "polymer and (element not in ['C','H','P'])"

        # Named differently from the enclosing function to avoid shadowing it.
        zinc_interactions = InteractionExtractorPd.get_interactions(structures, distance_cutoff, query, target, bio=None)
        return zinc_interactions.count()
    finally:
        # Always stop the session, even when a step above fails: a session
        # left running would be picked up by the next getOrCreate() call and
        # the following run would not use the requested number of cores.
        spark.stop()

In [6]:
# Time the zinc-interaction benchmark once per configured core count.
# Rows are gathered in a plain list and converted to a DataFrame in one step:
# DataFrame.append was removed in pandas 2.0, and appending rows to an empty
# frame turned the integer interaction count into a float.
interaction_records = []

for num_cores in cores:
    start = time.time()
    count = interactions(path, num_cores)
    end = time.time()
    print('interactions_pd, cores:', num_cores, 'time:', end-start, 'seconds')
    interaction_records.append({'cores': num_cores, 'count': count, 'interactions_pd': end-start})

# Columns are listed alphabetically, which is the layout of the saved CSV.
df_interactions = pd.DataFrame(interaction_records, columns=['cores', 'count', 'interactions_pd'])

interactions_pd, cores: 4 time: 689.0490472316742 seconds


In [7]:
# Write the zinc-interaction timings into the results directory,
# leaving out the DataFrame index column.
df_interactions.to_csv(
    path_or_buf=os.path.join(results_dir, 'interactions_pd.csv'),
    index=False,
)

In [8]:
# Show the timing table (cores, interaction count, elapsed seconds) as the cell output.
df_interactions

Unnamed: 0,cores,count,interactions_pd
0,4,127196.0,689.049047


## Saltbridges Benchmark
This benchmark finds salt bridges in protein structures. Structures with multiple models, e.g., NMR structures, are excluded.

In [15]:
def saltbridges(path, num_cores):
    """Count salt bridges in single-model structures.

    Starts a Spark session on a "local[num_cores]" master, reads the MMTF
    Hadoop Sequence file at ``path``, keeps only structures with exactly one
    model, and counts the interactions found between the side-chain oxygen
    atoms OD1/OD2/OE1/OE2 of ASP/GLU and the side-chain nitrogen atoms
    NH1/NH2/NZ/ND1/NE2 of ARG/LYS/HIS.

    Args:
        path: location of the MMTF Hadoop Sequence file.
        num_cores: number of cores used for the local Spark master.

    Returns:
        The value of ``count()`` on the extracted interactions.
    """
    spark = SparkSession.builder.master("local[" + str(num_cores) + "]").appName("Saltbridges").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        # Drop multi-model entries (e.g., NMR structures).
        structures = structures.filter(lambda s: s[1].num_models == 1)

        distance_cutoff = 3.5  # presumably in Angstrom -- TODO confirm
        query = "polymer and (group_name in ['ASP', 'GLU']) and (atom_name in ['OD1', 'OD2', 'OE1', 'OE2'])"
        target = "polymer and (group_name in ['ARG', 'LYS', 'HIS']) and (atom_name in ['NH1', 'NH2', 'NZ', 'ND1', 'NE2'])"

        salt_bridges = InteractionExtractorPd.get_interactions(structures, distance_cutoff, query, target, bio=None)
        return salt_bridges.count()
    finally:
        # Always stop the session, even when a step above fails: a session
        # left running would be picked up by the next getOrCreate() call and
        # the following run would not use the requested number of cores.
        spark.stop()

In [16]:
# Time the salt-bridge benchmark once per configured core count.
# Rows are gathered in a plain list and converted to a DataFrame in one step:
# DataFrame.append was removed in pandas 2.0, and appending rows to an empty
# frame turned the integer salt-bridge count into a float.
saltbridge_records = []

for num_cores in cores:
    start = time.time()
    count = saltbridges(path, num_cores)
    end = time.time()
    print('saltbridges_pd, cores:', num_cores, 'time:', end-start, 'seconds')
    saltbridge_records.append({'cores': num_cores, 'count': count, 'saltbridges_pd': end-start})

# Columns are listed alphabetically, which is the layout of the saved CSV.
df_saltbridges = pd.DataFrame(saltbridge_records, columns=['cores', 'count', 'saltbridges_pd'])

saltbridges_pd, cores: 4 time: 6268.662695884705 seconds


In [17]:
# Write the salt-bridge timings into the results directory (without the
# DataFrame index column), then show the table as the cell output.
df_saltbridges.to_csv(
    path_or_buf=os.path.join(results_dir, 'saltbridges_pd.csv'),
    index=False,
)
df_saltbridges

Unnamed: 0,cores,count,saltbridges_pd
0,4,523868.0,6268.662696
