## Benchmark for Reading and Datamining PDB Structures with mmtf-pyspark

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ContainsGroup
from mmtfPyspark.utils import ColumnarStructure
from mmtfPyspark.interactions import InteractionExtractorPd

import gzip
import pandas as pd
import numpy as np
import os
import time

## Setup the benchmark
Set the path to the MMTF Hadoop Sequence file. Here we retrieve the value of the environment variable MMTF_FULL

In [None]:
# Location of the MMTF Hadoop sequence file, as resolved by mmtfReader.
path = mmtfReader.get_mmtf_full_path()

Specify a list with the number of cores

In [None]:
# Core counts to benchmark: each entry becomes N in a Spark "local[N]" master.
cores = [4]

In [None]:
# Create the results directory. exist_ok=True makes this a single,
# race-free call: it succeeds whether or not the directory already exists,
# instead of checking first and creating afterwards.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

## Structure Benchmark
This benchmark reads structures.

In [None]:
def structure(path, num_cores):
    """Count the structures read from an MMTF Hadoop sequence file.

    Args:
        path: Location of the MMTF Hadoop sequence file.
        num_cores: Number of local cores; used as N in the Spark
            ``local[N]`` master setting.

    Returns:
        The number of records in the RDD returned by
        ``mmtfReader.read_sequence_file(path, first_model=True)``.
    """
    master = "local[" + str(num_cores) + "]"
    spark = SparkSession.builder.master(master).appName("Benchmark3").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path, first_model=True)
        return structures.count()
    finally:
        # Stop the session even when the job fails: getOrCreate() returns an
        # already-running session, so a leaked one would be reused by the
        # next benchmark run regardless of the core count it asks for.
        spark.stop()

In [None]:
# Time the structure-reading benchmark once per configured core count.
rows = []

for num_cores in cores:
    # perf_counter() is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    count = structure(path, num_cores)
    end = time.perf_counter()
    print('structures, cores:', num_cores, 'time:', end-start, 'seconds')
    rows.append({'cores': num_cores, 'structures': end-start, 'count': count})

# Build the table in one step: DataFrame.append() was removed in pandas 2.0.
# Columns are listed alphabetically, matching the former append(..., sort=True).
df_s = pd.DataFrame(rows, columns=['cores', 'count', 'structures'])

In [None]:
# Save the timing table as structures.csv in the results directory (no index column).
df_s.to_csv(os.path.join(results_dir, 'structures.csv'), index=False)

In [None]:
# Display the timing table as the cell output.
df_s

In [2]:
# Start a local Spark session with 4 cores for interactive exploration.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Benchmark3")
    .getOrCreate()
)

# Fetch a single entry (4HHB) rather than reading the full sequence file at
# `path`; each RDD record is indexed as record[0] / record[1], and the
# structure object is taken from position 1 of the first record.
structures = mmtfReader.download_full_mmtf_files(['4HHB'])
dfs = structures.map(lambda record: record[1]).first()

In [3]:
import pandas as pd
# df = pd.DataFrame({'chain_name': dfs.chain_names,
#                                     'chain_id': dfs.chain_ids,
#                                     'group_number': dfs.group_numbers,
#                                     'group_name': dfs.group_names,
#                                     'atom_name': dfs.atom_names,
#                                     'altloc': dfs.alt_loc_list,
#                                     'x': dfs.x_coord_list,
#                                     'y': dfs.y_coord_list,
#                                     'z': dfs.z_coord_list,
#                                     'o': dfs.occupancy_list,
#                                     'b': dfs.b_factor_list,
#                                     'element': dfs.elements,
#                                     'polymer': dfs.polymer
#                                     })

In [None]:
# Single-column DataFrame built from dfs.z_coord_list (the full multi-column
# version is the commented-out constructor in the previous cell).
df = pd.DataFrame({'z': dfs.z_coord_list})

In [None]:
# Print the frame summary before any columns are converted to 'category'.
df.info()


In [None]:
# Convert these columns to the pandas 'category' dtype.
# Only columns actually present in df are converted: the frame may have been
# built with a subset of columns (e.g. only 'z'), and indexing a missing
# column would raise KeyError.
for column in ('chain_name', 'group_name', 'atom_name', 'altloc'):
    if column in df.columns:
        df[column] = df[column].astype('category')

In [None]:
# Print the frame summary again, after the 'category' conversion cell.
df.info()

In [None]:
# Display the DataFrame as the cell output.
df

## Structure To Pandas Benchmark
This benchmark reads structures and converts them to pandas dataframes.

In [None]:
def structure(path, num_cores):
    """Count structures after converting each one to a pandas DataFrame.

    Note: this redefines the ``structure`` function from the earlier
    "Structure Benchmark" cell; the name is kept so the calling cell works.

    Args:
        path: Location of the MMTF Hadoop sequence file.
        num_cores: Number of local cores; used as N in the Spark
            ``local[N]`` master setting.

    Returns:
        The number of records left after mapping ``to_pandas()`` over every
        structure read with ``first_model=True``.
    """
    master = "local[" + str(num_cores) + "]"
    spark = SparkSession.builder.master(master).appName("Benchmark3").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path, first_model=True)
        # count() is the action that forces the to_pandas() conversion to run.
        dfs = structures.map(lambda s: s[1].to_pandas())
        return dfs.count()
    finally:
        # Stop the session even when the job fails: getOrCreate() returns an
        # already-running session, so a leaked one would be reused by the
        # next benchmark run regardless of the core count it asks for.
        spark.stop()

In [None]:
# Time the structures-to-pandas benchmark once per configured core count.
rows = []

for num_cores in cores:
    # perf_counter() is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    count = structure(path, num_cores)
    end = time.perf_counter()
    # Label matches the benchmark/column name (it previously said 'structures').
    print('structures_to_pandas, cores:', num_cores, 'time:', end-start, 'seconds')
    rows.append({'cores': num_cores, 'structures_to_pandas': end-start, 'count': count})

# Build the table in one step: DataFrame.append() was removed in pandas 2.0.
# Columns are listed alphabetically, matching the former append(..., sort=True).
df_s = pd.DataFrame(rows, columns=['cores', 'count', 'structures_to_pandas'])

In [None]:
# Save the timing table as structures_to_pandas.csv in the results directory (no index column).
df_s.to_csv(os.path.join(results_dir, 'structures_to_pandas.csv'), index=False)

In [None]:
# Display the timing table as the cell output.
df_s

## Structure to Chain Benchmark
This benchmark reads structures and flatmaps them to polymer chains.

In [None]:
def structure_to_chains(path, num_cores):
    """Count the chains obtained by flat-mapping every structure to its chains.

    Args:
        path: Location of the MMTF Hadoop sequence file.
        num_cores: Number of local cores; used as N in the Spark
            ``local[N]`` master setting.

    Returns:
        The total number of items produced by calling ``get_chains()`` on
        every structure in the sequence file.
    """
    master = "local[" + str(num_cores) + "]"
    spark = SparkSession.builder.master(master).appName("Interactions").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        chains = structures.flatMap(lambda s: s[1].get_chains())
        return chains.count()
    finally:
        # Stop the session even when the job fails: getOrCreate() returns an
        # already-running session, so a leaked one would be reused by the
        # next benchmark run regardless of the core count it asks for.
        spark.stop()

In [None]:
# Time the structure-to-chains benchmark once per configured core count.
rows = []

for num_cores in cores:
    # perf_counter() is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    count = structure_to_chains(path, num_cores)
    end = time.perf_counter()
    print('structure_to_chains, cores:', num_cores, 'time:', end-start, 'seconds')
    rows.append({'cores': num_cores, 'structure_to_chains': end-start, 'count': count})

# Build the table in one step: DataFrame.append() was removed in pandas 2.0.
# Columns are listed alphabetically, matching the former append(..., sort=True).
df_s2c = pd.DataFrame(rows, columns=['cores', 'count', 'structure_to_chains'])

In [None]:
# Save the timing table as structure_to_chains.csv in the results directory (no index column).
df_s2c.to_csv(os.path.join(results_dir, 'structure_to_chains.csv'), index=False)

In [None]:
# Display the timing table as the cell output.
df_s2c

## Structure to Chain to pandas Benchmark
This benchmark reads structures, flatmaps them to polymer chains, and converts the chains to pandas dataframes.

In [None]:
def structure_to_chains(path, num_cores):
    """Count chains after converting each one to a pandas DataFrame.

    Note: this redefines ``structure_to_chains`` from the earlier
    "Structure to Chain Benchmark" cell; the name is kept so the calling
    cell works.

    Args:
        path: Location of the MMTF Hadoop sequence file.
        num_cores: Number of local cores; used as N in the Spark
            ``local[N]`` master setting.

    Returns:
        The number of records left after flat-mapping structures through
        ``get_chains()`` and mapping ``to_pandas()`` over each chain.
    """
    master = "local[" + str(num_cores) + "]"
    spark = SparkSession.builder.master(master).appName("Interactions").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        chains = structures.flatMap(lambda s: s[1].get_chains())
        # count() is the action that forces the to_pandas() conversion to run.
        dfc = chains.map(lambda c: c.to_pandas())
        return dfc.count()
    finally:
        # Stop the session even when the job fails: getOrCreate() returns an
        # already-running session, so a leaked one would be reused by the
        # next benchmark run regardless of the core count it asks for.
        spark.stop()

In [None]:
# Time the structure-to-chains-to-pandas benchmark once per configured core count.
# NOTE(review): the print label and timing column are still named
# 'structure_to_chains', the same as the previous benchmark; only the CSV
# file name distinguishes the two. Left unchanged in case downstream code
# reads that column name -- confirm before renaming.
rows = []

for num_cores in cores:
    # perf_counter() is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    count = structure_to_chains(path, num_cores)
    end = time.perf_counter()
    print('structure_to_chains, cores:', num_cores, 'time:', end-start, 'seconds')
    rows.append({'cores': num_cores, 'structure_to_chains': end-start, 'count': count})

# Build the table in one step: DataFrame.append() was removed in pandas 2.0.
# Columns are listed alphabetically, matching the former append(..., sort=True).
df_s2c = pd.DataFrame(rows, columns=['cores', 'count', 'structure_to_chains'])

In [None]:
# Save the timing table as structure_to_chains_to_pandas.csv in the results directory (no index column).
df_s2c.to_csv(os.path.join(results_dir, 'structure_to_chains_to_pandas.csv'), index=False)

In [None]:
# Display the timing table as the cell output.
df_s2c

## Saltbridges Benchmark
This benchmark finds salt bridges in protein structures. Structures with multiple models, e.g., NMR structures are excluded.

In [None]:
def saltbridges(path, num_cores):
    """Count salt-bridge interactions found in single-model structures.

    Args:
        path: Location of the MMTF Hadoop sequence file.
        num_cores: Number of local cores; used as N in the Spark
            ``local[N]`` master setting.

    Returns:
        The number of rows returned by
        ``InteractionExtractorPd.get_interactions`` for the query/target
        selections defined below.
    """
    master = "local[" + str(num_cores) + "]"
    spark = SparkSession.builder.master(master).appName("Saltbridges").getOrCreate()
    try:
        structures = mmtfReader.read_sequence_file(path)
        # Keep single-model entries only; multi-model entries (e.g. NMR
        # structures) are excluded from this benchmark.
        structures = structures.filter(lambda s: s[1].num_models == 1)

        distance_cutoff = 3.5
        # Query side: ASP/GLU side-chain oxygen atoms.
        query = "polymer and (group_name in ['ASP', 'GLU']) and (atom_name in ['OD1', 'OD2', 'OE1', 'OE2'])"
        # Target side: ARG/LYS/HIS side-chain nitrogen atoms.
        target = "polymer and (group_name in ['ARG', 'LYS', 'HIS']) and (atom_name in ['NH1', 'NH2', 'NZ', 'ND1', 'NE2'])"

        interactions = InteractionExtractorPd.get_interactions(structures, distance_cutoff, query, target, bio=None)
        return interactions.count()
    finally:
        # Stop the session even when the job fails: getOrCreate() returns an
        # already-running session, so a leaked one would be reused by the
        # next benchmark run regardless of the core count it asks for.
        spark.stop()

In [None]:
# Time the salt-bridge benchmark once per configured core count.
rows = []

for num_cores in cores:
    # perf_counter() is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    count = saltbridges(path, num_cores)
    end = time.perf_counter()
    print('saltbridges_pd, cores:', num_cores, 'time:', end-start, 'seconds')
    rows.append({'cores': num_cores, 'saltbridges_pd': end-start, 'count': count})

# Build the table in one step: DataFrame.append() was removed in pandas 2.0.
# Columns are listed alphabetically, matching the former append(..., sort=True).
df_saltbridges = pd.DataFrame(rows, columns=['cores', 'count', 'saltbridges_pd'])

In [None]:
# Save the timing table as saltbridges_pd.csv in the results directory (no index column).
df_saltbridges.to_csv(os.path.join(results_dir, 'saltbridges_pd.csv'), index=False)
# Display the timing table as the cell output.
df_saltbridges