# Imports

In [12]:
import os
import time

from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.webfilters import blastCluster

# Configure Spark

In [5]:
# Run Spark in local mode using every available core ("local[*]") and give the
# application a recognisable name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("ATPInteractionAnalysisDemo")
)

# getOrCreate() returns the SparkContext that is already active in this kernel,
# or builds a new one from `conf` when none exists. Calling SparkContext(conf=...)
# directly raises "Cannot run multiple SparkContexts at once" if this cell is
# executed a second time. Note that `conf` is only applied when a new context
# is actually created.
sc = SparkContext.getOrCreate(conf=conf)

# Read PDB in MMTF format

In [11]:
# Directory holding the PDB archive as MMTF sequence files.
# The original value was an absolute path on the author's machine; it is kept as
# the default, but an MMTF_FULL environment variable now overrides it so the
# notebook can run elsewhere without editing this cell.
# NOTE(review): MMTF_FULL is the variable name used in mmtf-pyspark's setup
# instructions — confirm it matches your installation.
path = os.environ.get("MMTF_FULL", "/home/marshuang80/PDB/full")

# Load the sequence file(s) found at `path` through the Spark context `sc`.
pdb = MmtfReader.readSequenceFile(path, sc)

# Filter by sequence identity subset

In [13]:
# Build the sequence-identity filter once, then apply it to the structures.
# NOTE(review): the argument 40 is presumably the percent sequence identity of
# the BlastClust clusters, and blastCluster lives in `webfilters`, so it may need
# network access — confirm both against the mmtfPyspark documentation.
sequence_identity_filter = blastCluster(40)
pdb = pdb.filter(sequence_identity_filter)

# Find ATP interactions within 3 Angstroms

In [14]:
# Group of interest and distance cutoff (3 Angstroms, per the section heading).
LIGAND_GROUP = "ATP"
DISTANCE_CUTOFF = 3

# Extractor configured for the group and cutoff above.
finder = groupInteractionExtractor(LIGAND_GROUP, DISTANCE_CUTOFF)

# Build the interaction dataset from the filtered structures and mark it for
# caching, since several later cells query it repeatedly.
atp_interactions = finder.getDataset(pdb)
interactions = atp_interactions.cache()

In [15]:
# Keep only rows whose atom1 name starts with "O" and ends with "G"
# (the SQL LIKE pattern 'O%G'); in the output below these are O1G, O2G and O3G.
# NOTE(review): presumably the gamma-phosphate oxygens of ATP — confirm.
interactions = interactions.where("atom1 LIKE('O%G')")

# Show the dataset schema and the first 20 rows

In [16]:
# Print the column names, types and nullability of the interaction dataset.
interactions.printSchema()

# Preview the first 20 rows as a text table.
interactions.show(n=20)

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       2JJX|     ATP|  O3G|       O|   732|     ARG|  NH2|       N|    97|2.6567695|
|       2JJX|     ATP|  O2G|       O|   732|     HIS|  CE1|       C|   362|2.7452836|
|       2JJX|     ATP|  O2G|       O|   732|     HIS|  NE2|       N|   362| 2.596871|
|       2JJX|     ATP|  O1G|       O|   733|     TYR|   OH|       O|   368

# Count number of interactions

In [17]:
# Total number of interaction rows left after the atom-name filter. This
# triggers a Spark job; `n` is reused later as the denominator for frequencies.
n = interactions.count()

summary_line = f"Number of interactions: {n}"
print(summary_line)

Number of interactions: 10994


# Identify top interacting groups

In [19]:
# Number of interactions per interacting group (the residue2 column).
topGroups = interactions.groupBy("residue2").count()

# Rank groups from most to least frequent and display the ten most common.
ranked_groups = topGroups.orderBy(topGroups["count"].desc())
ranked_groups.show(10)

+--------+-----+
|residue2|count|
+--------+-----+
|     HOH| 2407|
|      MG| 1375|
|     ARG| 1176|
|     SER| 1049|
|     LYS|  995|
|     GLY|  752|
|     THR|  658|
|     ASP|  577|
|      CA|  218|
|     GLU|  203|
+--------+-----+
only showing top 10 rows



# Top interacting group/atom types

In [23]:
# Number of interactions per (group, atom) pair on the interacting partner side.
topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count()

# Express each count as a fraction of all interactions (`n` comes from the
# counting cell above), then list the ten most frequent pairs.
frequency = topGroupsAndAtoms["count"] / n
with_frequency = topGroupsAndAtoms.withColumn("frequency", frequency)
with_frequency.orderBy(with_frequency["frequency"].desc()).show(10)

+--------+-----+-----+--------------------+
|residue2|atom2|count|           frequency|
+--------+-----+-----+--------------------+
|     HOH|    O| 2395|  0.2178460978715663|
|      MG|   MG| 1375| 0.12506821902856102|
|     LYS|   NZ|  777| 0.07067491358923049|
|     SER|   OG|  589| 0.05357467709659815|
|     ARG|  NH2|  550|0.050027287611424415|
|     GLY|    N|  543|  0.0493905766781881|
|     THR|  OG1|  377| 0.03429143169001273|
|     SER|    N|  318|0.028924868109878116|
|     ARG|  NH1|  287| 0.02610514826268874|
|     ASP|  OD1|  247|0.022466800072766965|
+--------+-----+-----+--------------------+
only showing top 10 rows

