# ATP Interaction Analysis

This demo shows how to find interactions of ATP in the PDB.

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces

#### Configure Spark 

In [2]:
# Start (or reuse) a Spark session that runs locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ATPInteractionAnalysis")
    .getOrCreate()
)

## Read PDB structures
Read a provided sample (~10,000 structures) of the PDB in MMTF format (https://mmtf.rcsb.org).

In [3]:
# Relative path to the provided MMTF sample of the PDB.
path = "../../resources/mmtf_full_sample/"
# Read the structures found under `path`; `pdb` feeds the filtering and
# interaction-extraction cells further down.
pdb = mmtfReader.read_sequence_file(path)

## Create a non-redundant subset
Create a Pisces (Protein Sequence Culling Server, http://dunbrack.fccc.edu/PISCES.php) non-redundant subset for this analysis. Select protein chains with less than 40% pairwise sequence identity and a resolution of 2.0 Å or better.

In [4]:
# Restrict the dataset to the Pisces non-redundant subset:
# 40% sequence-identity cutoff, resolution of 2.0 A or better.
pisces_filter = Pisces(sequenceIdentity=40, resolution=2.0)
pdb = pdb.filter(pisces_filter)

## Find ATP interactions within 3.3 Angstroms
Find all interactions within 3.3 Å of each ATP molecule.

![ATPInteraction](./figures/atp-dist2.jpg)

In [5]:
# Group of interest and the distance cutoff (in Angstroms) used to
# decide which neighbouring atoms count as interacting.
LIGAND_NAME = "ATP"
DISTANCE_CUTOFF = 3.3

finder = groupInteractionExtractor(LIGAND_NAME, DISTANCE_CUTOFF)

# One row per atom pair found within the cutoff.
interactions = finder.get_dataset(pdb)

## Find interactions with the terminal phosphate in ATP
The three oxygen atoms in the terminal phosphate are named O1G, O2G, and O3G.

In [6]:
# Keep only rows whose ATP atom name starts with "O" and ends with "G"
# (O1G, O2G, O3G). The result is cached because the following cells
# query it several times.
terminal_oxygen = "atom1 LIKE('O%G')"
interactions = interactions.filter(terminal_oxygen).cache()

## Show the data schema of the dataset and some data

In [7]:
# Print the column names and types of the interaction dataset.
interactions.printSchema()

# Display the first 20 rows as a text table.
interactions.show(20)

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       3SJH|     ATP|  O3G|       O|   379|     SER|    N|       N|     8| 2.702945|
|       3SJH|     ATP|  O3G|       O|   379|     SER|   CB|       C|     8|3.2578347|
|       3SJH|     ATP|  O3G|       O|   379|     SER|   OG|       O|     8| 2.708776|
|       3SJH|     ATP|  O1G|       O|   379|     ASP|    N|       N|   142

## Count number of interactions

In [8]:
# Total number of interaction rows; `n` is also the denominator for the
# frequency column computed in the groups/atom-types cell below.
n = interactions.count()

print(f"Number of interactions: {n}")

Number of interactions: 638


## Identify top interacting groups with terminal phosphate in ATP

In [9]:
# Number of interactions per interacting group (column residue2).
topGroups = interactions.groupBy("residue2").count()

# Show the ten groups with the highest counts, largest first.
topGroups.orderBy(topGroups["count"].desc()).show(10)

+--------+-----+
|residue2|count|
+--------+-----+
|     HOH|  224|
|     SER|   61|
|     GLY|   49|
|     PO4|   45|
|     LYS|   42|
|     ARG|   42|
|      MG|   36|
|     GLU|   24|
|     ASP|   17|
|     ATP|   16|
+--------+-----+
only showing top 10 rows



## Top interacting groups/atom types

In [10]:
# Number of interactions per (group, atom name) pair.
topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count()

# Share of all n interactions that each (group, atom) pair accounts for.
frequency = topGroupsAndAtoms["count"] / n

# Show the ten most frequent pairs, highest frequency first.
(
    topGroupsAndAtoms
    .withColumn("frequency", frequency)
    .orderBy("frequency", ascending=False)
    .show(10)
)

+--------+-----+-----+--------------------+
|residue2|atom2|count|           frequency|
+--------+-----+-----+--------------------+
|     HOH|    O|  224|  0.3510971786833856|
|      MG|   MG|   36| 0.05642633228840126|
|     LYS|   NZ|   33| 0.05172413793103448|
|     GLY|    N|   28|  0.0438871473354232|
|     SER|   OG|   23| 0.03605015673981191|
|     SER|    N|   15|0.023510971786833857|
|     SER|   CB|   15|0.023510971786833857|
|     GLY|   CA|   12|0.018808777429467086|
|     ARG|  NH2|   12|0.018808777429467086|
|     ASN|  ND2|   10| 0.01567398119122257|
+--------+-----+-----+--------------------+
only showing top 10 rows



## Terminate Spark

In [11]:
# Shut down the Spark session created in the configuration cell.
spark.stop()