# ATP Interaction Analysis

This demo shows how to create a dataset of ATP-interacting atoms.

## Imports

In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.webfilters import blastCluster
import time

## Configure Spark

In [2]:
# Run Spark locally ("local[*]" master) under a descriptive application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("ATPInteractionAnalysisDemo")
)

# getOrCreate() reuses an already-running SparkContext instead of raising a
# ValueError about multiple SparkContexts when this cell is re-executed without
# restarting the kernel. If a context already exists, `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf)

## Read PDB in MMTF format

In [3]:
# Relative path to the bundled sample of MMTF-encoded PDB structures.
path = "../../resources/mmtf_full_sample/"

# Read the structures from `path` using the SparkContext `sc`.
# NOTE(review): `pdb` is also the name of Python's debugger module; it is not
# imported in this notebook, so nothing is shadowed here.
pdb = MmtfReader.readSequenceFile(path, sc)

## Filter by sequence identity subset

In [4]:
# Keep only the structures accepted by the blastCluster web filter.
# NOTE(review): 40 is presumably the percent sequence identity of the cluster
# set — confirm against the mmtfPyspark documentation.
pdb = pdb.filter(blastCluster(40))

## Find ATP interactions within 3 Angstroms

![ATPInteraction](./figures/atp-dist2.jpg)

In [5]:
# Extractor for interactions involving the "ATP" group; the second argument is
# the distance cutoff (3 Angstroms, per the section heading above).
finder = groupInteractionExtractor("ATP", 3)

# Build the interaction dataset and mark it for caching, since the following
# cells run several separate queries against it.
interactions = finder.getDataset(pdb).cache()

In [6]:
# Keep only rows whose atom1 name starts with "O" and ends with "G"
# (e.g. O1G, O2G), using a SQL LIKE pattern.
interactions = interactions.filter("atom1 LIKE('O%G')")

## Show the data schema of the dataset and some data

In [7]:
# Print the column names and types of the interaction dataset.
interactions.printSchema()

# Display the first 20 rows of the dataset.
interactions.show(20)

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       1FIN|     ATP|  O2G|       O|  1116|     HOH|    O|       O|  1242| 2.811164|
|       1FIN|     ATP|  O2G|       O|  1117|     GLY|    O|       O|   570|2.7497313|
|       1FIN|     ATP|  O1G|       O|  1117|     THR|  OG1|       O|   571|2.8906834|
|       1FIN|     ATP|  O2G|       O|  1117|     THR|  OG1|       O|   571

## Count number of interactions

In [8]:
# Total number of interaction rows; `n` is reused further down to turn
# per-group counts into frequencies.
n = interactions.count()

print(f"Number of interactions: {n}")

Number of interactions: 249


## Identify top interacting groups

In [9]:
# Tally the interactions per partner group (column "residue2").
topGroups = interactions.groupBy("residue2").count()

# Show the ten partner groups with the highest counts, largest first.
(
    topGroups
    .sort("count", ascending=False)
    .show(10)
)

+--------+-----+
|residue2|count|
+--------+-----+
|     HOH|   70|
|      MG|   37|
|     OXL|   27|
|     ARG|   27|
|     SER|   15|
|     ASP|   10|
|     THR|    9|
|     LYS|    9|
|     ASN|    6|
|       K|    6|
+--------+-----+
only showing top 10 rows



## Top interacting group/atom types

In [10]:
# Tally the interactions per (partner group, partner atom) pair.
topGroupsAndAtoms = interactions.groupBy("residue2", "atom2").count()

# Express each count as a fraction of the total number of interactions `n`
# (computed in an earlier cell) and show the ten largest fractions.
(
    topGroupsAndAtoms
    .withColumn("frequency", topGroupsAndAtoms["count"] / n)
    .sort("frequency", ascending=False)
    .show(10)
)

+--------+-----+-----+--------------------+
|residue2|atom2|count|           frequency|
+--------+-----+-----+--------------------+
|     HOH|    O|   65| 0.26104417670682734|
|      MG|   MG|   37| 0.14859437751004015|
|     ARG|  NH2|   20| 0.08032128514056225|
|     OXL|   O1|   12| 0.04819277108433735|
|     SER|   OG|   10|0.040160642570281124|
|     OXL|   O3|    9| 0.03614457831325301|
|     THR|  OG1|    7|0.028112449799196786|
|     LYS|   NZ|    7|0.028112449799196786|
|     ASP|  OD2|    6|0.024096385542168676|
|       K|    K|    6|0.024096385542168676|
+--------+-----+-----+--------------------+
only showing top 10 rows



## Terminate Spark

In [11]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()