# Advanced Zinc Interaction Analysis Example

<img src="./figures/zinc_interaction.png" style="width: 300px;"/>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces

#### Configure Spark 

In [2]:
# Obtain the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("InteractionAnalysisAdvanced")
    .getOrCreate()
)

## Read PDB structures
Read a provided sample (~10,000 structures) of the PDB in MMTF format (https://mmtf.rcsb.org).

In [3]:
# Relative location of the bundled MMTF sample directory
path = "../../resources/mmtf_full_sample/"

# Load the structures stored under `path`; `pdb` is the input to every later step
pdb = mmtfReader.read_sequence_file(path)

## Create a non-redundant subset
Create a Pisces (Protein Sequence Culling Server, http://dunbrack.fccc.edu/PISCES.php) non-redundant subset for this analysis. Select protein chains with less than 40% sequence identity to one another and a resolution of 2.0 Å or better.

In [4]:
# Pisces cut-offs: less than 40% sequence identity, resolution of 2.0 or better
nonredundant = Pisces(sequenceIdentity=40, resolution=2.0)
pdb = pdb.filter(nonredundant)

## Extract proteins with Zn interactions

In [5]:
# Extractor for interactions involving "ZN" groups.
# NOTE(review): the second argument (3) is presumably the distance cutoff — confirm units.
finder = groupInteractionExtractor("ZN", 3)

# Build the interaction dataset, then mark it for caching since the cells
# below query it repeatedly
zn_dataset = finder.get_dataset(pdb)
interactions = zn_dataset.cache()

## Inspect the Zn interaction dataset: schema, first 20 rows, and total count

In [6]:
# Total number of interaction rows; later cells divide by `n` to get frequencies
n = interactions.count()

# Column layout followed by a 20-row preview of the dataset
interactions.printSchema()
interactions.show(20)

print(f"Number of interactions: {n}")

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       1LBU|      ZN|   ZN|      Zn|   213|     HIS|  NE2|       N|   153| 2.151916|
|       1LBU|      ZN|   ZN|      Zn|   213|     ASP|   CG|       C|   160|2.8048584|
|       1LBU|      ZN|   ZN|      Zn|   213|     ASP|  OD1|       O|   160|1.9849186|
|       1LBU|      ZN|   ZN|      Zn|   213|     HIS|  ND1|       N|   196

## Show the top 10 interacting group/atom types

#### Exclude Carbon Interactions

In [7]:
# Drop rows whose partner atom is carbon, then count rows per (residue2, atom2) pair
non_carbon = interactions.filter("element2 != 'C'")
topGroupsAndAtoms = non_carbon.groupBy("residue2", "atom2").count()

#### Add column with frequency of occurrence
#### Filter out occurrences <=1% 
#### Sort descending

In [8]:
# Each pair's count as a fraction of the total interaction count `n`
group_atom_freq = topGroupsAndAtoms.withColumn(
    "frequency", topGroupsAndAtoms["count"] / n
)

# Keep pairs above 1% and print them from most to least frequent
(
    group_atom_freq
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(20)
)

+--------+-----+-----+-------------------+
|residue2|atom2|count|          frequency|
+--------+-----+-----+-------------------+
|     CYS|   SG| 1587|0.18434196770821235|
|     HOH|    O| 1148|0.13334882100127773|
|     HIS|  NE2| 1103|  0.128121733070043|
|     ASP|  OD2|  455|0.05285166686026251|
|     GLU|  OE2|  430|0.04994772912068765|
|     HIS|  ND1|  425|0.04936694157277268|
|     GLU|  OE1|  392|0.04553374375653386|
|     ASP|  OD1|  363|0.04216517597862702|
+--------+-----+-----+-------------------+



## Print the top interacting elements

#### Exclude carbon interactions and group by element 2

In [9]:
# Count non-carbon interaction rows per partner element (element2)
topElements = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("element2")
    .count()
)

#### Add column with frequency of occurrence
#### Filter out occurrences <= 1%
#### Sort descending

In [10]:
# Each element's count as a fraction of the total interaction count `n`
element_freq = topElements.withColumn("frequency", topElements["count"] / n)

# Keep elements above 1% and print them from most to least frequent
(
    element_freq
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(10)
)

+--------+-----+--------------------+
|element2|count|           frequency|
+--------+-----+--------------------+
|       O| 3363|  0.3906377047276106|
|       N| 1627| 0.18898826809153213|
|       S| 1608| 0.18678127540945522|
|       H|   99|0.011499593448716459|
+--------+-----+--------------------+



In [11]:
# Mean interaction distance per partner element, smallest mean first
mean_distance = interactions.groupBy("element2").avg("distance")
mean_distance.sort("avg(distance)").show(10)

+--------+-------------------+
|element2|      avg(distance)|
+--------+-------------------+
|      Mn|0.20966920256614685|
|      Ni| 0.5448364615440369|
|       F| 1.9403731226921082|
|       N| 2.1260014373097014|
|      Na|  2.189347982406616|
|      Zn| 2.2065688480030405|
|      Cl|  2.219455847033748|
|       R| 2.2229933738708496|
|       O|  2.245771651391362|
|       S| 2.3379255972098356|
+--------+-------------------+
only showing top 10 rows



## Aggregate multiple statistics

### NOTE: this cell relies on `from pyspark.sql.functions import *` (see the import cell), which shadows Python's built-in `min` and `max`

In [12]:
# Distance statistics computed per partner element. count/avg/min/max/kurtosis
# are the Spark column functions from the star import, not Python builtins.
distance_stats = [
    count("distance"),
    avg("distance"),
    min("distance"),
    max("distance"),
    kurtosis("distance"),
]
interactions.groupBy("element2").agg(*distance_stats).show(10)

+--------+---------------+-------------------+-------------+-------------+--------------------+
|element2|count(distance)|      avg(distance)|min(distance)|max(distance)|  kurtosis(distance)|
+--------+---------------+-------------------+-------------+-------------+--------------------+
|       F|              2| 1.9403731226921082|    1.7216884|    2.1590579| -1.9999999999999993|
|      Ni|              2| 0.5448364615440369|   0.53758055|    0.5520924| -1.9999999999999998|
|      As|              3| 2.8134802977244058|     2.688963|    2.8974245|  -1.500000000000001|
|       O|           3363|  2.245771651391362|    1.1738018|    2.9994936|-0.10149437480333878|
|       C|           1785|  2.840591127598653|    1.5134287|      2.99994|   7.300107344115171|
|      Mn|              2|0.20966920256614685|   0.19816406|   0.22117434| -2.0000000000000004|
|       N|           1627| 2.1260014373097014|    1.6628766|    2.9689412|   8.804427073278552|
|      Cl|             54|  2.2194558470

## Terminate Spark

In [13]:
# Shut down the Spark session now that the analysis is complete
spark.stop()