# Advanced Zinc Interaction Analysis Example

<img src="./figures/zinc_interaction.png" style="width: 300px;"/>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces

#### Configure Spark 

In [2]:
# Create (or reuse) a Spark session that runs locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("InteractionAnalysisAdvanced")
    .getOrCreate()
)

## Read PDB structures
Read a provided sample (~10,000 structures) of the PDB in MMTF format (https://mmtf.rcsb.org).

In [3]:
# Relative location of the bundled MMTF sample used by this notebook.
path = "../../resources/mmtf_full_sample/"

# Load the sample structures; every later cell derives its data from `pdb`.
pdb = mmtfReader.read_sequence_file(path)

## Create a non-redundant subset
Create a Pisces (Protein Sequence Culling Server, http://dunbrack.fccc.edu/PISCES.php) non-redundant subset for this analysis. Select protein chains with less than 40% sequence identity among each other and a resolution of 2.0 Å or better.

In [4]:
# Keep only structures that pass the Pisces cull
# (sequence identity threshold 40, resolution threshold 2.0).
pdb = pdb.filter(
    Pisces(sequenceIdentity=40, resolution=2.0)
)

## Extract proteins with Zn interactions

In [5]:
# Extractor for interactions involving the "ZN" group with a cutoff of 3
# (presumably a distance in Å — TODO confirm against the mmtfPyspark docs).
finder = groupInteractionExtractor("ZN", 3)
# Cache the resulting dataset: the cells below query it repeatedly.
interactions = finder.get_dataset(pdb).cache()

## Inspect the Zn interaction dataset: schema, first 20 rows, and total count

In [6]:
# Column names and types of the interaction dataset.
interactions.printSchema()

# Preview the first 20 interaction rows.
interactions.show(20)

# Total number of interactions across all elements; the frequency cells
# further down divide by this value.
n = interactions.count()

print(f"Number of interactions: {n}")

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       1LBU|      ZN|   ZN|      Zn|   213|     HIS|  NE2|       N|   153|2.1519165|
|       1LBU|      ZN|   ZN|      Zn|   213|     ASP|   CG|       C|   160|2.8048582|
|       1LBU|      ZN|   ZN|      Zn|   213|     ASP|  OD1|       O|   160|1.9849179|
|       1LBU|      ZN|   ZN|      Zn|   213|     HIS|  ND1|       N|   196

## Show the top 10 interacting group/atom types

#### Exclude Carbon Interactions

In [7]:
# Count interactions per (residue, atom) partner, ignoring carbon atoms.
topGroupsAndAtoms = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2", "atom2")
    .count()
)

#### Add column with frequency of occurrence
#### Filter out occurrences <= 1%
#### Sort descending

In [8]:
# Frequency is taken relative to n, the count of ALL interactions (carbon
# included), not only the non-carbon rows aggregated in topGroupsAndAtoms.
(
    topGroupsAndAtoms
    .withColumn("frequency", topGroupsAndAtoms["count"] / n)
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(20)
)

+--------+-----+-----+-------------------+
|residue2|atom2|count|          frequency|
+--------+-----+-----+-------------------+
|     CYS|   SG| 1645| 0.1856449610653425|
|     HOH|    O| 1196|0.13497347929127637|
|     HIS|  NE2| 1142|0.12887935898882744|
|     ASP|  OD2|  459|0.05180002257081594|
|     GLU|  OE2|  442|0.04988150321634127|
|     HIS|  ND1|  433|0.04886581649926645|
|     GLU|  OE1|  401|0.04525448594966708|
|     ASP|  OD1|  371|0.04186886355941767|
+--------+-----+-----+-------------------+



## Print the top interacting elements

#### Exclude carbon interactions and group by element2

In [9]:
# Count interactions per partner element, ignoring carbon.
topElements = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("element2")
    .count()
)

#### Add column with frequency of occurrence
#### Filter out occurrences <= 1%
#### Sort descending

In [10]:
# Frequency is taken relative to n, the count of ALL interactions (carbon
# included), so the listed frequencies do not sum to 1.
(
    topElements
    .withColumn("frequency", topElements["count"] / n)
    .filter("frequency > 0.01")
    .sort("frequency", ascending=False)
    .show(10)
)

+--------+-----+--------------------+
|element2|count|           frequency|
+--------+-----+--------------------+
|       O| 3457| 0.39013655343640674|
|       N| 1675| 0.18903058345559193|
|       S| 1666|  0.1880148967385171|
|       H|  104|0.011736824286197945|
+--------+-----+--------------------+



In [11]:
# Mean interaction distance per partner element (carbon included),
# sorted so the shortest average distance comes first.
(
    interactions
    .groupBy("element2")
    .avg("distance")
    .sort("avg(distance)")
    .show(10)
)

+--------+------------------+
|element2|     avg(distance)|
+--------+------------------+
|      Mn|0.2096691057085991|
|      Ni|0.5448365211486816|
|       F| 1.940373182296753|
|       N| 2.125986994060118|
|      Na| 2.189347505569458|
|      Zn|2.2065686800263147|
|      Cl|2.2211811542510986|
|       R| 2.222992499669393|
|       O| 2.244950079421346|
|       S|2.3377188774002415|
+--------+------------------+
only showing top 10 rows



## Aggregate multiple statistics

### NOTE: this cell requires `from pyspark.sql.functions import *`

In [12]:
# count, avg, min, max and kurtosis come from the star import of
# pyspark.sql.functions at the top of the notebook; min and max therefore
# refer to the Spark column functions here, not the Python builtins.
(
    interactions
    .groupBy("element2")
    .agg(
        count("distance"),
        avg("distance"),
        min("distance"),
        max("distance"),
        kurtosis("distance"),
    )
    .show(10)
)

+--------+---------------+------------------+-------------+-------------+--------------------+
|element2|count(distance)|     avg(distance)|min(distance)|max(distance)|  kurtosis(distance)|
+--------+---------------+------------------+-------------+-------------+--------------------+
|       F|              2| 1.940373182296753|    1.7216878|    2.1590586| -1.9999999999999998|
|      Ni|              2|0.5448365211486816|   0.53758067|    0.5520924| -1.9999999999999993|
|      As|              3| 2.813481410344442|     2.688965|    2.8974242|                -1.5|
|       O|           3457| 2.244950079421346|    1.1738015|    2.9994946|-0.07213122347964651|
|       C|           1829|2.8415631190097157|    1.5134286|    2.9999394|   7.377358034263853|
|      Mn|              2|0.2096691057085991|   0.19816408|   0.22117414| -1.9999999999999993|
|       N|           1675| 2.125986994060118|    1.6628777|    2.9689415|    8.91490107405303|
|      Cl|             56|2.2211811542510986|    1

## Terminate Spark

In [13]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()