# Example of using PySpark to do data analysis with DataFrames

## Imports and variables

In [1]:
import os
import time

import py3Dmol
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.rcsbfilters import pisces
from mmtfPyspark.datasets import groupInteractionExtractor
                                                               
# Create variables
APP_NAME = "MMTF_Spark"
# Input location handed to MmtfReader.readSequenceFile. Set the MMTF_FULL
# environment variable to point at your own copy of the data; the original
# author's local path is kept only as a fallback so existing setups still work.
path = os.environ.get("MMTF_FULL", "/home/marshuang80/PDB/full")
                                                               
# Configure Spark
# "local[*]" runs Spark on this machine instead of a cluster.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate returns the already-running context if there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once". On a fresh kernel it creates
# a new context from `conf`, exactly as before.
sc = SparkContext.getOrCreate(conf=conf)

## Read PDB and create PISCES non-redundant set

In [2]:
# Load the structures stored at `path` and keep only those accepted by the
# PISCES filter (sequenceIdentity=20, resolution=2.0 -- presumably percent
# identity and Angstroms; confirm against the mmtfPyspark docs).
pdb = (
    MmtfReader.readSequenceFile(path, sc)
    .filter(pisces(sequenceIdentity=20, resolution=2.0))
)

## Extract Zinc interactions

In [3]:
# Build an extractor for interactions involving "ZN" groups, using a
# distance cutoff of 3.0 (presumably Angstroms -- TODO confirm).
finder = groupInteractionExtractor("ZN", distance=3.0)

# Run the extractor over the filtered structures and preview 10 rows.
interactions = finder.getDataset(pdb)
interactions.show(10)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       3T7L|      ZN|   ZN|      Zn|    74|     CYS|   SG|       S|    18|2.3716674|
|       3T7L|      ZN|   ZN|      Zn|    74|     CYS|   SG|       S|    21|2.3241453|
|       3T7L|      ZN|   ZN|      Zn|    74|     CYS|   SG|       S|    42|2.3526788|
|       3T7L|      ZN|   ZN|      Zn|    74|     CYS|   SG|       S|    45|2.3325799|
|       3T7L|      ZN|   ZN|      Zn|    75|     CYS|   SG|       S|    34|2.3725426|
|       3T7L|      ZN|   ZN|      Zn|    75|     CYS|   SG|       S|    37| 2.336876|
|       3T7L|      ZN|   ZN|      Zn|    75|     CYS|   SG|       S|    62|2.3524697|
|       3T7L|      ZN|   ZN|      Zn|    75|     CYS|   SG|       S|    65|2.3205538|
|       3T92|      ZN|   ZN|      Zn|   113|     HIS| 

## Visualize first hit

In [4]:
# Take the first interaction row and read its structure ID by column name
# rather than by position, so the cell does not depend on column order.
first_row = interactions.first()
if first_row is None:
    # first() yields None for an empty dataset; fail with a clear message
    # instead of "'NoneType' object is not subscriptable".
    raise ValueError("No ZN interactions found; nothing to visualize")
hit = first_row["structureId"]

# Load that entry in py3Dmol, draw the whole structure as a spectrum-coloured
# cartoon, then overlay the atoms named ZN as gray spheres.
view = py3Dmol.view(query='pdb:%s' % hit)
view.setStyle({'cartoon': {'color': 'spectrum'}})
view.setStyle({'atom': 'ZN'}, {'sphere': {'color': 'gray'}})
view.show()

## Show top 10 interacting groups


In [8]:
# Count interactions per partner residue name, skipping rows whose second
# atom's element is 'C', and show the 10 most frequent. The Spark job is timed.
# perf_counter is a monotonic clock, so the measured duration cannot be skewed
# by system clock adjustments the way time.time() can.
start = time.perf_counter()
(
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2")
    .count()
    .sort("count", ascending=False)
    .show(10)
)
end = time.perf_counter()

# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(end - start, 60)
print("Total time: %i minute(s) , %i seconds"%(m,s))

+--------+-----+
|residue2|count|
+--------+-----+
|     CYS| 1394|
|     HIS| 1262|
|     HOH| 1047|
|     GLU|  735|
|     ASP|  719|
|     ACT|   75|
|      ZN|   48|
|     CAC|   43|
|     PO4|   40|
|      CL|   36|
+--------+-----+
only showing top 10 rows

Total time: 1 minute(s) , 19 seconds
