# Example: data analysis with PySpark DataFrames

## Imports and variables

In [3]:
from pyspark import SparkConf, SparkContext                    
from src.main import MmtfReader                                
from src.main.rcsbfilters import pisces                        
from src.main.datasets import groupInteractionExtractor
import py3Dmol
                                                               
# Create variables
APP_NAME = "MMTF_Spark"
# NOTE(review): machine-specific absolute path; point this at your own copy of
# the data that MmtfReader.readSequenceFile is expected to read.
path = "/home/marshuang80/PDB/full"

# Configure Spark: run locally, using all available cores ("local[*]")
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate returns the already-running context if there is one, so
# re-running this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". The conf is only applied
# when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

## Read PDB and create PISCES non-redundant set

In [4]:
# Load the structures from `path` and, in the same expression, keep only the
# entries accepted by the PISCES filter (sequenceIdentity=20, resolution=2.0).
pdb = (
    MmtfReader
    .readSequenceFile(path, sc)
    .filter(pisces(sequenceIdentity=20, resolution=2.0))
)

## Extract Zinc interactions

In [5]:
# Extractor for interactions involving the "ZN" group, using a distance
# cutoff of 3.0 (presumably in Angstrom -- TODO confirm against the extractor).
finder = groupInteractionExtractor("ZN", distance=3.0)

# Run the extractor over the filtered structures to obtain the dataset
# queried by the following cells.
interactions = finder.getDataset(pdb)

## Visualize first hit

In [16]:
# Take the first row of the dataset; its first field is used as the
# identifier in the py3Dmol 'pdb:' query below.
first_row = interactions.first()
hit = first_row[0]

# Style specifications: rainbow cartoon for the whole structure, and gray
# spheres for atoms selected by the name 'ZN'.
cartoon_style = {'cartoon': {'color': 'spectrum'}}
zinc_selection = {'atom': 'ZN'}
zinc_style = {'sphere': {'color': 'gray'}}

view = py3Dmol.view(query='pdb:%s' % hit)
view.setStyle(cartoon_style)
view.setStyle(zinc_selection, zinc_style)
view.show()

## Show top 5 interacting groups                           


In [18]:
# Drop rows whose element2 is carbon, count the remaining rows per residue2
# value, and print the five most frequent ones.
(
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2")
    .count()
    .sort("count", ascending=False)
    .show(5)
)

+--------+-----+
|residue2|count|
+--------+-----+
|     CYS| 1394|
|     HIS| 1265|
|     HOH| 1049|
|     GLU|  737|
|     ASP|  722|
+--------+-----+
only showing top 5 rows

