# Example: data analysis with PySpark DataFrames

## Imports and variables

In [45]:
from pyspark import SparkConf, SparkContext                    
from src.main import MmtfReader                                
from src.main.rcsbfilters import pisces                        
from src.main.datasets import groupInteractionExtractor
import py3Dmol
                                                               
# Notebook-wide settings
APP_NAME = "MMTF_Spark"
# Location handed to MmtfReader.readSequenceFile further down. This is a
# machine-specific absolute path: point it at your own copy of the data.
path = "/home/marshuang80/PDB/full"

# Configure Spark to run in local mode ("local[*]") under the app name above.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
# getOrCreate returns the already-running context when there is one, so
# re-executing this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

## Extract and list top interacting groups for Zinc in PDB   

In [46]:
# Thresholds for the Pisces non-redundant subset (R. Dunbrack) and for the
# interaction search.
# NOTE(review): units are not stated here -- presumably percent identity and
# Angstroms; confirm against the pisces / groupInteractionExtractor docs.
sequenceIdentity = 20
resolution = 2.0
cutoffDistance = 3.0

# Read all PDB entries and keep only those accepted by the Pisces filter
pdb = (
    MmtfReader.readSequenceFile(path, sc)
    .filter(pisces(sequenceIdentity, resolution))
)

# Build the "ZN" interaction extractor and collect its dataset from the subset
finder = groupInteractionExtractor("ZN", cutoffDistance)
interactions = finder.getDataset(pdb)

## Structure Visualization

In [51]:
# Use the structure from the first interaction row as the example to display
exampleProtein = interactions.first()
proteinId = exampleProtein['structureId'].lower()  # lower-cased ID for the 'pdb:' query

# 200x200 viewer showing the whole structure as spectrum-coloured sticks
p = py3Dmol.view(query='pdb:{}'.format(proteinId), width=200, height=200)
p.setStyle({'stick': {'color': 'spectrum'}})
p.show()

In [76]:
# Second view of the same structure (proteinId comes from the previous cell),
# this time highlighting the zinc atoms.
view = py3Dmol.view(query='pdb:%s'%proteinId)
# Despite its name, chA is not a chain selection: it selects atoms named 'ZN'.
chA = {'atom':'ZN'}
# Base style first. setStyle called with only a style applies it to every atom
# and replaces whatever style those atoms had, so it must run before the
# ZN-specific call or it would overwrite that override.
view.setStyle({'stick': {'color':'spectrum', 'opacity':0.5}})
# Then override the ZN atoms so they are drawn without the 0.5 opacity.
view.setStyle(chA,{'stick': {'color':'spectrum'}})
# White, nearly opaque van der Waals surface restricted to the ZN selection.
view.addSurface(py3Dmol.VDW,{'opacity':0.9,'color':'white'},chA)
view.show()

## Show the top 10 interacting groups                           


In [48]:
# Count interactions per residue2 value, skipping rows whose element2 is 'C',
# and print the ten most frequent in descending order of count.
(
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

# Shut down the SparkContext now that the analysis is finished.
sc.stop()

KeyboardInterrupt: 