In [1]:
from pyspark.sql import SparkSession
import opmDataset
from pandas import *
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol

#### Configure Spark

In [2]:
spark = SparkSession.builder.master("local[4]").appName("OPM_DEMO").getOrCreate()
sc = spark.sparkContext

In [3]:
dataset = opmDataset.get_dataset()

In [4]:
dataset.toPandas().head()

Unnamed: 0,Family,ProteinName,PDBID,Species,Localization,Num.Subunits,Num.TMSec.Structs.,HydrophobicThicknessorDepth(Å),TiltAngle(°),ΔGtransfer(kcal/mol)
0,1.1.01.01.,Rhodopsin I,5AX0,Acetabularia acetabulum,Eukaryo. plasma,1,7,32.4 ± 2.2,9 ± 1,-66.6
1,1.1.01.01.,Halorhodopsin,1E12,Halobacterium salinarum,Archaebac.,3,21,31.8 ± 1.4,0 ± 1,-126.7
2,1.1.01.01.,Proteorhodopsin,4JQ6,Gamma-proteobacterium,Bact. Gram-neg inner,6,42,28.4 ± 1.3,0 ± 0,-188.5
3,1.1.01.01.,"Archaerhodopsin-2, trimeric",2EI4,Halobacterium sp.,Archaebac.,3,21,30.5 ± 1.2,0 ± 1,-143.9
4,1.1.01.01.,Archaerhodopsin-1,1UAZ,Halobacterium sp.,Archaebac.,1,7,31.8 ± 1.3,9 ± 2,-65.3


In [8]:
pdbids = dataset.toPandas()['PDBID'].tolist()
print(pdbids)
structures = mmtfReader.download_full_mmtf_files(pdbids[:10], sc)
type(structures)

['5AX0', '1E12', '4JQ6', '2EI4', '1UAZ', '4JR8', '1IW6', '4PXK', '2ZZL', '5AZD', '6EYU', '1H68', '4FBZ', '3A7K', '4QI1', '1PY6', '3VVK', '5JJE', '6EID', '4XTL', '3UG9', '1VGO', '6EIG', '1XIO', '4XTO', '3AM6', '2L6X', '5JSI', '4HYJ', '2M3G', '4TL3', '1M0L', '1FBK', '4WAV', '5B2N', '5VN7', '3DDL', '3X3B', '1H2S', '4KNF', '1AP9', '5XPR', '4GBR', '3CAP', '5WQC', '6BQH', '3UZA', '4GPO', '3EML', '3PXO', '4N6H', '2YDV', '5XSZ', '4U15', '4UHR', '2VT4', '5VBL', '5WIV', '4IB4', '5TE5', '2ZIY', '3QAK', '2Z73', '5O9H', '5ZBQ', '5A8E', '4RWA', '4EA3', '4IAR', '5LWE', '5TUD', '2YCW', '5X33', '3UON', '4GRV', '5TGZ', '5DSG', '4IAQ', '3OE6', '4BUO', '3RZE', '5TVN', '4YAY', '3VGA', '2LNL', '4MBS', '5UIW', '2RH1', '5V54', '5CXV', '1U19', '5GLH', '5T1A', '3OE0', '5UIG', '3PWH', '6B73', '2I37', '4EJ4', '4ZJC', '5GLI', '4MQS', '3ODU', '2IQR', '2X72', '5U09', '4PXZ', '2I36', '4RWS', '4ZWJ', '5T04', '5W0P', '4EIY', '4PXF', '2IQV', '3VW7', '1GZM', '5UEN', '4PHU', '4XES', '2Y02', '4XNV', '4NTJ', '3V2Y', '3PBL',

pyspark.rdd.PipelinedRDD

In [9]:
structures.keys().collect()

['2EI4',
 '1UAZ',
 '2ZZL',
 '5AZD',
 '1E12',
 '5AX0',
 '4JQ6',
 '4JR8',
 '1IW6',
 '4PXK']

In [32]:
def view_proteins(pdb_ids):
    import pycurl
    def view3d(i=0):
        pdb_id = pdb_ids[i].lower()
        opm_url = 'http://opm.phar.umich.edu/pdb/'
        c = pycurl.Curl()
        c.setopt(c.URL, opm_url + pdb_id + '.pdb')
        with open(pdb_id + '.pdb', 'wb') as f:
            c.setopt(c.WRITEDATA, f)
            c.perform()
        
        print(f'PDB: {pdb_ids[i]}')

        structure = open(pdb_id + '.pdb', 'r').read()
        viewer = py3Dmol.view()
        viewer.addModel(structure, 'pdb')

        viewer.setStyle({'cartoon': {'color': 'spectrum'}})
        viewer.setStyle({'hetflag': True}, {'stick': {}})
        viewer.setStyle({'resn': 'DUM'}, {'sphere': {'radius': 0.3}})
        viewer.zoomTo()
        return viewer.show()

    s_widget = IntSlider(min=0, max=len(pdb_ids)-1, description='Structure', continuous_update=True)
    return interact(view3d, i=s_widget)

In [33]:
view_proteins(pdbids)

interactive(children=(IntSlider(value=0, description='Structure', max=3825), Output()), _dom_classes=('widget-…

<function __main__.view_proteins.<locals>.view3d>

In [None]:
spark.stop()