In [63]:
from pyspark.sql import SparkSession
import opmDataset
from pandas import *
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol

#### Configure Spark

In [64]:
# Create (or reuse) a Spark session running locally with 4 worker threads,
# and keep a handle on its SparkContext for RDD-level APIs.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("OPM_DEMO")
    .getOrCreate()
)
sc = spark.sparkContext

In [65]:
# Load the OPM dataset via the opmDataset module.
# The returned object exposes .toPandas() (used in later cells) —
# presumably a Spark DataFrame; confirm against opmDataset.
dataset = opmDataset.get_dataset()

In [66]:
# Preview the first five rows. Note: toPandas() converts the whole dataset
# to a pandas DataFrame before head() trims it.
dataset.toPandas().head()

Unnamed: 0,Family,ProteinName,PDBID,Species,Localization,Num.Subunits,Num.TMSec.Structs.,HydrophobicThicknessorDepth(Å),TiltAngle(°),ΔGtransfer(kcal/mol)
0,1.1.01.01.,Rhodopsin I,5AX0,Acetabularia acetabulum,Eukaryo. plasma,1,7,32.4 ± 2.2,9 ± 1,-66.6
1,1.1.01.01.,Halorhodopsin,1E12,Halobacterium salinarum,Archaebac.,3,21,31.8 ± 1.4,0 ± 1,-126.7
2,1.1.01.01.,Proteorhodopsin,4JQ6,Gamma-proteobacterium,Bact. Gram-neg inner,6,42,28.4 ± 1.3,0 ± 0,-188.5
3,1.1.01.01.,"Archaerhodopsin-2, trimeric",2EI4,Halobacterium sp.,Archaebac.,3,21,30.5 ± 1.2,0 ± 1,-143.9
4,1.1.01.01.,Archaerhodopsin-1,1UAZ,Halobacterium sp.,Archaebac.,1,7,31.8 ± 1.3,9 ± 2,-65.3


In [79]:
# Get PDB IDs for alpha-helical transmembrane proteins ('1.1.' families) and
# beta-barrel transmembrane proteins ('1.3.' families).
opm_df = dataset.toPandas()


def _pdbids_with_family_prefix(df, prefix):
    """Return the PDBID values of all rows whose Family code starts with `prefix`.

    Rows with a missing Family value are treated as non-matching (na=False)
    instead of raising, which the previous per-row `s.startswith(...)` did.
    """
    in_family = df['Family'].str.startswith(prefix, na=False)
    return df.loc[in_family, 'PDBID'].tolist()


tm_alpha_pdbids = _pdbids_with_family_prefix(opm_df, '1.1.')
beta_barrel_pdbids = _pdbids_with_family_prefix(opm_df, '1.3.')
# NOTE(review): the preview rows for family 1.1.01.01 are all rhodopsins;
# confirm that this prefix is the intended "GPCR" selection.
gpcrpdbids = _pdbids_with_family_prefix(opm_df, '1.1.01.01')
print(gpcrpdbids)

['5AX0', '1E12', '4JQ6', '2EI4', '1UAZ', '4JR8', '1IW6', '4PXK', '2ZZL', '5AZD', '6EYU', '1H68', '4FBZ', '3A7K', '1PY6', '4QI1', '3VVK', '5JJE', '6EID', '4XTL', '1VGO', '3UG9', '6EIG', '1XIO', '4XTO', '3AM6', '2L6X', '5JSI', '4HYJ', '2M3G', '4TL3', '1M0L', '1FBK', '4WAV', '5B2N', '5VN7', '3DDL', '3X3B', '1H2S', '4KNF', '1AP9']


In [None]:
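# Disabled experiment, kept for reference: keep only PDB IDs that have an MMTF file on RCSB.
# NOTE: `pdbids` used below is not defined in the cells shown here; define it before re-enabling.
# import urllib.request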

#def DepPDBFilter(pdbid):
#    request = urllib.request.Request('https://mmtf.rcsb.org/v1.0/full/' + pdbid)
#    request.get_method = lambda: 'HEAD'
#    try:
#        urllib.request.urlopen(request)
#        return True
#    except urllib.request.HTTPError:
#        return False
#
#cleanpdblist = [i for i in pdbids if DepPDBFilter(i)]
#print(cleanpdblist)

In [12]:
#sppdbids = sc.parallelize(set(pdbids)).filter(lambda t: DepPDBFilter(t))
# structures = mmtfReader.download_full_mmtf_files(cleanpdblist, sc).cache()

In [69]:
def view_proteins(pdb_ids):
    """Browse OPM structures with a slider-driven py3Dmol viewer.

    For the selected entry, ``<pdb_id>.pdb`` is downloaded from the OPM
    server, saved in the current working directory, and rendered.

    Parameters
    ----------
    pdb_ids : sequence of str
        PDB identifiers. Each is lower-cased to build the download URL and
        the local file name.

    Returns
    -------
    The value returned by ``interact`` for the viewer callback.

    Raises
    ------
    ValueError
        If ``pdb_ids`` is empty; a slider over zero structures cannot be built.
    """
    # Validate first so the caller gets a clear error rather than an obscure
    # slider failure caused by max=-1.
    if len(pdb_ids) == 0:
        raise ValueError('pdb_ids must contain at least one PDB ID')

    from io import BytesIO
    import pycurl

    opm_url = 'http://opm.phar.umich.edu/pdb/'

    def view3d(i=0):
        """Download and display the structure at position ``i`` of ``pdb_ids``."""
        pdb_id = pdb_ids[i].lower()
        pdb_file = pdb_id + '.pdb'

        # Download into memory first so that an HTTP error page is never
        # written to disk under a .pdb name.
        buffer = BytesIO()
        c = pycurl.Curl()
        try:
            c.setopt(c.URL, opm_url + pdb_file)
            c.setopt(c.WRITEDATA, buffer)
            c.perform()
            status = c.getinfo(pycurl.RESPONSE_CODE)
        finally:
            # Release the curl handle even when the transfer raises.
            c.close()

        print(f'PDB: {pdb_ids[i]}')

        if status != 200:
            # Report the failure instead of feeding a non-PDB body to the viewer.
            print(f'Could not download {opm_url + pdb_file} (HTTP status {status})')
            return None

        # Keep the on-disk copy that the original implementation produced.
        with open(pdb_file, 'wb') as f:
            f.write(buffer.getvalue())

        with open(pdb_file, 'r') as f:
            structure = f.read()

        viewer = py3Dmol.view()
        viewer.addModel(structure, 'pdb')

        # Style order matters: later calls override earlier ones for the
        # atoms they select.
        viewer.setStyle({'cartoon': {'color': 'spectrum'}})
        viewer.setStyle({'hetflag': True}, {'stick': {}})
        # Residues named DUM are drawn as small spheres — presumably OPM's
        # membrane-boundary dummy atoms; confirm.
        viewer.setStyle({'resn': 'DUM'}, {'sphere': {'radius': 0.3}})
        viewer.zoomTo()
        viewer.rotate(-90, 'x', 1)
        return viewer.show()

    # continuous_update=False: only fetch/render when the slider is released.
    s_widget = IntSlider(min=0, max=len(pdb_ids)-1, description='Structure', continuous_update=False)
    return interact(view3d, i=s_widget)

In [82]:
# Browse the alpha-helical transmembrane structures.
# The trailing ';' suppresses the `<function ...view3d>` repr returned by interact.
view_proteins(tm_alpha_pdbids);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=1228), Output()…

<function __main__.view_proteins.<locals>.view3d>

In [80]:
# Browse the structures selected by the '1.1.01.01' family prefix.
# The trailing ';' suppresses the `<function ...view3d>` repr returned by interact.
view_proteins(gpcrpdbids);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=40), Output()),…

<function __main__.view_proteins.<locals>.view3d>

In [81]:
# Browse the beta-barrel transmembrane structures.
# The trailing ';' suppresses the `<function ...view3d>` repr returned by interact.
view_proteins(beta_barrel_pdbids);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=241), Output())…

<function __main__.view_proteins.<locals>.view3d>

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()