# Mine PDB for Protein Complexes
This notebook creates a list of UniProt Id of protein homomer/heteromers in the PDB.

In [1]:
from pyspark.sql import SparkSession
from ipywidgets import widgets
from mmtfPyspark.webfilters import AdvancedQuery, PdbjMineSearch
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.datasets import polymerSequenceExtractor, pdbjMineDataset, advancedSearchDataset
from mmtfPyspark.io import mmtfReader

#### Configure Spark

In [2]:
spark =  SparkSession.builder.master("local[4]").appName("ProteinComplexesV2").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Select type of complex

In [3]:
selection = widgets.RadioButtons(options=['homomer', 'heteromer'],description='Complex:')

In [4]:
selection

RadioButtons(description='Complex:', options=('homomer', 'heteromer'), value='homomer')

In [5]:
complexType = selection.value

In [6]:
complexType

'homomer'

## Find biological assemblies of the requested type

In [7]:
query = (
    "<orgPdbQuery>"
        "<queryType>org.pdb.query.simple.StoichiometryQuery</queryType>"
        "<stoichiometry>" + complexType + "</stoichiometry>"
    "</orgPdbQuery>"
)

complexes = advancedSearchDataset.get_dataset(query).cache()
complexes.show()
complexes.count()

  return f(*args, **kwds)
  return f(*args, **kwds)


+-----------+
|structureId|
+-----------+
|       10GS|
|       117E|
|       11AS|
|       11BA|
|       11BG|
|       11GS|
|       121P|
|       12AS|
|       12GS|
|       137L|
|       13GS|
|       14GS|
|       16GS|
|       17GS|
|       18GS|
|       19GS|
|       19HC|
|       1A03|
|       1A05|
|       1A07|
+-----------+
only showing top 20 rows



51566

## Filter chains by taxonomy

In [8]:
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy WHERE scientific_name = 'Homo sampiens'"

In [9]:
taxonomy = pdbjMineDataset.get_dataset(taxonomyQuery).cache()
taxonomy.show()

+-----+-----+------+---------------+----------------+
|pdbid|chain|tax_id|scientific_name|structureChainId|
+-----+-----+------+---------------+----------------+
| 10GS|    A|  9606|  Homo sampiens|          10GS.A|
| 10GS|    B|  9606|  Homo sampiens|          10GS.B|
| 11GS|    A|  9606|  Homo sampiens|          11GS.A|
| 11GS|    B|  9606|  Homo sampiens|          11GS.B|
| 121P|    A|  9606|  Homo sampiens|          121P.A|
| 12CA|    A|  9606|  Homo sampiens|          12CA.A|
| 12GS|    A|  9606|  Homo sampiens|          12GS.A|
| 12GS|    B|  9606|  Homo sampiens|          12GS.B|
| 133L|    A|  9606|  Homo sampiens|          133L.A|
| 134L|    A|  9606|  Homo sampiens|          134L.A|
| 13GS|    A|  9606|  Homo sampiens|          13GS.A|
| 13GS|    B|  9606|  Homo sampiens|          13GS.B|
| 14GS|    A|  9606|  Homo sampiens|          14GS.A|
| 14GS|    B|  9606|  Homo sampiens|          14GS.B|
| 16GS|    A|  9606|  Homo sampiens|          16GS.A|
| 16GS|    B|  9606|  Homo s

In [10]:
human_complexes = taxonomy.join(complexes, taxonomy.pdbid == complexes.structureId).drop("structureId").cache()
human_complexes.show()
human_complexes.count()

+-----+-----+------+---------------+----------------+
|pdbid|chain|tax_id|scientific_name|structureChainId|
+-----+-----+------+---------------+----------------+
| 10GS|    A|  9606|  Homo sampiens|          10GS.A|
| 10GS|    B|  9606|  Homo sampiens|          10GS.B|
| 11GS|    A|  9606|  Homo sampiens|          11GS.A|
| 11GS|    B|  9606|  Homo sampiens|          11GS.B|
| 121P|    A|  9606|  Homo sampiens|          121P.A|
| 12GS|    A|  9606|  Homo sampiens|          12GS.A|
| 12GS|    B|  9606|  Homo sampiens|          12GS.B|
| 13GS|    A|  9606|  Homo sampiens|          13GS.A|
| 13GS|    B|  9606|  Homo sampiens|          13GS.B|
| 14GS|    A|  9606|  Homo sampiens|          14GS.A|
| 14GS|    B|  9606|  Homo sampiens|          14GS.B|
| 16GS|    A|  9606|  Homo sampiens|          16GS.A|
| 16GS|    B|  9606|  Homo sampiens|          16GS.B|
| 17GS|    A|  9606|  Homo sampiens|          17GS.A|
| 17GS|    B|  9606|  Homo sampiens|          17GS.B|
| 18GS|    A|  9606|  Homo s

24478

## Get list of chain ids with UniProt Id mappings

In [11]:
uniprotQuery = "SELECT * FROM sifts.pdb_chain_uniprot"
uniprot = pdbjMineDataset.get_dataset(uniprotQuery)
uniprot = uniprot.select("structureChainId", "sp_primary").withColumnRenamed("structureChainId", "id")

## Join dataset with the UniProt ids

In [12]:
human_complexes = human_complexes.join(uniprot, human_complexes.structureChainId == uniprot.id).drop("id").cache()

In [13]:
human_complexes.show()
human_complexes.select("sp_primary").distinct().count()

+-----+-----+------+---------------+----------------+----------+
|pdbid|chain|tax_id|scientific_name|structureChainId|sp_primary|
+-----+-----+------+---------------+----------------+----------+
| 10GS|    A|  9606|  Homo sampiens|          10GS.A|    P09211|
| 10GS|    B|  9606|  Homo sampiens|          10GS.B|    P09211|
| 11GS|    A|  9606|  Homo sampiens|          11GS.A|    P09211|
| 11GS|    B|  9606|  Homo sampiens|          11GS.B|    P09211|
| 121P|    A|  9606|  Homo sampiens|          121P.A|    P01112|
| 12GS|    A|  9606|  Homo sampiens|          12GS.A|    P09211|
| 12GS|    B|  9606|  Homo sampiens|          12GS.B|    P09211|
| 13GS|    A|  9606|  Homo sampiens|          13GS.A|    P09211|
| 13GS|    B|  9606|  Homo sampiens|          13GS.B|    P09211|
| 14GS|    A|  9606|  Homo sampiens|          14GS.A|    P09211|
| 14GS|    B|  9606|  Homo sampiens|          14GS.B|    P09211|
| 16GS|    A|  9606|  Homo sampiens|          16GS.A|    P09211|
| 16GS|    B|  9606|  Hom

2336

## Save unique list of uniProt Ids involved in PDB complexes

In [14]:
human_complexes.select("sp_primary").distinct().toPandas().to_csv("../../data/" + complexType + ".csv")

In [15]:
spark.stop()