# Mine for Protein Complexes in the PDB
This notebook creates a list of the UniProt IDs of proteins involved in protein-protein interactions in homomeric or heteromeric complexes.

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.webfilters import AdvancedQuery, PdbjMineSearch
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.datasets import polymerSequenceExtractor, pdbjMineDataset
from mmtfPyspark.io import mmtfReader

### Configure Spark Context

In [2]:
# Spark configuration: master "local[*]", application name "proteinComplexes"
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("proteinComplexes")
)
sc = SparkContext(conf=conf)

## Read PDB

In [3]:
# Read the PDB structures from the reduced MMTF Hadoop Sequence file. The path printed in the
# cell output comes from MMTF_REDUCED (presumably an environment variable -- confirm).
pdb = mmtfReader.read_reduced_sequence_file(sc)

Hadoop Sequence file path: MMTF_REDUCED=/Users/peter/MMTF_Files/reduced


## Filter by type of complex

In [4]:
# Stoichiometry class to mine; it is inserted into the query and used as the output file name.
# Set to "homomer" to mine homomeric complexes instead.
complexType = "heteromer"

In [5]:
# Assemble the advanced-query XML: a StoichiometryQuery for the selected complex type
query = "".join([
    "<orgPdbQuery>",
    "<queryType>org.pdb.query.simple.StoichiometryQuery</queryType>",
    "<stoichiometry>", complexType, "</stoichiometry>",
    "</orgPdbQuery>",
])

# Filter the structures with the advanced query
stoichiometryFilter = AdvancedQuery(query)
pdb = pdb.filter(stoichiometryFilter)

## Split PDB structures into polymer chains

In [6]:
# Break every structure into its polymer chains (duplicates excluded) and cache the
# result, since it is reused by the steps that follow
chainSplitter = StructureToPolymerChains(excludeDuplicates=True)
pdb = pdb.flatMap(chainSplitter).cache()

# Number of polymer chains after the split
pdb.count()

82525

## Filter chains by taxonomy

In [7]:
# SQL for the SIFTS chain-taxonomy table: select the rows whose organism is human.
# The scientific name must be spelled exactly ('Homo sapiens') or no rows match.
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy WHERE scientific_name = 'Homo sapiens'"

In [8]:
# Filter the polymer chains with the PDBj Mine taxonomy search
taxonomyFilter = PdbjMineSearch(taxonomyQuery)
pdb = pdb.filter(taxonomyFilter)

## Extract dataset with chain ids

In [9]:
# Build the polymer sequence dataset and keep only its chain id column
sequences = polymerSequenceExtractor.get_dataset(pdb)
chains = sequences.select("structureChainId")

## Get list of chain ids with UniProt Id mappings

In [10]:
# Fetch the SIFTS chain-to-UniProt table, keep the chain id and sp_primary columns,
# and rename the chain id column to "id" so it does not clash with `chains` in the join
uniprotQuery = "SELECT * FROM sifts.pdb_chain_uniprot"
uniprot = (
    pdbjMineDataset.get_dataset(uniprotQuery)
    .select("structureChainId", "sp_primary")
    .withColumnRenamed("structureChainId", "id")
)

## Join the homomer/heteromer chains with the UniProt ids

In [11]:
# Match each selected chain to its UniProt row by chain id, then drop the duplicate
# key column and cache the joined dataset for the cells that follow
sameChain = chains.structureChainId == uniprot.id
chains = chains.join(uniprot, sameChain).drop("id").cache()

In [12]:
# Number of rows in the joined chain / UniProt dataset
chains.count()

16886

In [13]:
# Preview the first rows of the chain id -> UniProt accession (sp_primary) mapping
chains.show()

+----------------+----------+
|structureChainId|sp_primary|
+----------------+----------+
|          4WMI.D|    P00740|
|          4WMK.D|    P00740|
|          4WN2.D|    P00740|
|          4WND.A|    P81274|
|          4WND.B|    Q14CM0|
|          4WNH.D|    P00740|
|          4WRL.A|    P07333|
|          4WRL.B|    P09603|
|          4WRM.A|    P07333|
|          4WRM.B|    P09603|
|          1GXD.A|    P08253|
|          1GXD.C|    P16035|
|          1GY3.A|    P24941|
|          1GY3.B|    P20248|
|          1GZH.A|    P04637|
|          1GZH.B|    Q12888|
|          1GZH.C|    P04637|
|          1GZS.A|    P60953|
|          1GZX.A|    P69905|
|          1GZX.B|    P68871|
+----------------+----------+
only showing top 20 rows



In [14]:
# Collect the distinct UniProt accessions into pandas and write them to a CSV file
# named after the complex type
uniqueUniprotIds = chains.select("sp_primary").distinct().toPandas()
uniqueUniprotIds.to_csv("../../data/" + complexType + ".csv")

In [15]:
# Shut down the SparkContext created at the top of the notebook
sc.stop()