# Protein Fold Dataset Creator Demo

This demo is a simple example of using Dataset operations to create a dataset.

## Imports

In [1]:
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

## Define add_protein_fold_type function

In [2]:
def add_protein_fold_type(data, minThreshold, maxThreshold):
    """Append a "foldType" column labelling each row by its alpha/beta content.

    The rules are tried in order and the first match wins:

        "alpha"       alpha > maxThreshold and beta  < minThreshold
        "beta"        beta  > maxThreshold and alpha < minThreshold
        "alpha+beta"  alpha > maxThreshold and beta  > minThreshold
        "other"       everything else

    NOTE(review): the "alpha+beta" rule compares beta against minThreshold
    (not maxThreshold), so the rules are not symmetric in alpha and beta --
    confirm this is intended.

    Relies on two names from pyspark.sql.functions being in scope:
    ``col`` and ``when``.

    Args:
        data (Dataset<Row>): input dataset with "alpha" and "beta" columns
        minThreshold (float): lower cut-off; a structure type whose value is
            below it counts as absent in the "alpha" and "beta" rules
        maxThreshold (float): upper cut-off; a structure type whose value is
            above it counts as present

    Returns:
        The input dataset with the additional "foldType" string column.
    """
    alpha = col("alpha")
    beta = col("beta")

    # One boolean column expression per fold class.
    mostly_alpha = (alpha > maxThreshold) & (beta < minThreshold)
    mostly_beta = (beta > maxThreshold) & (alpha < minThreshold)
    alpha_and_beta = (alpha > maxThreshold) & (beta > minThreshold)

    fold_type = (
        when(mostly_alpha, "alpha")
        .when(mostly_beta, "beta")
        .when(alpha_and_beta, "alpha+beta")
        .otherwise("other")
    )
    return data.withColumn("foldType", fold_type)

## Configure Spark

In [3]:
# Create (or reuse) the Spark session used by the rest of this demo.
spark = (
    SparkSession.builder
    .appName("ProteinFoldDatasetCreatorDemo")
    .getOrCreate()
)

## Read MMTF Hadoop sequence file

Create a non-redundant set (<= 40% sequence identity) of L-protein chains

In [4]:
# Location of the MMTF Hadoop sequence file sample.
path = "../../resources/mmtf_reduced_sample/"

# Cut-offs handed to the Pisces filter.
sequenceIdentity = 40
resolution = 2.0

# The Pisces filter is applied twice on purpose: once to the entries as read,
# and once more after flatMap has expanded them with StructureToPolymerChains
# (presumably structure level first, then chain level -- confirm).
pdb = (
    mmtfReader.read_sequence_file(path)
    .filter(Pisces(sequenceIdentity, resolution))
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
)

## Get secondary structure content

In [5]:
# Build the secondary structure dataset from the filtered chains; the schema
# printed later in this notebook shows it carries alpha, beta and coil columns
# alongside the sequence and DSSP codes.
data = secondaryStructureExtractor.get_dataset(pdb)

## Classify chains by secondary structure type

In [6]:
# Cut-offs on the alpha/beta content used to assign the fold type.
minThreshold = 0.05
maxThreshold = 0.15

# Adds the "foldType" column to the dataset.
data = add_protein_fold_type(
    data,
    minThreshold=minThreshold,
    maxThreshold=maxThreshold,
)

## Add Word2Vec encoded feature vector

In [7]:
# Encoding parameters.
n = 2  # create 2-grams
windowSize = 25  # 25-amino residue window size for Word2Vec
vectorSize = 50  # dimension of feature vector

encoder = ProteinSequenceEncoder(data)

# overlapping_ngram_word2vec_encode is called with keyword arguments; the
# result is cached because it is displayed here and used again afterwards.
data = (
    encoder
    .overlapping_ngram_word2vec_encode(
        n=n,
        windowSize=windowSize,
        vectorSize=vectorSize,
    )
    .cache()
)

# Inspect the resulting schema and the first ten rows.
data.printSchema()
data.show(10)

root
 |-- structureChainId: string (nullable = false)
 |-- sequence: string (nullable = false)
 |-- alpha: float (nullable = false)
 |-- beta: float (nullable = false)
 |-- coil: float (nullable = false)
 |-- dsspQ8Code: string (nullable = false)
 |-- dsspQ3Code: string (nullable = false)
 |-- foldType: string (nullable = false)
 |-- ngram: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+----------------+--------------------+-----------+-----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
|structureChainId|            sequence|      alpha|       beta|      coil|          dsspQ8Code|          dsspQ3Code|  foldType|               ngram|            features|
+----------------+--------------------+-----------+-----------+----------+--------------------+--------------------+----------+--------------------+--------------------+
|          4WMY.B|TDWSHPQFEKSTDEAN

## Keep only a subset of relevant fields for further processing

In [8]:
# Drop the sequence, DSSP code and n-gram columns; keep the identifier, the
# secondary structure content, the fold label and the feature vector.
data = data.select(
    "structureChainId",
    "alpha",
    "beta",
    "coil",
    "foldType",
    "features",
)

# Display the first ten rows of the reduced dataset.
data.show(10)

+----------------+-----------+-----------+----------+----------+--------------------+
|structureChainId|      alpha|       beta|      coil|  foldType|            features|
+----------------+-----------+-----------+----------+----------+--------------------+
|          4WMY.B| 0.17081851| 0.26334518| 0.5658363|alpha+beta|[0.16776788024934...|
|          4WN5.A|  0.2962963| 0.37962964|0.32407406|alpha+beta|[0.21019394612429...|
|          4WND.B|0.115384616|        0.0|0.88461536|     other|[0.09861392040665...|
|          4WP6.A| 0.45695364|0.119205296|0.42384106|alpha+beta|[0.04305993950833...|
|          4WP9.A|  0.3939394|  0.3151515|0.29090908|alpha+beta|[-0.0690641615814...|
|          4WPG.A| 0.39372823| 0.17073171|0.43554008|alpha+beta|[0.00626527878921...|
|          4WPK.A|  0.4122807|0.114035085|0.47368422|alpha+beta|[-0.2018287289683...|
|          4WQD.A|  0.3991228|0.057017542|0.54385966|alpha+beta|[-0.1654694509651...|
|          4WRI.A| 0.62032086|0.053475935| 0.3262032|a

## Terminate Spark Context

In [9]:
# Stop the Spark session created at the start of the demo.
spark.stop()