# Secondary Structure Elements Word2Vec Encoder Demo

This demo creates a dataset by extracting secondary structure elements of type "H", then encodes each sequence as an overlapping n-gram Word2Vec feature vector.

## Imports

In [1]:
from pyspark import SQLContext
from pyspark.sql import SparkSession
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader

## Configure Spark

In [2]:
# Obtain the Spark session for this demo (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("SecondaryStructureElementsWord2VecEncoder")
    .getOrCreate()
)

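## Read MMTF Hadoop sequence file and sample L-protein chains

Read the MMTF Hadoop sequence file, split each structure into polymer chains, keep the L-protein chains, and draw a random 5% sample with a fixed seed. Note that no Pisces filter is applied in this cell, so the sample is not restricted to a non-redundant set (<=20% seq. identity).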

In [3]:
# Input location and sampling parameters for this demo.
path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

# Pipeline: read structures -> flatten to polymer chains -> keep L-protein
# chains -> sample without replacement (first argument False) using a fixed seed.
pdb = (
    mmtfReader.read_sequence_file(path)
    .flatMap(StructureToPolymerChains(False, True))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)

## Extract Element "H" from Secondary Structure

In [4]:
# Secondary structure element code to extract.
label = "H"

# Build the dataset of matching sequence segments and cache it, because it is
# evaluated more than once below (count, then show).
data = secondaryStructureElementExtractor.get_dataset(pdb, label)
data = data.cache()

print(f"original data   : {data.count()}")
# Show the first 10 rows without truncating the sequence column.
data.show(10, truncate=False)

original data   : 3417
+--------------------+-----+
|sequence            |label|
+--------------------+-----+
|STALNERI            |H    |
|RAWVKLISSHDKLVSDLVRR|H    |
|EQAVRCGIELQRALRRN   |H    |
|NVAMAARVAAQ         |H    |
|PVRDA               |H    |
|NKMEEKAPLLLQEDFNM   |H    |
|KLKVAWEEAKKRWNNI    |H    |
|FHGTALVAY           |H    |
|AVDFNRAVR           |H    |
|AFHYYLTRALQL        |H    |
+--------------------+-----+
only showing top 10 rows



## Word2Vec encoded feature Vector

In [5]:
segmentLength = 11
n = 2  # n-gram size; 2 yields overlapping two-residue words such as "ST", "TA"
# The Word2Vec window size is an integer-valued parameter, so use floor
# division: "/" is true division in Python 3 and would produce the float 5.0.
windowSize = (segmentLength - 1) // 2
vectorSize = 50  # size of the Word2Vec feature vector

encoder = ProteinSequenceEncoder(data)
# overlapping_ngram_word2vec_encode is called with keyword arguments.
# The result is bound to a new name so that `data` keeps referring to the
# un-encoded dataset; re-running this cell then never feeds already-encoded
# rows back into the encoder.
data_encoded = encoder.overlapping_ngram_word2vec_encode(
    n=n, windowSize=windowSize, vectorSize=vectorSize
)

data_encoded.show(5)

+--------------------+-----+--------------------+--------------------+
|            sequence|label|               ngram|            features|
+--------------------+-----+--------------------+--------------------+
|            STALNERI|    H|[ST, TA, AL, LN, ...|[-0.1606897578707...|
|RAWVKLISSHDKLVSDLVRR|    H|[RA, AW, WV, VK, ...|[-0.2602296720602...|
|   EQAVRCGIELQRALRRN|    H|[EQ, QA, AV, VR, ...|[-0.0787890475476...|
|         NVAMAARVAAQ|    H|[NV, VA, AM, MA, ...|[0.14959662854671...|
|               PVRDA|    H|    [PV, VR, RD, DA]|[-0.1039832192473...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



## Terminate Spark Context

In [6]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()