# Secondary Structure Shifted Word2Vec Encoder

This demo creates a dataset of sequence segments derived from a non-redundant set. The dataset contains the sequence segment, the DSSP Q8 and DSSP Q3 codes of the center residue in each sequence segment, and a 3-gram shifted Word2Vec encoding of the sequence segment.

## Imports

In [1]:
from pyspark import SQLContext
from pyspark.sql import SparkSession
from mmtfPyspark.ml import ProteinSequenceEncoder
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.datasets import secondaryStructureSegmentExtractor
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.io import mmtfReader
import time

## Configure Spark

In [2]:
# Obtain the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("SecondaryStructureShiftedWord2VecEncoder")
    .getOrCreate()
)

## Read in, filter and sample Hadoop Sequence Files

In [3]:
# Directory holding the sample Hadoop sequence files (relative to this notebook).
path = "../../resources/mmtf_reduced_sample/"

# Cutoffs handed to the Pisces filter.
sequenceIdentity = 20
resolution = 2.0

# Sampling settings: fraction of chains to keep and a fixed seed so the
# sample is reproducible between runs.
fraction = 0.1
seed = 123

# Read the structures, split them into polymer chains, keep only the chains
# that pass the Pisces and L-protein filters, then sample without replacement.
pdb = (
    mmtfReader.read_sequence_file(path)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(withReplacement=False, fraction=fraction, seed=seed)
)

## Extract Secondary Structure Segments

In [4]:
# Number of residues in each extracted sequence segment.
segmentLength = 25

# Build the segment dataset from the sampled chains; cache it because the
# encoder in the next step reads it.
data = secondaryStructureSegmentExtractor.get_dataset(
    pdb, segmentLength
).cache()

## Add Word2Vec encoded feature vector

In [5]:
encoder = ProteinSequenceEncoder(data)

# Window size is derived from the segment length; vector size is fixed.
windowSize = (segmentLength - 1) // 2
vectorSize = 50

# shifted_3gram_word2vec_encode is called with keyword arguments.
data = encoder.shifted_3gram_word2vec_encode(
    windowSize=windowSize,
    vectorSize=vectorSize,
).cache()

root
 |-- ngram0: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngram1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngram2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- structureChainId: string (nullable = false)
 |-- sequence: string (nullable = false)
 |-- labelQ8: string (nullable = false)
 |-- labelQ3: string (nullable = false)
 |-- feature2: vector (nullable = true)
 |-- feature1: vector (nullable = true)
 |-- feature0: vector (nullable = true)
 |-- features: vector (nullable = true)



## Show dataset schema and a few rows of data

In [6]:
# Print the column layout, then the first 10 rows with untruncated values.
data.printSchema()
data.show(n=10, truncate=False)

root
 |-- structureChainId: string (nullable = false)
 |-- sequence: string (nullable = false)
 |-- labelQ8: string (nullable = false)
 |-- labelQ3: string (nullable = false)
 |-- ngram0: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngram1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngram2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- feature0: vector (nullable = true)
 |-- feature1: vector (nullable = true)
 |-- feature2: vector (nullable = true)
 |-- features: vector (nullable = true)

+----------------+-------------------------+-------+-------+----------------------------------------+----------------------------------------+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Only the preview rows are brought to the driver. Calling toPandas() on the
# full dataset would collect every segment into local memory just to display
# ten of them. Note that `df` therefore holds at most 10 rows.
df = data.limit(10).toPandas()

df

Unnamed: 0,structureChainId,sequence,labelQ8,labelQ3,ngram0,ngram1,ngram2,feature0,feature1,feature2,features
0,5JJE.B,AAAATGDAAAVQEAAVSAILGLIIL,H,H,"[AAA, ATG, DAA, AVQ, EAA, VSA, ILG, LII]","[AAA, TGD, AAA, VQE, AAV, SAI, LGL, IIL]","[AAT, GDA, AAV, QEA, AVS, AIL, GLI, IL]","[1.3077131249010563, 0.7122090365737677, -0.37...","[0.31528071127831936, 1.2591924294829369, -0.0...","[0.42587000131607056, 1.7479266859591007, -0.8...","[0.6829546124984821, 1.239776050671935, -0.409..."
1,3Q4A.B,AAACYGRAITRNPLVAVYYTNRALC,T,C,"[AAA, CYG, RAI, TRN, PLV, AVY, YTN, RAL]","[AAC, YGR, AIT, RNP, LVA, VYY, TNR, ALC]","[ACY, GRA, ITR, NPL, VAV, YYT, NRA, LC]","[0.36003951728343964, 0.21338949352502823, -0....","[-1.0577798932790756, -0.4433322809636593, 0.8...","[-0.5259715355932713, -0.2866472825407982, -1....","[-0.40790397052963573, -0.17219668999314308, -..."
2,3JYO.A,AAADGVVNATPMGMPAHPGTAFDVS,T,C,"[AAA, DGV, VNA, TPM, GMP, AHP, GTA, FDV]","[AAD, GVV, NAT, PMG, MPA, HPG, TAF, DVS]","[ADG, VVN, ATP, MGM, PAH, PGT, AFD, VS]","[-2.1520258113741875, 1.5814734399318695, 2.72...","[-0.9818067885935307, 0.10639189183712006, 0.5...","[0.7791224308311939, 0.23633908666670322, -0.3...","[-0.7849033897121748, 0.6414014728118976, 0.95..."
3,2WOL.A,AAGLPDGFRAVIGTQRGKFRLVADA,E,E,"[AAG, LPD, GFR, AVI, GTQ, RGK, FRL, VAD]","[AGL, PDG, FRA, VIG, TQR, GKF, RLV, ADA]","[GLP, DGF, RAV, IGT, QRG, KFR, LVA, DA]","[-0.9328930005431175, -0.30927836149930954, -0...","[0.5547036295756698, -0.1040046401321888, 0.59...","[-0.4442402692511678, 0.05184726230800152, 1.6...","[-0.2741432134062052, -0.12047857977449894, 0...."
4,4ZW9.A,AAHYLGAYVFIIFTGFLITFLAFTF,H,H,"[AAH, YLG, AYV, FII, FTG, FLI, TFL, AFT]","[AHY, LGA, YVF, IIF, TGF, LIT, FLA, FTF]","[HYL, GAY, VFI, IFT, GFL, ITF, LAF, TF]","[0.6695579942315817, -1.2924418151378632, 0.17...","[-1.0392890851944685, 0.04093315452337265, 3.3...","[-1.8265038803219795, 0.3357615452259779, -0.4...","[-0.7320783237616221, -0.3052490384628375, 1.0..."
5,4JDU.A,AAINLAARSFTPQEGVGRAIVVITD,C,C,"[AAI, NLA, ARS, FTP, QEG, VGR, AIV, VIT]","[AIN, LAA, RSF, TPQ, EGV, GRA, IVV, ITD]","[INL, AAR, SFT, PQE, GVG, RAI, VVI, TD]","[0.5898258611559868, -1.6104249842464924, 0.05...","[-1.1192478314042091, 1.6043020384386182, 1.45...","[0.4854221139103174, 1.0365443676710129, 0.100...","[-0.014666618779301643, 0.3434738072877129, 0...."
6,2D7V.B,AAKQRYLVESYTDNAVGILGKNSKG,E,E,"[AAK, QRY, LVE, SYT, DNA, VGI, LGK, NSK]","[AKQ, RYL, VES, YTD, NAV, GIL, GKN, SKG]","[KQR, YLV, ESY, TDN, AVG, ILG, KNS, KG]","[1.117486486211419, 0.6020313552580774, 1.1991...","[-0.9790477193892002, 1.155381366610527, -0.12...","[0.41564929485321045, 0.4955863244831562, 1.12...","[0.18469602055847645, 0.7509996821172535, 0.73..."
7,2OQM.D,AATIQISQPRWQGKYLTGYEFAIEH,T,C,"[AAT, IQI, SQP, RWQ, GKY, LTG, YEF, AIE]","[ATI, QIS, QPR, WQG, KYL, TGY, EFA, IEH]","[TIQ, ISQ, PRW, QGK, YLT, GYE, FAI, EH]","[0.32192998845130205, 0.06875469558872283, -0....","[0.5564863681793213, -0.4034331664443016, 0.73...","[-0.9259331449866295, 2.6121824011206627, -0.4...","[-0.01583892945200205, 0.759167976755028, -0.2..."
8,2QED.A,ADIGSMNLNSIPAFQDNYIWVLTND,E,E,"[ADI, GSM, NLN, SIP, AFQ, DNY, IWV, LTN]","[DIG, SMN, LNS, IPA, FQD, NYI, WVL, TND]","[IGS, MNL, NSI, PAF, QDN, YIW, VLT, ND]","[0.17318073846399784, 0.4019491821527481, 0.07...","[-2.0795037001371384, 1.5218573417514563, -0.1...","[0.2228504689410329, -1.0630179531872272, 0.79...","[-0.5611574975773692, 0.2869295235723257, 0.22..."
9,3I4G.A,ADLYESYEFKDGTPFSYDDPRYDPS,C,C,"[ADL, YES, YEF, KDG, TPF, SYD, DPR, YDP]","[DLY, ESY, EFK, DGT, PFS, YDD, PRY, DPS]","[LYE, SYE, FKD, GTP, FSY, DDP, RYD, PS]","[-0.5635035485029221, 1.0982801094651222, -0.1...","[1.8807343170046806, -0.031605927273631096, 0....","[-1.2648962195962667, 1.233267653733492, 0.492...","[0.017444849635163944, 0.766647278641661, 0.29..."


## Terminate Spark Context

In [8]:
# Stop the Spark session created at the top of the notebook.
spark.stop()