# Secondary Structure Element Demo

This demo shows how to extract a dataset of secondary structure elements from a protein structure.

## Imports

In [4]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.mappers import structureToPolymerChains
from mmtfPyspark.filters import containsLProteinChain
from mmtfPyspark.datasets import secondaryStructureElementExtractor

## Configure Spark

In [2]:
# Run Spark in local mode ("local[*]") under a descriptive application name.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("secondaryStructureElementDemo")
)

# getOrCreate reuses an already-running context, so re-running this cell does
# not fail the way constructing a second SparkContext in the same kernel would.
sc = SparkContext.getOrCreate(conf=conf)

## Download protein (1STP)

### Note: The SparkContext must be passed as a parameter to download MMTF files

In [5]:
# Download the 1STP entry through the SparkContext and cache the result so
# later cells can reuse it.
pdb = (
    MmtfReader
    .download_mmtf_files(['1STP'], sc)
    .cache()
)

## Map protein to polymer chains and apply LProteinChain filter

In [9]:
# Expand each structure into its polymer chains, then keep only the chains
# accepted by the containsLProteinChain filter.
# NOTE: this rebinds `pdb`, so the cell is meant to be run once per download.
pdb = (
    pdb
    .flatMap(structureToPolymerChains())
    .filter(containsLProteinChain())
)

## Extract secondary structure element 'E'

In [10]:
# Build the dataset of secondary structure elements labelled 'E'.
# NOTE(review): the third argument (6) looks like a minimum element length;
# confirm against the secondaryStructureElementExtractor documentation.
ds = secondaryStructureElementExtractor.getDataset(pdb, 'E', 6)

# Print up to 50 rows without truncating the sequence column.
ds.show(n=50, truncate=False)

[<Row(TFIVTA, E)>, <Row(ALTGTYE, E)>, <Row(VLTGRY, E)>, <Row(TALGWTVAWK, E)>, <Row(NAHSATTWSGQYV, E)>, <Row(INTQWLLTS, E)>, <Row(TLVGHDTFT, E)>]
+-------------+-----+
|sequence     |label|
+-------------+-----+
|TFIVTA       |E    |
|ALTGTYE      |E    |
|VLTGRY       |E    |
|TALGWTVAWK   |E    |
|NAHSATTWSGQYV|E    |
|INTQWLLTS    |E    |
|TLVGHDTFT    |E    |
+-------------+-----+



## Terminate Spark

In [None]:
# Stop the SparkContext created in the "Configure Spark" cell.
sc.stop()