# Sequence Similarity Search Demo

This demo filters PDB chains by sequence similarity using the RCSB PDB web services.


## Imports

In [1]:
from pyspark.sql import SparkSession
from mmtfPyspark.webfilters import SequenceSimilarity
from mmtfPyspark.mappers import StructureToPolymerChains, StructureToPolymerSequences
from mmtfPyspark.io import mmtfReader

## Configure Spark

In [2]:
# Build the session through the builder's getOrCreate() under this demo's app name.
spark = (
    SparkSession.builder
    .appName("SequenceSimilarityDemo")
    .getOrCreate()
)

## Read PDB structures in MMTF format, split them into polymer chains, filter by sequence similarity, and print the sequences found

In [3]:
path = "../../resources/mmtf_reduced_sample/"

# Load the sample and flatten every structure into its polymer chains.
polymer_chains = mmtfReader.read_sequence_file(path).flatMap(StructureToPolymerChains())

# Sequence-similarity web filter configured for a BLAST search of the query
# sequence with the e-value and sequence-identity cutoffs below.
blast_filter = SequenceSimilarity(
    sequence="MNVRATYTVIFKNASGLPNGYDNWGWGCTLS",
    searchTool=SequenceSimilarity.BLAST,
    eValueCutoff=0.001,
    sequenceIdentityCutoff=40,
    maskLowComplexity=True,
)

# Apply the filter and collect the surviving (id, structure) pairs.
pdb = polymer_chains.filter(blast_filter).collect()

# Print each hit's id followed by the sequence of its first entity.
for pdbId, structure in pdb:
    print(f"{pdbId} :     {structure.entity_list[0]['sequence']}")

1GWM.A :     MNVRATYTVIFKNASGLPNGYDNWGWGCTLSYYGGAMIINPQEGKYGAVSLKRNSGSFRGGSLRFDMKNEGKVKILVENSEADEKFEVETISPSDEYVTYILDVDFDLPFDRIDFQDAPGNGDRIWIKNLVHSTGSADDFVDPINLEHHHHHH


## Terminate Spark Context

In [4]:
# Stop the `spark` session that was created in the "Configure Spark" cell.
spark.stop()