# StructureToFragments

#### The goal of this project is to divide a structure into fragments of a desired length, on a rolling-window basis. Functions written in this notebook are based off of the StructureToPolymerChains class. Once the code is completed, we hope it can be added to mmtf-pyspark.


Authors:
* Oleg Sobolev
* Nicholas Kovacs

In [1]:
from pyspark import SparkConf, SparkContext
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.io.mmtfReader import download_mmtf_files

In [2]:
conf = SparkConf().setMaster("local[4]").setAppName('splitting_mmtf')
sc = SparkContext(conf=conf)

#### Here, we load a small structure composed of 1 chain of 7 amino acid residues. The structure is composed of 2 asymmetric units.

In [3]:
pdbIds = ["1YJP"]
pdb = download_mmtf_files(pdbIds, sc)
mmtfs = pdb.values().collect()
structure = mmtfs[0]

#### A couple of lines are omitted from functions our class is dependent on, due to incompatibility between the MMTFencoder object that our function uses, and the MMTFstructure object that is used by its dependencies.

In [4]:
from mmtfPyspark.utils.dsspSecondaryStructure import *

def print_chain_entity_group_atom_info(structure):
#     structure = _check_structure_or_tuple(structure)

    print("*** CHAIN ENTITY GROUP ATOM DATA ***")
    chainToEntityIndex = _get_chain_to_entity_index(structure)
    chainIndex = 0
    groupIndex = 0
    atomIndex = 0
    for i in range(0, structure.num_models):
        print("model: " + str(i+1))
        for j in range(0, structure.chains_per_model[i]):
            chainName = structure.chain_name_list[chainIndex]
            chainId = structure.chain_id_list[chainIndex]
            groups = structure.groups_per_chain[chainIndex]
            print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))

            entityType = structure.entity_list[chainToEntityIndex[chainIndex]]["type"]
            entityDescription = structure.entity_list[chainToEntityIndex[chainIndex]]["description"]
            entitySequence = structure.entity_list[chainToEntityIndex[chainIndex]]["sequence"]
            print("entity type          : " + entityType);
            print("entity description   : " + entityDescription);
            print("entity sequence      : " + entitySequence);

            for k in range(0, structure.groups_per_chain[chainIndex]):
                groupId = structure.group_id_list[groupIndex]
                insertionCode = structure.ins_code_list[groupIndex]
                secStruct = structure.sec_struct_list[groupIndex]
                seqIndex = structure.sequence_index_list[groupIndex]

                groupType = structure.group_type_list[groupIndex]

                groupName = structure.group_list[groupType]["groupName"]
                chemCompType = structure.group_list[groupType]["chemCompType"]
                oneLetterCode = structure.group_list[groupType]["singleLetterCode"]
                numAtoms = len(structure.group_list[groupType]["atomNameList"])
                numBonds = len(structure.group_list[groupType]["bondOrderList"])

                print("   groupName      : " + groupName)
                print("   oneLetterCode  : " + oneLetterCode)
                print("   seq. index     : " + str(seqIndex))
                print("   numAtoms       : " + str(numAtoms))
                print("   numBonds       : " + str(numBonds))
                print("   chemCompType   : " + chemCompType)
                print("   groupId        : " + str(groupId))
                print("   insertionCode  : " + insertionCode)
                print("   DSSP secStruct.: " + DsspSecondaryStructure.get_dssp_code(secStruct).get_one_letter_code())
                print("   Atoms          : ")

                for m in range(0, (len(structure.group_list[groupType]["atomNameList"]))):
                    atomId = structure.atom_id_list[atomIndex]

#                     if not structure.alt_loc_set:
#                         structure = structure.set_alt_loc_list()

                    altLocId = structure.alt_loc_list[atomIndex]
                    x = structure.x_coord_list[atomIndex]
                    y = structure.y_coord_list[atomIndex]
                    z = structure.z_coord_list[atomIndex]
                    occupancy = structure.occupancy_list[atomIndex]
                    bFactor = structure.b_factor_list[atomIndex]

                    atomName = structure.group_list[groupType]["atomNameList"][m]
                    element = structure.group_list[groupType]["elementList"][m]

                    print("      " + str(atomId) + "\t" + atomName + "\t" + str(altLocId) +
                        "\t" + str(x) + "\t" + str(y) + "\t" + str(z) +
                        "\t" + str(occupancy) + "\t" + str(bFactor) + "\t" + str(element))
                    atomIndex = atomIndex + 1


                groupIndex = groupIndex + 1
            chainIndex = chainIndex + 1
    print('\n')


def print_chain_group_info(structure):
#     structure = _check_structure_or_tuple(structure)
#     structure = structure.set_alt_loc_list()

    print("*** CHAIN AND GROUP DATA ***")
    chainIndex = 0
    groupIndex = 0
    for i in range(0, structure.num_models):
        print("model: " + str(i+1))
        for j in range(0, structure.chains_per_model[i]):
            chainName = structure.chain_name_list[chainIndex]
            chainId = structure.chain_id_list[chainIndex]
            groups = structure.groups_per_chain[chainIndex]
            print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))
            for k in range(0, structure.groups_per_chain[chainIndex]):
                groupId = structure.group_id_list[groupIndex]
                insertionCode = structure.ins_code_list[groupIndex]
                secStruct = structure.sec_struct_list[groupIndex]
                seqIndex = structure.sequence_index_list[groupIndex]

                groupType = structure.group_type_list[groupIndex]

                groupName = structure.group_list[groupType]["groupName"]
                chemCompType = structure.group_list[groupType]["chemCompType"]
                oneLetterCode = structure.group_list[groupType]["singleLetterCode"]
                numAtoms = len(structure.group_list[groupType]["atomNameList"])
                numBonds = len(structure.group_list[groupType]["bondOrderList"])

                print("   groupName      : " + groupName)
                print("   oneLetterCode  : " + oneLetterCode)
                print("   seq. index     : " + str(seqIndex))
                print("   numAtoms       : " + str(numAtoms))
                print("   numBonds       : " + str(numBonds))
                print("   chemCompType   : " + chemCompType)
                print("   groupId        : " + str(groupId))
                print("   insertionCode  : " + insertionCode)
                print("   DSSP secStruct.: " + DsspSecondaryStructure.get_dssp_code(secStruct).get_one_letter_code())

                print()
                groupIndex = groupIndex + 1
            chainIndex = chainIndex + 1
    print() 
    
def _get_num_atoms_and_bonds(structure):
    '''Gets the number of atoms and bonds per chain
    '''
    numChains = structure.chains_per_model[0]
    atomsPerChain = [0] * numChains
    bondsPerChain = [0] * numChains
    groupCounter = 0

    for i in range(numChains):

        for j in range(structure.groups_per_chain[i]):
            groupIndex = structure.group_type_list[groupCounter]
            atomsPerChain[i] += len(structure.group_list[groupIndex]['atomNameList'])
            bondsPerChain[i] += len(structure.group_list[groupIndex]['bondOrderList'])
            groupCounter += 1

    return atomsPerChain, bondsPerChain


def _get_chain_to_entity_index(structure):
    '''Returns an list that maps a chain index to an entity index.
    Attributes
    ----------
        structure: structureDataInterFace
    '''
    entityChainIndex = [0] * structure.num_chains

    for i in range(len(structure.entity_list)):

        for j in structure.entity_list[i]["chainIndexList"]:
            entityChainIndex[j] = i

    return entityChainIndex

#### The get-info function has 2 arguments: the structure of interest and the desired lenth of fragments. It returns dictionary corresponding to each fragment that contains the residues, atoms, and bonds as lists.

In [5]:
from mmtf.api.mmtf_writer import MMTFEncoder
from mmtf.utils import *
import py3Dmol
from mmtfPyspark.utils import traverseStructureHierarchy
from mmtfPyspark.io import mmtfWriter

def get_info(structure, n_groups):
    numChains = structure.chains_per_model[0]
    result = []
    atomCounter = 0
    groupCounter = 0
#     print(len(structure.group_list))
    for i in range(numChains):
        if structure.groups_per_chain[i] < n_groups:
            continue
        we_dont_want_to_skip_chain = True
        
        rr = []
        n_pieces = structure.groups_per_chain[i] // n_groups
        n_rest =  structure.groups_per_chain[i] % n_groups
        #print("pieces,rest", n_pieces, n_rest)
        for j in range(n_pieces):
            dd = {'chain_n':i, 'groups_n':[], 'atoms_n':[]}
            dd['groups_n'] = list(range(groupCounter, groupCounter+n_groups))
            n_bonds = 0
            for k in range(n_groups):
                groupIndex = structure.group_type_list[groupCounter]
                n_bonds += len(structure.group_list[groupIndex]['bondOrderList'])
                for l in range(len(structure.group_list[groupIndex]['atomNameList'])):
                    dd['atoms_n'].append(atomCounter)
                    atomCounter += 1
                groupCounter += 1
            dd['n_bonds'] = n_bonds
            print(dd)
            rr.append(dd)
        if we_dont_want_to_skip_chain:
            result += rr
        # skip the rest residues
        for k in range(n_rest):
            groupIndex = structure.group_type_list[groupCounter]
            atomCounter += len(structure.group_list[groupIndex]['atomNameList'])
            groupCounter += 1
    return result
struct_info = get_info(structure, 3)



{'chain_n': 0, 'groups_n': [0, 1, 2], 'atoms_n': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'n_bonds': 17}
{'chain_n': 0, 'groups_n': [3, 4, 5], 'atoms_n': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45], 'n_bonds': 23}
{'chain_n': 1, 'groups_n': [7, 8, 9], 'atoms_n': [59, 60, 61], 'n_bonds': 0}
{'chain_n': 1, 'groups_n': [10, 11, 12], 'atoms_n': [62, 63, 64], 'n_bonds': 0}


#### The arguments of this function are the desired structure, the dictionary from struct_info, and the chain of interest.

In [6]:
from mmtf.api.mmtf_writer import MMTFEncoder
from mmtf.utils import *
import py3Dmol
from mmtfPyspark.utils import traverseStructureHierarchy
from mmtfPyspark.io import mmtfWriter
                
                
                

def break_into_fragments(structure, struct_info, chain):
    n_groups = len(struct_info[0]['groups_n'])
    result = []
    if not structure.alt_loc_set:
        structure.set_alt_loc_list()
#    numChains = structure.chains_per_model[0]
    chainToEntityIndex = _get_chain_to_entity_index(structure)
#     atomsPerChain, bondsPerChain = _get_num_atoms_and_bonds(structure)
#    chainToEntityIndex = _get_chain_to_entity_index(structure)
    
    
    piece = MMTFEncoder()
    
#    n_bonds = len(structure.group_list[0]['bondOrderList'])
#    n_atoms = len(structure.group_list[0]['atomNameList'])
#    n_groups = 1
#    fragment_number = 1
    
    for fragment_number in range(len(struct_info)):
        
        n_bonds = struct_info[fragment_number]['n_bonds']
        n_atoms = struct_info[fragment_number]['atoms_n']
        n_groups = fragment_number
    
        structureId = structure.structure_id + '.' +\
                  structure.chain_name_list[0] + '.' +\
                  structure.chain_id_list[0] + '.' +\
                  str(fragment_number)

        piece.init_structure(n_bonds, n_atoms, n_groups, 1,1, structureId)

        decoder_utils.add_xtalographic_info(structure, piece)
        decoder_utils.add_header_info(structure, piece)

        # Set model info (only one model: 0)
        piece.set_model_info(0,1)

        current_chain_number = chain
        current_group_number = fragment_number
        entityToChainIndex = chainToEntityIndex[current_chain_number]

    #     entityToChainIndex = chainToEntityIndex[current_chain_number]
    #     chain_type = structure.entity_list[entityToChainIndex]['type']
    #     polymer = chain_type == "polymer"

        # Set entity and chain info
        piece.set_entity_info([0],
            structure.entity_list[entityToChainIndex]['sequence'],
            structure.entity_list[entityToChainIndex]['description'],
            structure.entity_list[entityToChainIndex]['type'])
        piece.set_chain_info(structure.chain_id_list[current_chain_number],
            structure.chain_name_list[current_chain_number],
            1)

        groupIndex = structure.group_type_list[current_group_number]
        piece.set_group_info(structure.group_list[groupIndex]['groupName'],
            structure.group_id_list[current_group_number],
            structure.ins_code_list[current_group_number],
            structure.group_list[groupIndex]['chemCompType'],
            len(structure.group_list[groupIndex]['atomNameList']),
            len(structure.group_list[groupIndex]['bondOrderList']),
            structure.group_list[groupIndex]['singleLetterCode'],
            structure.sequence_index_list[current_group_number],
            structure.sec_struct_list[current_group_number])

        for k in range(len(structure.group_list[groupIndex]['atomNameList'])):
            atomCounter = k
            piece.set_atom_info(
                structure.group_list[groupIndex]['atomNameList'][k],
                structure.atom_id_list[atomCounter],
                structure.alt_loc_list[atomCounter],
                structure.x_coord_list[atomCounter],
                structure.y_coord_list[atomCounter],
                structure.z_coord_list[atomCounter],
                structure.occupancy_list[atomCounter],
                structure.b_factor_list[atomCounter],
                structure.group_list[groupIndex]['elementList'][k],
                structure.group_list[groupIndex]['formalChargeList'][k],)

        for l in range(len(structure.group_list[groupIndex]['bondOrderList'])):
            bondIndOne = structure.group_list[groupIndex]['bondAtomList'][l*2]
            bondIndTwo = structure.group_list[groupIndex]['bondAtomList'][l*2+1]
            bondOrder = structure.group_list[groupIndex]['bondOrderList'][l]

            piece.set_group_bond(bondIndOne, bondIndTwo, bondOrder)
        piece.finalize_structure()
        result.append(piece)

        return result

#### Here, we call our function and traverse its data structure to verify that our function is working.

In [7]:
p = break_into_fragments(structure, struct_info, 1)
print(type(structure))
print(type(p[0]))

traverseStructureHierarchy.print_metadata(p[0])
traverseStructureHierarchy.print_structure_data(p[0])
print_chain_group_info(p[0])
print_chain_entity_group_atom_info(p[0])

<class 'mmtfPyspark.utils.mmtfStructure.MmtfStructure'>
<class 'mmtf.api.mmtf_writer.MMTFEncoder'>
*** METADATA ***
StructureId           : 1YJP.A.A.0
Title                 : Structure of GNNQQNY from yeast prion Sup35
Deposition date       : 2005-01-15
Release date          : 2005-06-14
Experimental method(s): [X-RAY DIFFRACTION]
Resolution            : 1.7999999523162842
Rfree                 : 0.1901399940252304
Rwork                 : 0.1808599978685379

*** STRUCTURE DATA ***
Number of models : 1
Number of chains : 1
Number of groups : 0
Number of atoms : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Number of bonds : 17

*** CHAIN AND GROUP DATA ***
model: 1
chainName: A, chainId: B, groups: 1
   groupName      : GLY
   oneLetterCode  : G
   seq. index     : 0
   numAtoms       : 4
   numBonds       : 3
   chemCompType   : PEPTIDE LINKING
   groupId        : 1
   insertionCode  :  
   DSSP secStruct.: C


*** CHAIN ENTITY GROUP ATOM DATA ***
model: 1
chai

#### This is the first residue from the PDB of 1YJP

#### We can view then fragments

In [8]:
from mmtfPyspark.structureViewer import view_structure

p0 = mmtfWriter.to_mmtf_base64(p[0])
viewer = py3Dmol.view()
viewer.addModel(p0, 'mmtf')
viewer.setStyle({'sphere': {'color': 'spectrum'}})
viewer.show()

In [9]:
sc.stop()

### Feedback: 
Using the Columnar Structure could be an easier avenue for this problem. As of now, we are making a dictionary of lists and using that to build our fragments. The Columnar Structure would allow us to work in a more pandas-oriented design