# Melodia: A Python Library for Protein Structure and Dynamics Analysis

## Alignment Cookbook

In [1]:
import warnings

import dill

import pandas as pd
import melodia as mel

from os import path
from Bio.PDB.PDBExceptions import PDBConstructionWarning



In [2]:
# Ignore every warning whose category is PDBConstructionWarning
# (the class imported from Bio.PDB.PDBExceptions above).
warnings.filterwarnings("ignore", category=PDBConstructionWarning)

### Melodia can parse an alignment in the PIR file format

In [3]:
# Cache the parsed alignment with dill: the first run parses 'model.ali' and
# stores the result in 'model.dill'; later runs reload that file instead.
# NOTE: like pickle, dill can run code while loading, so only load cache
# files that you trust.
if not path.exists('model.dill'):
    # No cache yet: parse the PIR alignment and save it for next time.
    align = mel.parser_pir_file('model.ali')
    with open('model.dill', 'wb') as file:
        dill.dump(align, file)
else:
    # Cache found: restore the previously stored alignment.
    with open('model.dill', 'rb') as file:
        align = dill.load(file)

### The result is a BioPython alignment 
#### https://biopython.org/docs/1.74/api/Bio.Align.html

#### Melodia looks for PDB files using the record.id information.
Example: >P1;1cdoa needs a 1cdoa.pdb file for parsing

In [4]:
# Print only the first record of the alignment: the loop is left right
# after the first iteration.
for record in align:
    print(record)
    break

ID: 1cdoa
Name: 1cdoa
Description: structureX:1cdoa:   1 :A: 374 :A:undefined:undefined::
Number of features: 0
/PIR-type=P1
/molecule_type=protein
Per letter annotation for: curvature, torsion, arc_length, writhing, phi, psi
Seq('ATVGKVIKCKAAVAWEANKPLVIEEIEVDVPHANEIRIKIIATGVCHTDLYHLF...LSL')


### All geometric attributes can be accessed through the letter_annotations functionality.
#### https://biopython.org/docs/1.75/api/Bio.SeqRecord.html

In [5]:
# Pick the record at index 2 of the alignment and print, one per line,
# its id, its sequence and the keys of its per-letter annotations.
record = align[2]
print(
    record.id,
    record.seq,
    record.letter_annotations.keys(),
    sep="\n",
)

1teha
--ANEVIKCKAAVAWEAGKPLSIEEIEVAPPKAHEVRIKIIATAVCHTDAYTLSGADPEGCFPVILGHEGAGIVESVGEGVTKLKAGDTVIPLYIPQCGECKFCLNPKTNLCQKIRVTQGKGLMPDGTSRFTCKGKTILHYMGTSTFSEYTVVADISVAKIDPLAPLDKVCLLGCGISTGYGAAVNTAKLEPGSVCAVFGLGGVGLAVIMGCKVAGASRIIGVDINKDKFARAKEFGATECINPQDFSKPIQEVLIEMTDGGVDYSFECIGNVKVMRAALEACHKGWGVSVVVGVAASGEEIATRPFQLVTGRTWKGTAFGGWKSVESVPKLVSEYMSKKIKVDEFVTHNLSFDEINKAFELMHSGKSIRTVVKI
dict_keys(['curvature', 'torsion', 'arc_length', 'writhing', 'phi', 'psi'])


#### It is easy to access and work with the alignment data

In [6]:
# Print "index - residue - curvature" for the first six positions
# (indices 0 to 5) of the record's sequence.
for i, residue in enumerate(record.seq[:6]):
    print(f"{i} - {residue} - {record.letter_annotations['curvature'][i]}")

0 - - - 0.0
1 - - - 0.0
2 - A - 0.6444971866934631
3 - N - 0.6444971866934631
4 - E - 0.5355223616499662
5 - V - 0.9706126186525947


### Melodia has a function to convert the BioPython alignment to a Pandas Dataframe
#### Without a list of annotation keys, it uses all keys

In [7]:
# Convert the alignment to a DataFrame; no `keys` argument is given here.
mel.dataframe_from_alignment(align=align)

Unnamed: 0,seq_1cdoa,curvature_1cdoa,torsion_1cdoa,arc_length_1cdoa,writhing_1cdoa,phi_1cdoa,psi_1cdoa,seq_1d1ta,curvature_1d1ta,torsion_1d1ta,...,writhing_2ohxa,phi_2ohxa,psi_2ohxa,seq_3huda,curvature_3huda,torsion_3huda,arc_length_3huda,writhing_3huda,phi_3huda,psi_3huda
0,A,0.911238,0.110831,8.246699,0.084959,0.000000,-10.148714,G,1.047306,0.110002,...,0.084033,0.000000,164.623349,S,0.794220,0.107699,8.312687,0.090238,0.000000,-106.741610
1,T,0.911238,0.110831,7.886044,0.084959,-116.979109,-17.230205,T,1.047306,0.110002,...,0.084033,-111.321155,-15.961734,T,0.794220,0.107699,7.870623,0.090238,-109.782193,-30.437007
2,V,0.434979,0.044909,8.043852,0.084959,-55.693954,132.988680,A,0.454057,0.032614,...,0.084033,-52.657511,128.383430,A,0.463674,0.074985,7.973132,0.090238,-40.676160,125.848618
3,G,0.822893,0.062217,8.030083,0.092038,78.090579,4.118510,G,0.801947,0.048148,...,0.083461,88.115088,-16.629985,G,0.572861,0.099983,8.011114,0.120894,113.767325,-43.875177
4,K,0.390131,-0.000838,8.154757,0.010612,-123.849409,155.744956,K,0.504131,-0.012812,...,0.006618,-110.955311,167.680235,K,0.347366,0.004505,8.178899,0.010359,-103.640973,177.954331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,T,0.061095,-0.749861,7.801091,-0.019713,-107.281588,129.499684,T,0.034498,-0.396657,...,-0.019491,-103.494266,131.348977,T,0.054648,-0.794151,7.924285,-0.023005,-99.772840,127.501598
371,V,0.023665,2.478917,7.834701,-0.021218,-116.081018,123.375950,V,0.044883,1.333120,...,-0.025700,-102.055498,134.368430,V,0.034529,1.895340,7.871723,-0.022803,-109.796765,135.339999
372,L,0.120265,0.082923,7.817057,-0.019700,-90.226598,143.795825,L,0.149571,0.055948,...,-0.024220,-105.139660,133.371850,L,0.144089,0.042052,7.732766,-0.023656,-99.507560,126.540662
373,S,0.176591,0.207346,7.868773,-0.019700,-100.424779,0.000000,T,0.098639,-0.130310,...,-0.024220,-109.600320,0.000000,T,0.139156,-0.010417,7.734215,-0.023656,-109.028007,0.000000


#### It is possible to choose the keys for the Dataframe

In [8]:
# Same conversion, but requesting only the 'curvature' and 'torsion' keys.
df = mel.dataframe_from_alignment(align=align, keys=['curvature', 'torsion'])

In [9]:
# Display the first rows of the DataFrame.
df.head()

Unnamed: 0,seq_1cdoa,curvature_1cdoa,torsion_1cdoa,seq_1d1ta,curvature_1d1ta,torsion_1d1ta,seq_1teha,curvature_1teha,torsion_1teha,seq_2ohxa,curvature_2ohxa,torsion_2ohxa,seq_3huda,curvature_3huda,torsion_3huda
0,A,0.911238,0.110831,G,1.047306,0.110002,-,0.0,0.0,S,0.858952,0.103759,S,0.79422,0.107699
1,T,0.911238,0.110831,T,1.047306,0.110002,-,0.0,0.0,T,0.858952,0.103759,T,0.79422,0.107699
2,V,0.434979,0.044909,A,0.454057,0.032614,A,0.644497,0.121356,A,0.505272,0.043087,A,0.463674,0.074985
3,G,0.822893,0.062217,G,0.801947,0.048148,N,0.644497,0.121356,G,0.697603,0.059488,G,0.572861,0.099983
4,K,0.390131,-0.000838,K,0.504131,-0.012812,E,0.535522,-0.003564,K,0.366238,-0.007581,K,0.347366,0.004505


### Pandas Dataframe can be stored using the Parquet file format
#### https://databricks.com/glossary/what-is-parquet

In [10]:
# Write the DataFrame to 'df.parquet.gzip' using gzip compression.
df.to_parquet('df.parquet.gzip', compression='gzip')

In [11]:
# Read the Parquet file written above back into a DataFrame.
pd.read_parquet('df.parquet.gzip')

Unnamed: 0,seq_1cdoa,curvature_1cdoa,torsion_1cdoa,seq_1d1ta,curvature_1d1ta,torsion_1d1ta,seq_1teha,curvature_1teha,torsion_1teha,seq_2ohxa,curvature_2ohxa,torsion_2ohxa,seq_3huda,curvature_3huda,torsion_3huda
0,A,0.911238,0.110831,G,1.047306,0.110002,-,0.000000,0.000000,S,0.858952,0.103759,S,0.794220,0.107699
1,T,0.911238,0.110831,T,1.047306,0.110002,-,0.000000,0.000000,T,0.858952,0.103759,T,0.794220,0.107699
2,V,0.434979,0.044909,A,0.454057,0.032614,A,0.644497,0.121356,A,0.505272,0.043087,A,0.463674,0.074985
3,G,0.822893,0.062217,G,0.801947,0.048148,N,0.644497,0.121356,G,0.697603,0.059488,G,0.572861,0.099983
4,K,0.390131,-0.000838,K,0.504131,-0.012812,E,0.535522,-0.003564,K,0.366238,-0.007581,K,0.347366,0.004505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,T,0.061095,-0.749861,T,0.034498,-0.396657,T,0.064315,-0.835171,T,0.045135,-0.640672,T,0.054648,-0.794151
371,V,0.023665,2.478917,V,0.044883,1.333120,V,0.038668,1.412671,I,0.027378,2.689518,V,0.034529,1.895340
372,L,0.120265,0.082923,L,0.149571,0.055948,V,0.084575,0.152802,L,0.150112,0.031978,L,0.144089,0.042052
373,S,0.176591,0.207346,T,0.098639,-0.130310,K,0.248368,0.170128,T,0.125465,-0.016873,T,0.139156,-0.010417
