# 3Dn example script

In this example, we will demonstrate how to align two proteins using the blurry neighborhood-based 3Dn method.

## Imports

In [1]:
import numpy as np
import os
import csv
import jax
from Bio import BiopythonDeprecationWarning
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq3
from Bio.PDB.Polypeptide import *

from utils_example import *

## Preliminaries

Let's first read the relevant files.

We assume that the structure of each protein is available as a PDB file, which we read in below.

This notebook uses two example PDB files; feel free to substitute your own by changing the file paths below!

In [2]:
# Paths of the two PDB structures to align (point these at your own files if desired).
file_prot1, file_prot2 = 'refData/d1dlwa_', 'refData/d2gkma_'

# Trained transition matrix: used later to turn nHot representations
# into blurry neighborhoods.
tMtx = np.load('refData/transition_mtx.npy')

# Trained cluster centers for the blurry neighborhoods.
cluster_centers = np.load('refData/MI_centers.npy')

# BLOSUM matrices for the 3Dn and the 3Di alphabets, respectively.
blosum_3Dn = np.load('refData/graph_clusters_blosum.npy')
blosum_3Di = np.load('refData/3Di_blosum.npy')

## n-Hot and blurry neighborhood representations

In [3]:
# Full list of the bins used in our discretization (1000 in total; see the paper
# for more details).
allBins = generateBinList()

# For each protein, characterize the positions and secondary structures of the
# neighbors.
prot1_neighbors = allBinsNeighborsPerProt(file_prot1)
prot2_neighbors = allBinsNeighborsPerProt(file_prot2)

# Protein lengths, taken as the number of entries in each neighbor collection.
len_prot1, len_prot2 = len(prot1_neighbors), len(prot2_neighbors)

# Alpha-carbon coordinates of each protein: the first element returned by
# getAllPositions, converted to a NumPy array.
CA_positions_prot1 = np.array(getAllPositions(file_prot1)[0])
CA_positions_prot2 = np.array(getAllPositions(file_prot2)[0])

# nHot representation built from the neighbor information and the bin list.
prot1_nHot = getOneHot(prot1_neighbors, allBins)
prot2_nHot = getOneHot(prot2_neighbors, allBins)

# Blurry neighborhoods: matrix product of the nHot representation with the
# transition matrix; the last column of the product is dropped.
# NOTE(review): the reason for discarding the final column is not visible here --
# confirm against the paper / utils_example.
prot1_bn = np.einsum('ik,kj->ij', prot1_nHot, tMtx)[:, :-1]
prot2_bn = np.einsum('ik,kj->ij', prot2_nHot, tMtx)[:, :-1]

### 3Dn representation (clustering blurry neighborhoods)

In [4]:
# Assign each blurry neighborhood to one of the pre-trained cluster centers,
# giving the clustered sequence of each protein.
prot1_seq_bn = seqFromCentroids(prot1_bn, cluster_centers)
prot2_seq_bn = seqFromCentroids(prot2_bn, cluster_centers)

# One-hot version of the clustered sequences -- this is the "3Dn sequence".
prot1_clustered_oneHot = getOneHot_Clustered(prot1_seq_bn)
prot2_clustered_oneHot = getOneHot_Clustered(prot2_seq_bn)

## Generate alignment

In [5]:
# Similarity matrix between the two proteins, computed from their 3Dn sequences
# and the trained 3Dn BLOSUM; the local alignment algorithm needs it as input.
simMtx_3dn = sim_mtx(prot1_clustered_oneHot, prot2_clustered_oneHot, blosum_3Dn)

# Smith-Waterman local sequence alignment (non-batched), compiled with jax.jit.
affine_sw_func = jax.jit(sw_affine(batch=False))

# Align using the 3Dn BLOSUM scores.
# The similarity matrix is shifted and then scaled before alignment; with
# mult=1 and shift=0 it is passed through unchanged.
# Feel free to adjust the open and gap penalties to the values you prefer --
# we recommend a parameter search to find optimal weights for your specific case.
mult, shift = 1, 0
algt_3Dn = affine_sw_func(
    mult * (simMtx_3dn - shift),
    (len_prot1, len_prot2),
    gap=-0.5,
    open=-10,
    temp=1e-7,
)

## Evaluate alignment

In [6]:
# Assess alignment quality with the lDDT score, computed from the alpha-carbon
# coordinates of both proteins, the 3Dn alignment, and the length of protein 1.
lddt = lddt2(
    CA_positions_prot1,
    CA_positions_prot2,
    algt_3Dn,
    len_prot1,
)
print(f'The lDDT using the 3Dn alphabet alignment of your provided proteins: {lddt:.3f}')

The lDDT using the 3Dn alphabet alignment of your provided proteins: 0.887
