In [2]:
import MDAnalysis as mda
from MDAnalysis.analysis import align
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob as glob
import pickle 
import os

%matplotlib inline

## Create the atom_selection strings for each crystal structure's 'topologically equivalent' residues, with reference to the multiple sequence alignment
- We will then append each selection string to the corresponding entry in the crystal structure dictionary. This will streamline our work, for example when computing PCA.
- This requires a clustalOmega MSA input.
- !! clustal omega outputs a file that has a line structure as: 'pdbid \n sequence'.
    - Therefore, we include in the script a way to take the pdbid and the corresponding msa. This way we don't need to manually manipulate the files.

## We can visualize what the MSA looks like after aligning the sequences of the 73 crystal structures of A2A:
![](pics/msa_topologicallyEquivResidues_a2a.png)

In [5]:
## (1) Start by identifying the directory that houses the crystal structures.

# directory with PDB structures
directory = './data/a2a_pdbs/'

## creating list to hold universes of all crystal structures
crystal_unis = []
for filename in os.listdir(directory):
    crystal_unis.append(mda.Universe(directory + str(filename)))  

## (2) Use the MSA output to create the appropriate selection strings for all crystals

# open the clustal file
clustal_output_directory = './data/a2a_clustalMSA_output_2024-06-10.txt'
clustal_output = open(clustal_output_directory)

alignment_list = [] # append .txt lines to a list
pdbids_list    = [] # append matching pdb ids to list

# identify number of lines by looping
with open(clustal_output_directory, 'r') as file:
    total_lines = sum(1 for _ in file)
    
# now loop through again and store the pdbs and their sequences alignment strings.
for linenum,line in enumerate(clustal_output, start=0):
    if linenum != total_lines and linenum % 2 != 0: # odd lines are sequences
        alignment_list.append(line[:-1])            # append sequence
    elif linenum == total_lines:
        alignment_list.append(line)                 # for the final line, include last element
    elif linenum % 2 == 0:                          # append pdb id for even lines
        pdbids_list.append(line[1:5])               # exclude '>' character and line break
clustal_output.close()                              # close the file


## (3) convert the list to a binary matrix

# create an empty array to hold the lines (n_rows x n_cols)
alignment_array = np.zeros((len(alignment_list), len(alignment_list[0])))

for i in range(len(alignment_list)):                  # number of constructs
    for j in range(len(alignment_list[0])):           # max number of residues present in the file
        if alignment_list[i][j] != "-":               # if a residue is present
            alignment_array[i][j] = 1                 # convert it into a '1' digit
            
## here we create the binary array for all constructs, and where we append the corresponding resid for each structure if all 
## structures at this position have a residue present (i.e. topologically equivalent residues)
# list to hold the aligned resids for each crystal.
listofresidues = [[] for _ in range(len(alignment_list))] # to contain all of the construct IDs

# looping over each construct we determine which constructs share a residue in the position
# then we append the resid unique to the current crystal to the listofresidues for each crystal
for j in range(len(alignment_list)):                      # loop over constructs
    
    print('Working on construct', j+1,'of {}.'.format(len(alignment_list)), end='\r')
    
    # reset the counter for each crystal
    counter = -1                                          # counter; first hit of '1' gives resindex 0
    for i in range(len(alignment_list[0])):               # total no. of cols from MSA output
        if alignment_array[j][i] == 1:                    # if element has a residue
            counter += 1                                  # +1 when hit a residue (resindex, not resid)
            if np.all(alignment_array.T[i] == 1):         # iff col has all '1's (i.e. all crystals have a res here)
                listofresidues[j].append(crystal_unis[j].residues[counter].resid) # append j-th crystal resid based on the resindex (!! NOT resnum !!)
            else:
                continue
        else:
            continue

## (4) create unique atom_selections for each of the crystals
selectionstring_holder           = [] # holder for selection strings of each uni
crystals_with_selections_calphas = [] # pdb_unis with calpha selections -- to verify lengths
crystals_with_selections_full    = [] # pdb_unis with full selections -- for visualization

for k in range(len(listofresidues)): # loop over crystal structures
    
    selection_string = ''
    for i in range(len(listofresidues[k])):                             # num of common resids across structures
        
        selection_string += 'resid {} or '.format(listofresidues[k][i]) # k-th crystal, i-th resindex
    # we append the selection string & the filename, to use as an index
    selectionstring_holder.append(selection_string)
    # save the calpha representations of alignment
    crystals_with_selections_calphas.append(crystal_unis[k].select_atoms('{}'.format('protein and backbone and name CA and segindex 0 and not altLoc B and (' + selectionstring_holder[k][:-4] + ')')
                                                                 )      # exclude altlocB since some residues will double count!
                                        )                               # final selection string -- omit " or " -- AND don't forget parenthese around "resid or..."
    # save the all-atom representation of alignment
    crystals_with_selections_full.append(crystal_unis[k].select_atoms('{}'.format('protein and segindex 0 and not altLoc B and (' + selectionstring_holder[k][:-4] + ')')
                                                                 )      # exclude altlocB since some residues will double count!
                                        )
    
print('Note that the order of structures matches between the pdbids_list and the crystal directory:')

# check that the order is preserved from crystal unis directory and the msa sequence processing.
gg = []
for i in os.listdir(directory):
    gg.append(i[:4])
assert gg == pdbids_list



Working on construct 1 of 73.

IndexError: index 327 is out of bounds for axis 0 with size 327