In [1]:
import os
import itertools
import numpy as np

In [2]:
# Relative path of the directory that holds the input alignment and the cached .npy arrays
datadir = "../data"

In [3]:
# Path of the DHFR alignment file inside datadir
msa_file = os.path.join(datadir, "DHFR.aln")

In [4]:
# Load the alignment into a 2D array of single-byte characters (dtype S1):
# every line of the file becomes one row, every character of the stripped
# line becomes one column.
with open(msa_file) as fh:
    arr = np.array([list(line.strip()) for line in fh], dtype=np.dtype("S1"))

print("shape =", arr.shape, ",dtype= ", arr.dtype)

shape = (56165, 186) ,dtype=  |S1


In [5]:
# Dimensions of the alignment array:
# M is the number of sequences (rows of arr)
# L is the length of each aligned sequence (columns of arr)
M, L = arr.shape

In [6]:
# the first sequence, joined back into a single bytes string
# (tobytes() replaces the deprecated ndarray.tostring() alias)
arr[0, :].tobytes()

b'VRPLNCIVAVSQNMGIGKNGDLPWPPLRNEFKYFQRMTTTSSVEGKQNLVIMGRKTWFSIPEKNRPLKDRINIVLSRELKEPPRGAHFLAKSLDDALRLIEQPELASKVDMVWIVGGSSVYQEAMNQPGHLRLFVTRIMQEFESDTFFPEIDLGKYKLLPEYPGVLSEVQEEKGIKYKFEVYEKKD'

In [7]:
# the second sequence, joined back into a single bytes string
# (tobytes() replaces the deprecated ndarray.tostring() alias)
arr[1, :].tobytes()

b'----SIVVVMCKRFGIGRNGVLPWSPLQADMQRFRSITAG-------GGVIMGRTTFDSIPEEHRPLQGRLNVVLTTSADLMKNSNIIFVSSFDELDAIVGL----HDHLPWHVIGGVSVYQHFLEKSQVTSMYVTFVDGSLECDTFFPHQFLSHFEITRA---SALMSDTTSGMSYRFVDYTR--'

In [8]:
# The order of the alphabet is arbitrary. This one is sorted by some amino
# acid properties (see link), with "-" as the final, 21st symbol.
# https://proteinstructures.com/Structure/Structure/amino-acids.html
AMINO_ACIDS = np.array(list("RKDEQNHSTCYWAILMFVPG-"), dtype="S1")

### Compute the weights of each sequence

In [9]:
# Progress output is refreshed in place only when IPython's clear_output
# can be imported; otherwise the progress lines are simply appended.
try:
    from IPython.display import clear_output
except ImportError:
    progress_bar = False
else:
    progress_bar = True


In [10]:
# Similarity threshold: this is x in equation 27 of the 2013 Cocco et al. paper
hamming_cutoff = 0.2

weights_file = os.path.join(datadir, "DHFR.weights.npy")

if not os.path.isfile(weights_file):
    # No cached weights yet: compute them and store them for the next run
    weights = np.zeros(M)

    for i in range(M):
        # Number of mismatching columns between sequence i and every sequence
        # of the alignment (sequence i itself included, at distance 0)
        hamming = np.sum(arr[i, :] != arr, axis=1)
        # The weight is the inverse of the number of sequences closer than the cutoff
        n_neighbours = np.sum(hamming < hamming_cutoff * L)
        weights[i] = 1. / n_neighbours
        if i % 100 == 0:
            if progress_bar:
                clear_output(wait=True)
            print("Processing sequence", i, "of", M)
    np.save(weights_file, weights)
    print("Finished computing sequence weights and saved to : ", weights_file)

else:
    weights = np.load(weights_file)
    print("Loading weights from : ", weights_file)


Loading weights from :  ../data/DHFR.weights.npy


In [11]:
# Effective number of sequences: the sum of all sequence weights
# (equation 28 in the 2013 Cocco et al. paper).
# np.sum reduces the array in compiled code instead of iterating over it
# element by element in Python as the builtin sum() does.
M_eff = np.sum(weights)
print(int(round(M_eff)))

15238


In [12]:
# q is the number of symbols a column can take. It is derived from the
# alphabet (21 entries) so that it can never disagree with AMINO_ACIDS,
# which is indexed with range(q) when the marginals are computed.
q = len(AMINO_ACIDS)
# The pseudo-count is set to the (rounded) effective number of sequences
pseudo_count = round(M_eff)

### Compute Single and Double site marginals

In [13]:
single_site_marginal_file = os.path.join(datadir, "DHFR.single.npy")
double_site_marginal_file = os.path.join(datadir, "DHFR.double.npy")

# NOTE: cache files written by an earlier version of this cell used the flat
# index i + q*a, which maps different (i, a) pairs onto the same row. Delete
# those files once so that the marginals are recomputed with the layout below.
if os.path.isfile(double_site_marginal_file) and os.path.isfile(single_site_marginal_file):
    f_i_a = np.load(single_site_marginal_file)
    print("Loading single site marginals from ", single_site_marginal_file)

    f_i_j_a_b = np.load(double_site_marginal_file)
    print("Loading double site marginals from ", double_site_marginal_file)

else:
    # single site marginals. Eqn 29 in 2013 Cocco et al. paper
    # f_i_a[i, a] is the reweighted frequency (with pseudo-count) of amino acid a in column i
    f_i_a = np.zeros((L, q))
    # double site marginals, stored as a (L*q, L*q) matrix in which the pair
    # (column i, amino acid a) has the flat index i*q + a. This index is unique
    # for every pair and follows the same order as f_i_a.reshape(-1).
    f_i_j_a_b = np.zeros((L*q, L*q))

    normalizer = 1. / (M_eff + pseudo_count)
    additive_factor_single = pseudo_count / q
    additive_factor_double = pseudo_count / (q*q)

    # Indicator matrix: one_hot[m, i*q + a] is 1.0 if sequence m has
    # amino acid a in column i and 0.0 otherwise
    one_hot = np.zeros((M, L*q))
    for i, a in itertools.product(range(L), range(q)):
        one_hot[:, i*q + a] = (arr[:, i] == AMINO_ACIDS[a])

    for i, a in itertools.product(range(L), range(q)):
        row = i*q + a
        # weights[m] for the sequences that carry amino acid a in column i, 0 elsewhere
        weighted_delta_i_a = weights * one_hot[:, row]
        f_i_a[i, a] = normalizer * (additive_factor_single +
                        np.sum(weighted_delta_i_a))
        # A single vector-matrix product yields, for every (j, b) at once,
        # sum_m weights[m] * delta_i_a[m] * delta_j_b[m]
        f_i_j_a_b[row, :] = normalizer * (additive_factor_double +
                        np.dot(weighted_delta_i_a, one_hot))
        if progress_bar:
            clear_output(wait=True)
        print("Finished processing i={}, a={}, AA={}".format(i, a, AMINO_ACIDS[a].decode()))

    # The indicator matrix is large and no longer needed
    del one_hot
    np.save(single_site_marginal_file, f_i_a)
    np.save(double_site_marginal_file, f_i_j_a_b)
    print("Finished computing single and double site marginals and saved to cache files")

Loading single site marginals from  ../data/DHFR.single.npy
Loading double site marginals from  ../data/DHFR.double.npy
