# In [27]:
# Testing / retracing computations from Karlin & Altschul 1993
# 10.1073/pnas.90.12.5873


import os as os
import pandas as pd
import mpmath as mpm
import numpy as np


def build_matrix(fpath):
    """Build a symmetric score matrix from a one-token-per-line listing.

    The file is read line by line. A line that parses as an integer is a
    score appended to the current row; a purely alphabetic line is a label
    that starts a new row. Lines that are neither (e.g. blank lines) are
    skipped. The collected rows are treated as one triangle of the matrix
    (diagonal included); cells missing from a row count as 0 and the
    triangle is mirrored across the diagonal.

    Parameters
    ----------
    fpath : str
        Path of the text file to parse.

    Returns
    -------
    pandas.DataFrame
        Square ``int16`` matrix whose index and columns are the labels in
        the order they appear in the file.

    Raises
    ------
    ValueError
        If a score precedes the first label, a label has no scores, the
        file contains no label at all, or the collected rows do not form a
        square matrix.
    """
    row_names = []
    rows = []
    with open(fpath, 'r') as listing:
        for line in listing:
            content = line.strip()
            try:
                score = int(content)
            except ValueError:
                # Not a score: only alphabetic tokens open a new row,
                # anything else is ignored.
                if content.isalpha():
                    row_names.append(content)
                    rows.append([])
                continue
            if not rows:
                raise ValueError(
                    'Score {} precedes the first row label in {}'.format(score, fpath))
            rows[-1].append(score)

    if not rows:
        raise ValueError('No row labels found in {}'.format(fpath))
    unfilled = [name for name, row in zip(row_names, rows) if not row]
    if unfilled:
        raise ValueError(
            'Row label(s) without scores in {}: {}'.format(fpath, ', '.join(unfilled)))

    # pandas pads ragged rows with NaN; those cells belong to the opposite
    # triangle and are zeroed so that the mirror step can fill them.
    triangle = pd.DataFrame(rows, index=row_names).fillna(0).values.astype(float)
    if triangle.shape[0] != triangle.shape[1]:
        raise ValueError(
            'Expected a square matrix from {}, got shape {}'.format(fpath, triangle.shape))

    # Adding the transpose mirrors the triangle but counts the diagonal
    # twice, so it is subtracted once.
    full = triangle + triangle.T - np.diag(np.diag(triangle))
    return pd.DataFrame(full, index=row_names, columns=row_names).astype(np.int16)


def load_matrix(fpath):
    """Load a tab-separated substitution matrix and strip non-standard symbols.

    The table is read with pandas and indexed by its ``Residue`` column.
    The ``Residue`` column itself and every row/column labelled ``*``,
    ``B``, ``Z`` or ``X`` are removed when present; the remaining values
    are cast to ``int16``. The index name is set to the empty string.

    Parameters
    ----------
    fpath : str
        Path of the tab-separated file; it must contain a ``Residue`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered matrix with ``int16`` values.
    """
    excluded = ['Residue', '*', 'B', 'Z', 'X']
    table = pd.read_csv(fpath, sep='\t')
    # Keep the column for now; it is removed together with the other
    # excluded labels below.
    table = table.set_index('Residue', drop=False)
    table.index.name = ''
    # Labels that do not occur are skipped silently.
    table = table.drop(excluded, axis=0, errors='ignore')
    table = table.drop(excluded, axis=1, errors='ignore')
    return table.astype(np.int16)

# Directory that receives the cleaned matrix files
outdir = '/home/pebert/work/code/mpggit/statediff/annotation/misc'

# Raw PAM120 matrix taken from the R Biostrings package
raw_biostrings = '/home/pebert/work/code/mpggit/statediff/annotation/misc/raw_biostrings_pam120.tsv'

# Raw Quretec PAM120 matrix downloaded from
# http://www.quretec.com/u/vilo/edu/2002-03/Tekstialgoritmid_I/Loengud/Loeng3_Edit_Distance/bcorum_copy/seq_align5.htm
raw_quretec = '/home/pebert/work/code/mpggit/statediff/annotation/misc/raw_quretec_pam120.tsv'

bio_pam120 = load_matrix(raw_biostrings)
qur_pam120 = build_matrix(raw_quretec)

# The two matrices are not identical.
# Since the original source is unknown, it is hard
# to say which one is "historically" correct.
print(
    'PAM120 matrices identical'
    if (bio_pam120 == qur_pam120).values.all()
    else 'PAM120 matrices NOT identical'
)

# Values taken from the publication mentioned above
# (examples from Table 3); they are passed to norm_score
# as its l, k and n arguments.
lam = 0.314
k = 0.17
n = 34336

def norm_score(s, l, k, n):
    """Return ``l * s - ln(k * n)`` rounded to one decimal place.

    Parameters
    ----------
    s : raw score
    l : multiplier applied to the raw score
    k, n : factors whose product enters the logarithmic correction term
    """
    scaled = l * s
    correction = np.log(k * n)
    return np.round(scaled - correction, 1)

# NOTE(review): DataFrame.lookup (used below) was deprecated in pandas 1.2 and
# removed in 2.0, and to_csv's `line_terminator` was renamed `lineterminator`
# in pandas 1.5 -- this code needs an older pandas; confirm the target version.

# Pair 1
# Both sequences share the positional index 0..26, so lookup() pairs the
# residues position by position (row label from s1, column label from s2).
s1 = pd.Series(list('VYLPQMKIEEKYNLTSVLMALGMTDLF'), index=range(27))
s2 = pd.Series(list('LYLPKFELEDDVDLKDALIHMGCNDLF'), index=range(27))

# Raw score = sum of the matrix entries over all aligned residue pairs
raw_bio_p1 = bio_pam120.lookup(s1, s2).sum()
raw_qur_p1 = qur_pam120.lookup(s1, s2).sum()

assert raw_bio_p1 == raw_qur_p1 == 52, 'Wrong score for pair 1: {} / {}'.format(raw_bio_p1, raw_qur_p1)

norm_bio_p1 = norm_score(raw_bio_p1, lam, k, n)
norm_qur_p1 = norm_score(raw_qur_p1, lam, k, n)

assert np.allclose([norm_bio_p1, norm_qur_p1], [7.6, 7.6], atol=0.1),\
    'Normalized scores off: {} / {}'.format(norm_bio_p1, norm_qur_p1)

# Pair 2
s3 = pd.Series(list('SANLTGISSAESLKISQAVHGAFMELSEDGIEMAGST'), index=range(37))
s4 = pd.Series(list('SGELVGISDTKTLRIGNIRQKSVIKVDEYGTEAASVT'), index=range(37))

raw_bio_p2 = bio_pam120.lookup(s3, s4).sum()
raw_qur_p2 = qur_pam120.lookup(s3, s4).sum()

assert raw_bio_p2 == raw_qur_p2 == 49, 'Wrong score for pair 2: {} / {}'.format(raw_bio_p2, raw_qur_p2)

norm_bio_p2 = norm_score(raw_bio_p2, lam, k, n)
norm_qur_p2 = norm_score(raw_qur_p2, lam, k, n)

# The failure message reports the pair-2 values (it previously printed the
# pair-1 values, which hid the numbers that actually failed).
assert np.allclose([norm_bio_p2, norm_qur_p2], [6.7, 6.7], atol=0.1),\
    'Normalized scores off: {} / {}'.format(norm_bio_p2, norm_qur_p2)

# dump cleaned matrices to new files: same base name as the raw input,
# with the 'raw_' prefix replaced by 'norm_', written into outdir
norm_biostrings = os.path.join(outdir, os.path.basename(raw_biostrings).replace('raw_', 'norm_'))
with open(norm_biostrings, 'w') as dump:
    bio_pam120.to_csv(dump, header=True, index=True, index_label='IUPAC',
                      sep='\t', line_terminator='\n')

norm_quretec = os.path.join(outdir, os.path.basename(raw_quretec).replace('raw_', 'norm_'))
with open(norm_quretec, 'w') as dump:
    qur_pam120.to_csv(dump, header=True, index=True, index_label='IUPAC',
                      sep='\t', line_terminator='\n')

# Output: PAM120 matrices NOT identical
