## Homework Assignment #07 UPGMA/WPGMA

Implement UPGMA and WPGMA algorithms

In [1]:
import numpy as np

def pgma(d_matrix, leaves_names, use_weight, debug=False):
    """
    a (numpy.ndarray): distance matrix
    use_weight (bool): whether to use weight coefficients
    debug (bool): flag to pring intermediate info.
    
    return (str): tree in Newick format.
    """
    def display_fn(debug_flag):
        def foo(text):
            if debug_flag:
                print(text)
        return foo
    print_debug = display_fn(debug)
    
    n = len(leaves_names)
    newick = {k: {'val': None} for k in leaves_names}
    a = np.copy(d_matrix)
    a_new = np.copy(a)
    leaves = leaves_names[:]
    
    for i_n in range(len(leaves)-1):
        print_debug(f"\nSTEP {i_n+1}")
        
        # Create new matrix (unefficient but more clear)
        a = a_new
        # Use inf as super high scores as we will pick minimum value
        a_new = np.ones((n-1-i_n, n-i_n)) * np.inf
        
        # Find most close leaves
        i_min, j_min = np.unravel_index(np.argmin(a, axis=None), a.shape)
        # Let i_min always be smaller than j_min
        if i_min > j_min:
            i_min, j_min = j_min, i_min
        print_debug(f"Indices of min val: {i_min} {j_min}")
        
        # Add leaves info to the newick format
        val = np.min(a) / 2
        if newick[leaves[i_min]]['val'] is None and newick[leaves[j_min]]['val'] is None:
            merged = f"({leaves[i_min]}:{val}, {leaves[j_min]}:{val})"
            new_key = leaves[i_min] + leaves[j_min]
            newick[new_key] = {'val': merged, 'len': val}
        elif newick[leaves[i_min]]['val'] is None:
            old_score = val - newick[leaves[j_min]]['len']
            merged = f"({newick[leaves[j_min]]['val']}:{old_score}, {leaves[i_min]}:{val})"
            new_key = leaves[i_min] + leaves[j_min]
            newick[new_key] = {'val': merged, 'len': val}
        elif newick[leaves[j_min]]['val'] is None:
            old_score = val - newick[leaves[i_min]]['len']
            merged = f"({newick[leaves[i_min]]['val']}:{old_score}, {leaves[j_min]}:{val})"
            new_key = leaves[i_min] + leaves[j_min]
            newick[new_key] = {'val': merged, 'len': val}
        else:
            i_score = val - newick[leaves[i_min]]['len']
            j_score = val - newick[leaves[j_min]]['len']
            merged = f"({newick[leaves[i_min]]['val']}:{i_score}, {newick[leaves[j_min]]['val']}:{j_score})"
            new_key = leaves[i_min] + leaves[j_min]
            newick[new_key] = {'val': merged, 'len': val}
        del newick[leaves[i_min]]
        del newick[leaves[j_min]]
    
        print_debug(f"Current newick: {newick}")
        
        # Determine weights
        w_i = w_j = 1
        if use_weight:
            w_i, w_j = len(leaves[i_min]), len(leaves[j_min])
        print_debug(f"weights {w_i} {w_j}")
        
        # Change leaves' names
        leaves[i_min] = leaves[i_min] + leaves[j_min]
        leaves[j_min:-1] = leaves[j_min+1:]
        leaves = leaves[:-1]
        print_debug(f"Merged leaves: {leaves}")
        
        if len(leaves) == 1:
            print_debug("End algorithm.")
            break

        # Create new reduced matrix
        a_new = np.ones((n-1-i_n, n-1-i_n)) * np.inf
        
        # Transfer old matrix values to the new one
        print_debug(f"Matrix:\n {a}")
        for i in range(n-1-i_n):
            if i in [i_min, j_min]:
                continue
            for j in range(i+1, n-i_n):
                if j in [i_min, j_min]:
                    continue
                # Shift elements backwards
                if j > i_min and i > j_min:
                    a_new[i-1, j-1] = a[i, j]
                elif j > j_min:
                    a_new[i, j-1] = a[i, j]
                else:
                    a_new[i, j] = a[i, j]
                            
        # let i be the index of the merged columns 
        
        # Calculate merged column
        for i in range(n-i_n):
            if i not in [i_min, j_min]:            
                score = (w_i * a[min(i_min, i), max(i_min, i)] + w_j * a[min(j_min, i), max(j_min, i)]) / (w_i + w_j)
                if i < j_min:
                    a_new[min(i, i_min), max(i, i_min)] = score
                else:
                    a_new[min(i-1, i_min), max(i-1, i_min)] = score
                
        print_debug(f"New matrix:\n {a_new}")
        
    return newick[list(newick.keys())[0]]['val']
    

    
def wpgma(a, leaves, debug=False):
    return pgma(a, leaves, use_weight=False, debug=debug)


def upgma(a, leaves, debug=False):
    return pgma(a, leaves, use_weight=True, debug=debug)

## Preliminary tests

### Prelim. test 1 from in-class tutorial

In [2]:
leaves = ['K', 'L', 'M', 'N']
a = np.array([
    [np.inf, 16, 16, 10],
    [np.inf, np.inf, 8, 8],
    [np.inf, np.inf, np.inf, 4],
    [np.inf, np.inf, np.inf, np.inf]
])
print("WPGMA: \n", wpgma(a, leaves, debug=True))


STEP 1
Indices of min val: 2 3
Current newick: {'K': {'val': None}, 'L': {'val': None}, 'MN': {'val': '(M:2.0, N:2.0)', 'len': 2.0}}
weights 1 1
Merged leaves: ['K', 'L', 'MN']
Matrix:
 [[inf 16. 16. 10.]
 [inf inf  8.  8.]
 [inf inf inf  4.]
 [inf inf inf inf]]
New matrix:
 [[inf 16. 13.]
 [inf inf  8.]
 [inf inf inf]]

STEP 2
Indices of min val: 1 2
Current newick: {'K': {'val': None}, 'LMN': {'val': '((M:2.0, N:2.0):2.0, L:4.0)', 'len': 4.0}}
weights 1 1
Merged leaves: ['K', 'LMN']
Matrix:
 [[inf 16. 13.]
 [inf inf  8.]
 [inf inf inf]]
New matrix:
 [[ inf 14.5]
 [ inf  inf]]

STEP 3
Indices of min val: 0 1
Current newick: {'KLMN': {'val': '(((M:2.0, N:2.0):2.0, L:4.0):3.25, K:7.25)', 'len': 7.25}}
weights 1 1
Merged leaves: ['KLMN']
End algorithm.
WPGMA: 
 (((M:2.0, N:2.0):2.0, L:4.0):3.25, K:7.25)


### Wikipedia example

In [3]:
leaves = ['a', 'b', 'c', 'd', 'e']
a = np.array([
    [np.inf, 17, 21, 31, 23],
    [np.inf, np.inf, 30, 34, 21],
    [np.inf, np.inf, np.inf, 28, 39],
    [np.inf, np.inf, np.inf, np.inf, 43]
])
print(upgma(a, leaves, debug=True))


STEP 1
Indices of min val: 0 1
Current newick: {'c': {'val': None}, 'd': {'val': None}, 'e': {'val': None}, 'ab': {'val': '(a:8.5, b:8.5)', 'len': 8.5}}
weights 1 1
Merged leaves: ['ab', 'c', 'd', 'e']
Matrix:
 [[inf 17. 21. 31. 23.]
 [inf inf 30. 34. 21.]
 [inf inf inf 28. 39.]
 [inf inf inf inf 43.]]
New matrix:
 [[ inf 25.5 32.5 22. ]
 [ inf  inf 28.  39. ]
 [ inf  inf  inf 43. ]
 [ inf  inf  inf  inf]]

STEP 2
Indices of min val: 0 3
Current newick: {'c': {'val': None}, 'd': {'val': None}, 'abe': {'val': '((a:8.5, b:8.5):2.5, e:11.0)', 'len': 11.0}}
weights 2 1
Merged leaves: ['abe', 'c', 'd']
Matrix:
 [[ inf 25.5 32.5 22. ]
 [ inf  inf 28.  39. ]
 [ inf  inf  inf 43. ]
 [ inf  inf  inf  inf]]
New matrix:
 [[inf 30. 36.]
 [inf inf 28.]
 [inf inf inf]]

STEP 3
Indices of min val: 1 2
Current newick: {'abe': {'val': '((a:8.5, b:8.5):2.5, e:11.0)', 'len': 11.0}, 'cd': {'val': '(c:14.0, d:14.0)', 'len': 14.0}}
weights 1 1
Merged leaves: ['abe', 'cd']
Matrix:
 [[inf 30. 36.]
 [inf inf 

## Test1

In [4]:
leaves = ['a', 'b', 'c', 'd']
a = np.array([
    [np.inf, 16, 16, 10],
    [np.inf, np.inf, 8, 8],
    [np.inf, np.inf, np.inf, 4],
    [np.inf, np.inf, np.inf, np.inf]
])
print('WPGMA: ', wpgma(a, leaves))
print('UPGMA: ', upgma(a, leaves))

WPGMA:  (((c:2.0, d:2.0):2.0, b:4.0):3.25, a:7.25)
UPGMA:  (((c:2.0, d:2.0):2.0, b:4.0):3.0, a:7.0)


## Test 2

In [5]:
leaves = ['a', 'b', 'c', 'd', 'e', 'f']
a = np.array([
    [np.inf, 5, 4, 7, 6, 8],
    [np.inf, np.inf, 7, 10, 9, 11],
    [np.inf, np.inf, np.inf, 7, 6, 8],
    [np.inf, np.inf, np.inf, np.inf, 5, 9],
    [np.inf, np.inf, np.inf, np.inf, np.inf, 8],
    [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf]
])
print('WPGMA: ', wpgma(a, leaves))
print('UPGMA: ', upgma(a, leaves))

WPGMA:  ((((a:2.0, c:2.0):1.0, b:3.0):1.0, (d:2.5, e:2.5):1.5):0.5, f:4.5)
UPGMA:  ((((a:2.0, c:2.0):1.0, b:3.0):0.75, (d:2.5, e:2.5):1.25):0.6500000000000004, f:4.4)
