In [56]:
import os
import sys
import copy
import numpy as np

In [85]:
# The file paths. You can change the directory if you want.
distance_matrix_path = 'distance_matrix2.txt'
labels_path = 'labels2.txt'

In [86]:
def read_labels(labels_path=''):
    # Read labels.txt to dictionary.
    labels_list = []
    with open(labels_path, 'r') as f_label:
        labels = f_label.readline()[1:-2]
        labels_list = [label.replace('"', '') for label in labels.split(', ')]
    return np.array(labels_list)

def read_distance_matrix(distance_matrix_path=''):
   # Read distance_matrix.txt to dictionary.
    distance_matrix = np.zeros((len(labels_list), len(labels_list)))
    with open(distance_matrix_path, 'r') as f_dist:
        for i, line in enumerate(f_dist):
            distances_i = np.array([int(dist) for dist in line[1:-2].split(', ')])
            distances_i = np.hstack([distances_i, np.zeros(len(labels_list) - distances_i.shape[0])])
            distance_matrix[i, :] = distances_i
    return distance_matrix 

In [87]:
labels = read_labels(labels_path=labels_path)
distance_matrix = read_distance_matrix(distance_matrix_path=distance_matrix_path)
print(labels)
distance_matrix

['A' 'B' 'C' 'D' 'E' 'F' 'G']


array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [19.,  0.,  0.,  0.,  0.,  0.,  0.],
       [27., 31.,  0.,  0.,  0.,  0.,  0.],
       [ 8., 18., 26.,  0.,  0.,  0.,  0.],
       [33., 36., 41., 31.,  0.,  0.,  0.],
       [18.,  1., 32., 17., 35.,  0.,  0.],
       [13., 13., 29., 14., 28., 12.,  0.]])

In [94]:
def find_pair_with_minimum_distance(distance_matrix):
    upper_triu =  np.triu_indices(distance_matrix.shape[0])
    distance_matrix_distorted = distance_matrix.copy()
    distance_matrix_distorted[upper_triu] = np.inf
    return np.unravel_index(np.argmin(distance_matrix_distorted, axis=None), distance_matrix_distorted.shape)

def construct_new_distance_matrix(distance_cluster_array, distance_matrix, first_species, second_species):
    distance_matrix = np.delete(distance_matrix, second_species, 0)
    distance_matrix = np.delete(distance_matrix, second_species, 1)
    distance_matrix[first_species, :] = distance_cluster_array
    distance_matrix[:, first_species] = distance_cluster_array
    upper_triu =  np.triu_indices(distance_matrix.shape[0])
    distance_matrix[upper_triu] = 0
    return distance_matrix

def update_labels_of_clustered_species(first_cluster, second_cluster):
    new_cluster_label = "({},{})".format(first_cluster, second_cluster)
    print(new_cluster_label)
    return new_cluster_label

def update_distances_after_joining(selected_pair, distance_matrix, labels):
    new_distance_matrix = np.zeros((distance_matrix.shape[0]-1, distance_matrix.shape[1]-1))
    first_species, second_species = np.min(selected_pair), np.max(selected_pair)
    
    distance_of_new_cluster = np.mean([np.sum([distance_matrix[first_species, :], distance_matrix[:, first_species]], axis=0),\
                                      np.sum([distance_matrix[second_species, :], distance_matrix[:, second_species]], axis=0)], axis=0)
    distance_cluster_array = np.concatenate([distance_of_new_cluster[0:first_species], [0], distance_of_new_cluster[first_species+1:second_species],\
                                             distance_of_new_cluster[second_species+1:]])
    
    new_cluster_label = update_labels_of_clustered_species(labels[first_species], labels[second_species])
    
    labels = np.concatenate([labels[0:first_species], [new_cluster_label], labels[first_species+1:second_species], labels[second_species+1:]])
    
    distance_matrix = construct_new_distance_matrix(distance_cluster_array, distance_matrix, first_species, second_species)
    return distance_matrix, labels

def start_upgma_algorithm(distance_matrix, labels):
    while labels.shape[0] > 1:
        selected_pair = find_pair_with_minimum_distance(distance_matrix)
        print("selected_pair is ", selected_pair)
        distance_matrix, labels = update_distances_after_joining(selected_pair, distance_matrix, labels)
        print("New labels and distance matrix")
        print(labels)
        print(distance_matrix)
        print("------------------------------------------------------------")

In [95]:
start_upgma_algorithm(distance_matrix, labels)

selected_pair is  (5, 1)
(B,F)
New labels and distance matrix
['A' '(B,F)' 'C' 'D' 'E' 'G']
[[ 0.   0.   0.   0.   0.   0. ]
 [18.5  0.   0.   0.   0.   0. ]
 [27.  31.5  0.   0.   0.   0. ]
 [ 8.  17.5 26.   0.   0.   0. ]
 [33.  35.5 41.  31.   0.   0. ]
 [13.  12.5 29.  14.  28.   0. ]]
------------------------------------------------------------
selected_pair is  (3, 0)
(A,D)
New labels and distance matrix
['(A,D)' '(B,F)' 'C' 'E' 'G']
[[ 0.   0.   0.   0.   0. ]
 [18.   0.   0.   0.   0. ]
 [26.5 31.5  0.   0.   0. ]
 [32.  35.5 41.   0.   0. ]
 [13.5 12.5 29.  28.   0. ]]
------------------------------------------------------------
selected_pair is  (4, 1)
((B,F),G)
New labels and distance matrix
['(A,D)' '((B,F),G)' 'C' 'E']
[[ 0.    0.    0.    0.  ]
 [15.75  0.    0.    0.  ]
 [26.5  30.25  0.    0.  ]
 [32.   31.75 41.    0.  ]]
------------------------------------------------------------
selected_pair is  (1, 0)
((A,D),((B,F),G))
New labels and distance matrix
['((A,D),((B,F