## MIP Program for choosing preferred path for ancestor nodes.
1. Program - Gurobi Solver
2. Date - 31 Aug 2022

In [1]:
# libraries
import gurobipy as gp
from gurobipy import abs_,quicksum
from gurobipy import GRB
import time
import json
from collections import defaultdict
from ete3 import Tree

In [2]:
''' Convert the json pogs file to the data structure needed for MIP Model'''
class Pogs:
    def __init__(self,pogs_file,tree_file):
        
        self.json_pogs_file = pogs_file
        tree_file = open(nwk_file_path,"r")
        my_tree = tree_file.read() + ";"
        self.tree = Tree(my_tree, format=1)
          
    def create_neighbor_object(self):
        ''' create neighbor dict '''
        tree_neighbor_dict = defaultdict(list)
        
        for n in self.tree.traverse():
            if n.is_leaf() == False:    
                for c in n.children:
                    tree_neighbor_dict[n.name] += [c.name]
        return tree_neighbor_dict
    
    def create_node_info_dict(self):
        ''' create edge dictionary and other node information'''
        
        '''Example output-- {\
                  1:{0:[1,2],1:[2,3,4],2:[3,4],3:[4],4:[5]} ,\
                  2:{0:[1],1:[2],2:[3],3:[4],4:[5]} \
                  } '''
        
        
        # read the json file
        with open(self.json_pogs_file, 'r') as j:
            pog_all_data = json.loads(j.read())

            node_path_dict = {}
            node_path_reverse_dict = {}
            extant_list = []
            
            # read all ancestors
            for node_type in ['Ancestors','Extants']:
                for pog_data in pog_all_data[node_type]:
                    if node_type == 'Ancestors':
                        node_name = 'N' + pog_data['Name']
                    else:
                        node_name = pog_data['Name']
                        extant_list.append(node_name)  # all extants
                    
                    node_edges_info_dict = defaultdict(list)
                    node_edges_reverse_info_dict = defaultdict(list)
                    
                    # read that node's data
                    nodes = pog_data['Size'] + 2
                    
                    # Edges from special Start node to the start nodes
                    for s in pog_data['Starts']:
                        node_edges_info_dict[0] += [s+1] 
                        node_edges_reverse_info_dict[s+1] += [0] 
            
                    # Edges from last node to the special End node
                    for e in pog_data['Ends']:
                        node_edges_info_dict[e + 1] += [nodes-1]
                        node_edges_reverse_info_dict[nodes-1] += [e + 1]

                    # create the adjency matrix for all nodes except from special node start
                    for ind,node in enumerate(pog_data['Indices']):
                        row_mat = node
                        row_col = pog_data['Adjacent'][ind]

                        for rc in row_col:
                            node_edges_info_dict[row_mat + 1] += [rc + 1]
                            node_edges_reverse_info_dict[rc + 1] += [row_mat + 1]
                            
                    # put all info together in the final dict
                    node_path_dict[node_name] = node_edges_info_dict
                    node_path_reverse_dict[node_name] = node_edges_reverse_info_dict
        
        return node_path_dict,node_path_reverse_dict,nodes,extant_list
    
## testing
# nwk_file_path = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/sample_tree/grasp_ancestors.nwk'
# pogs_file     = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/sample_tree/pogs.json'
# p = Pogs(pogs_file,nwk_file_path)
# p.create_node_info_dict()
# p.create_neighbor_object()

In [18]:
''' MIP Model for preferred path with least parsimonous score '''
class PhyloTree:
    def __init__(self,nodes,sequence_length,neighbor_dict,node_from_edge_dict,node_to_edge_dict,folder_location\
                 ,extant_list,tree_name):

        # 1 - define the configuration  and decision variables for the tree
        self.nodes = nodes
        self.sequence_length = sequence_length
        self.neighbor_dict = neighbor_dict
        self.node_from_edge_dict = node_from_edge_dict
        self.node_to_edge_dict = node_to_edge_dict
        self.folder_location = folder_location
        self.extant_list = extant_list
        self.tree_name = tree_name
        self.edges = {}
        self.positions = {}
        self.penalty = {}
        self.diff = {}
        self.objective = []
        self.M = 999

        # 2 - create a new model
        self.m = gp.Model("PreferredPathSolve")

        # 3 - create variables
        # 3.1 - create variables - positions and edges of the POG
        for node,node_edge_from_list in self.node_from_edge_dict.items():
            for pos_from in range(0,sequence_length):
                # positions
                pos_id = (node,pos_from)
                pos = self.m.addVar(vtype=GRB.BINARY, name="n-%s-%s"%pos_id)
                self.positions[pos_id] = pos
                
                # edges
                for pos_to in range(pos_from + 1, sequence_length):
                    edge_id = (node,pos_from,pos_to)
                    e = self.m.addVar(vtype=GRB.BINARY, name='e-%s-%s-%s'%edge_id)
                    self.edges[edge_id] = e
        
        
    def add_constraints_n_objective(self):
        
        # constraints for extants
        
        
        for e in self.extant_list:
            
            position_present = [] # track the position having amino acid
            
            ###### FORWARD EDGES #######
            extant_node_edges_forward = self.node_from_edge_dict[e]
            for pos_from,node_edge_from_list_item in extant_node_edges_forward.items(): 
                
                possible_edges = [ r for r in range(pos_from + 1,sequence_length)]
                edges_not_present_pos = list(set(possible_edges) - set(node_edge_from_list_item))
                
                for pos_to in edges_not_present_pos:
                    edge_from_id = (e,pos_from,pos_to)
                    self.m.addConstr(self.edges[edge_from_id] == 0,name=\
                                            "na_edge_constraint-%s-%s-%s"%(e,pos_from,pos_to))
                
                for pos_to in node_edge_from_list_item:
                    edge_from_id = (e,pos_from,pos_to)
                    self.m.addConstr(self.edges[edge_from_id] == 1,name=\
                                            "edge_constraint-%s-%s-%s"%(e,pos_from,pos_to))
                    position_present.append(pos_from)
                    position_present.append(pos_to)
                    
                    
                ######## EDGES RECON CONSTRAINT ##########
                # edges coming in
                edges_coming_in = [ r for r in range(0,pos_from)]
                
                #edges going out
                edges_going_out = [ r for r in range(pos_from + 1,sequence_length)]
                
                edges_coming_in_list = []
                edges_going_out_list = []
                
                for edges_coming_in_item in edges_coming_in:
                    edge_to_id = (e,edges_coming_in_item,pos_from)
                    edges_coming_in_list.append(self.edges[edge_to_id])
                        
                for edges_going_out_item in edges_going_out:
                    edge_to_id = (e,pos_from,edges_going_out_item)
                    edges_going_out_list.append(self.edges[edge_to_id])
                    
                self.m.addConstr(sum(edges_coming_in_list) == sum(edges_going_out_list),\
                                     name="edge_recon_constraint-%s-%s"%\
                               (e,pos_from))
                
                
                
            #### POSITIONS #####
            
            
            ## ALL OTHER POSITIONS - PRESENT ###
            for pos in list(set(position_present)):
                pos_id = (e,pos)
                self.m.addConstr(self.positions[pos_id] == 1,\
                                 name="other_position_constraint-%s-%s"%(e,pos))
                
           
            ## ALL OTHER POSITIONS - NOT PRESENT ###
            all_positions = [ps for ps in range(0,self.sequence_length)]
            for pos in list(set(all_positions) - set(position_present)):
                pos_id = (e,pos)
                self.m.addConstr(self.positions[pos_id] == 0,\
                                 name="other_position_constraint-%s-%s"%(e,pos))
            
                        
        
        
    def train(self,n_threads,time_out):
        
        # Params
        self.m.Params.Threads = n_threads
        self.m.Params.TimeLimit = time_out*60
        #self.m.Params.LogFile =  folder_location + 
        self.m.Params.LogToConsole = 0
        self.m.Params.Degenmoves=0
        
        # Optimize
        #self.total_objective = sum([o for o in self.objective])
        #print("self.total_objective",self.total_objective)
        #self.m.setObjective(self.total_objective, GRB.MINIMIZE)
        self.m.update()
        
        self.m.write((self.folder_location + 'pf_mip_formulation_' + self.tree_name + '.lp'))
        self.m.optimize()
        
        # Is feasible?
        return self.m.SolCount > 0
    
    
    def get_info(self):
        info_all = {}
        info_all["objective"] = self.m.ObjVal
        info_all["bound"] = self.m.ObjBound
        info_all["gap"] = self.m.MIPGap
        info_all["is_optimal"] = (self.m.status == GRB.OPTIMAL)
        info_all["num_nodes"] = self.m.NodeCount
        info_all["num_vars"] = self.m.NumIntVars + self.m.NumBinVars

        if self.m.SolCount > 0:
            print("objective: %0.2f"%info_all["objective"])
            print("bound: %0.2f"%info_all["bound"])
            print("gap: %0.2f"%info_all["gap"])

        return info_all
    
    def get_solution(self):
        all_node_paths = {}
        for node,node_edge_from_list in self.node_from_edge_dict.items():
            preferred_path = []
            for pos_from in range(0,sequence_length):
                pos_id = (node,pos_from)
                preferred_path.append(int(self.positions[pos_id].X))
                
            all_node_paths[node] = preferred_path
        return all_node_paths
               

In [19]:
# TEST EXAMPLE - 1
# folder_location = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/scripts/mip_files/'
# nodes = 3
# sequence_length = 6 #(start and end pos)
# neighbor_dict = dict({1:[2,3]})
# extant_list = [2,3]
# node_from_edge_dict = dict({\
#                   1:{0:[1,2],1:[2,3,4],2:[3,4],3:[4],4:[5]} ,\
#                   2:{0:[1],1:[2],2:[3],3:[4],4:[5]} ,\
#                   3:{0:[1],1:[2],2:[3],3:[4],4:[5]}})
# node_to_edge_dict = dict({\
#                   1:{1:[0],2:[0,1],3:[1,2],4:[1,2,3],5:[4]} ,\
#                   2:{1:[0],2:[1],3:[2],4:[3],5:[4]} ,\
#                   3:{1:[0],2:[1],3:[2],4:[3],5:[4]}\
#                     })
# tree_name = 'test_example_1'

In [20]:
#TEST EXAMPLE - 2
folder_location = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/scripts/mip_files/'
nodes = 3
sequence_length = 6 #(start and end pos)
neighbor_dict = dict({1:[2,3]})
extant_list = [2,3]
node_from_edge_dict = dict({\
                  1:{0:[1,2],1:[2,3,4],2:[3,4],3:[4],4:[5]} ,\
                  2:{0:[1],1:[2],2:[3],3:[4],4:[5]} ,\
                  3:{0:[1],1:[4],2:[],3:[],4:[5]}})
node_to_edge_dict = dict({\
                  1:{1:[0],2:[0,1],3:[1,2],4:[1,2,3],5:[4]} ,\
                  2:{1:[0],2:[1],3:[2],4:[3],5:[4]} ,\
                  3:{1:[0],2:[],3:[],4:[1],5:[4]}\
                    })
tree_name = 'test_example_2'

In [21]:
# TEST MIP Models for different trees


## sample tree
nwk_file_path = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/sample_tree/grasp_ancestors.nwk'
pogs_file     = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/data/sample_tree/pogs.json'
folder_location = '/Users/sanjanatule/Documents/uq/Projects/PreferredPath/scripts/mip_files/'
tree_name = 'sample_tree'
TreePogs = Pogs(pogs_file,nwk_file_path)
node_from_edge_dict,node_to_edge_dict,sequence_length,extant_list = TreePogs.create_node_info_dict()
nodes = len(node_from_edge_dict)
neighbor_dict = TreePogs.create_neighbor_object()
print("TOTAL NODES::",nodes)
print("TOTAL POSITIONS:",sequence_length)
print("EXTANT LIST:",extant_list)
print("NEIGHBOR DICT:",neighbor_dict)
node_from_edge_dict['A6']

TOTAL NODES:: 39
TOTAL POSITIONS: 16
EXTANT LIST: ['A3', 'A9', 'A4', 'A20', 'A8', 'A5', 'A19', 'A1', 'A17', 'A10', 'A7', 'A16', 'A18', 'A13', 'A15', 'A14', 'A11', 'A12', 'A2', 'A6']
NEIGHBOR DICT: defaultdict(<class 'list'>, {'N0': ['N1', 'N9'], 'N1': ['N2', 'N6'], 'N9': ['N10', 'N18'], 'N2': ['N3', 'N5'], 'N6': ['A5', 'N7'], 'N10': ['N11', 'N13'], 'N18': ['A2', 'A6'], 'N3': ['N4', 'A4'], 'N5': ['A20', 'A8'], 'N7': ['N8', 'A17'], 'N11': ['A10', 'N12'], 'N13': ['N14', 'N15'], 'N4': ['A3', 'A9'], 'N8': ['A19', 'A1'], 'N12': ['A7', 'A16'], 'N14': ['A18', 'A13'], 'N15': ['A15', 'N16'], 'N16': ['A14', 'N17'], 'N17': ['A11', 'A12']})


defaultdict(list,
            {0: [1],
             14: [15],
             1: [3],
             3: [4],
             4: [5],
             5: [6],
             6: [7],
             7: [10],
             10: [12],
             12: [14]})

In [22]:
start = time.time()
print("Start Time:",start)
n_threads = 1
time_out = 60

PyTree = PhyloTree(nodes,sequence_length,neighbor_dict,node_from_edge_dict,node_to_edge_dict,folder_location\
                   ,extant_list,tree_name)
PyTree.add_constraints_n_objective()
is_sat = PyTree.train(n_threads, time_out)
print("is_sat",is_sat)
total_time = ((time.time()-start))
print("-----------------------------")
print("Total time = %0.2f[m]"%total_time)
info = PyTree.get_info()
info["total_time"] = total_time
info["is_sat"] = is_sat
print("info",info)

if is_sat:
    all_node_paths = PyTree.get_solution()
    print("all_node_paths",all_node_paths)
else:
    print("Did not find any satisfactory solution to the model")

Start Time: 1662038151.466066
Set parameter Threads to value 1
Set parameter TimeLimit to value 3600
is_sat False
-----------------------------
Total time = 0.06[m]


AttributeError: Unable to retrieve attribute 'ObjVal'