### Canvas Creation from Enrichr

In [65]:
import pandas as pd 
import numpy as np
import pickle
import json
import requests
import math
import matplotlib
import uuid
import urllib
import time as time
from textwrap import dedent
from IPython.core.display import display, HTML
from string import Template
from random import seed, randint
from operator import itemgetter

The cell below gives arbitrary placeholder values for these variables; their actual values will be selected in the Appyter (so DON'T ADD THE CELL BELOW TO THE APPYTER).

In [66]:
# Names of the Enrichr gene-set libraries to process. library_processing()
# indexes into this list to build the download URL, and the driver loop
# iterates over every entry.
all_libraries = ['Transcription_Factor_PPIs', 'TRRUST_Transcription_Factors_2019', 'HumanCyc_2016', 'huMAP', 'KEA_2015', 'KEGG_2019_Human', 'KEGG_2019_Mouse']

In [67]:
def library_processing(library_index):
    """Download one Enrichr library and prepare it for annealing.

    Args:
        library_index: Position in the module-level ``all_libraries`` list of
            the library to fetch.

    Returns:
        A tuple ``(anneal_list, x_dimension, y_dimension)`` where
        ``anneal_list`` is a list of ``(name, gene_list, neighbor_score)``
        tuples (scores start at ``0.0``), padded with ``('', [], 0.0)`` dummy
        entries so that it holds exactly ``x_dimension * y_dimension`` items.
        Both dimensions equal ``ceil(sqrt(number_of_terms))``.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # NOTE(review): confirm this host is still the current Enrichr endpoint.
    url = ('https://amp.pharm.mssm.edu/Enrichr/geneSetLibrary?mode=text&libraryName='
           + all_libraries[library_index])

    anneal_list = []
    with urllib.request.urlopen(url) as response:
        for raw_line in response:
            line = raw_line.decode("utf-8").rstrip("\r\n")
            if not line.strip():
                # Skip blank lines instead of failing on a missing gene column.
                continue
            # Each line is "term<TAB><empty description><TAB>gene<TAB>gene...".
            fields = line.split("\t")
            # Drop empty tokens (e.g. from a trailing tab) rather than blindly
            # discarding the last token, which could be a real gene.
            genes = [gene for gene in fields[2:] if gene]
            anneal_list.append((fields[0], genes, 0.0))

    # determine the dimensions of the (square) canvas
    num_terms = len(anneal_list)
    x_dimension = math.ceil(math.sqrt(num_terms))
    y_dimension = x_dimension
    num_hex = x_dimension * y_dimension

    # add "dummy" hexagons so the rectangular shape is filled
    anneal_list.extend(('', [], 0.0) for _ in range(num_hex - num_terms))

    return anneal_list, x_dimension, y_dimension

def unzip_list(anneal_list):
    """Transpose a list of equal-shaped tuples into a list of column tuples."""
    return [column for column in zip(*anneal_list)]

In [68]:
def find_neighbors(ind, x_dimension, y_dimension):
    """Return the indices of the six neighbors of hexagon ``ind``.

    Hexagons are numbered row by row, ``x_dimension`` per row, for a total of
    ``x_dimension * y_dimension``. Cells on the border take some of their
    neighbors from the opposite side of the grid (wrap-around).

    Args:
        ind: Index of the hexagon whose neighbors are wanted.
        x_dimension: Number of hexagons per row.
        y_dimension: Number of rows.

    Returns:
        A list of six integer indices.
    """
    # NOTE(review): the offsets used for "middle" cells do not depend on row
    # parity; confirm that this matches the intended hexagon layout.
    num_hex = x_dimension * y_dimension

    if 0 <= ind <= x_dimension - 1:
        # top row (inc. corners)
        if ind == 0:
            # top left corner
            return [num_hex-1, num_hex-x_dimension, x_dimension-1, 2*x_dimension-1, ind+1,
                    ind+x_dimension]
        if ind == (x_dimension - 1):
            # top right corner
            return [ind-1, ind+x_dimension, 0, ind+x_dimension-1, num_hex-2, num_hex-1]
        # non-corner top row
        return [ind-1, ind+1, ind+x_dimension, ind+num_hex-x_dimension-1,
                ind+num_hex-x_dimension, ind+x_dimension-1]

    if (num_hex - x_dimension) <= ind <= num_hex - 1:
        # bottom row (inc. corners)
        if ind == (num_hex - x_dimension):
            # bottom left corner
            return [ind+1, ind-x_dimension, ind-x_dimension+1, 0, 1, num_hex-1]
        if ind == (num_hex - 1):
            # bottom right corner
            return [ind-1, ind-x_dimension, ind-x_dimension+1, 0, x_dimension-1,
                    num_hex-2*x_dimension]
        # non-corner bottom row
        return [ind-1, ind+1, ind-x_dimension, ind-x_dimension+1, ind-num_hex+x_dimension,
                ind-num_hex+x_dimension+1]

    # Rows are x_dimension wide, so column and row must be derived from
    # x_dimension (using y_dimension is only correct on a square grid).
    # Integer division keeps the parity test exact.
    column = ind % x_dimension
    odd_row = (ind // x_dimension) % 2 == 1

    if column == 0:
        if odd_row:
            # "inner" left edge (not top or bottom row)
            return [ind+x_dimension-1, ind+1, ind-x_dimension, ind-x_dimension+1,
                    ind+x_dimension, ind+x_dimension+1]
        # "outer" left edge (not top or bottom row)
        return [ind-1, ind+1, ind+x_dimension, ind+2*x_dimension-1, ind-x_dimension,
                ind+x_dimension-1]

    if column == x_dimension - 1:
        if odd_row:
            # "outer" right edge (not top or bottom row)
            return [ind-1, ind+1, ind-x_dimension, ind-x_dimension+1, ind+x_dimension,
                    ind-2*x_dimension+1]
        # "inner" right edge (not top or bottom row)
        return [ind-1, ind-x_dimension-1, ind-x_dimension, ind-x_dimension+1,
                ind+x_dimension, ind+x_dimension-1]

    # middle
    return [ind-1, ind+1, ind-x_dimension, ind-x_dimension+1, ind+x_dimension,
            ind+x_dimension+1]

In [69]:
# initially find fitness
def find_fitness(anneal_list, x_dimension, y_dimension):
    """Score every hexagon against its neighbors and total the scores.

    For each hexagon the score is the sum, over the indices returned by
    ``find_neighbors``, of ``shared / (len(neighbor_genes) + len(own_genes))``
    where ``shared`` counts the neighbor's genes that also occur in the
    hexagon's own gene list (``0.0`` when both lists are empty).

    Args:
        anneal_list: List of ``(name, gene_list, neighbor_score)`` tuples.
            It is updated in place: element 2 of every tuple is replaced by
            the freshly computed score.
        x_dimension: Number of hexagons per row.
        y_dimension: Number of rows.

    Returns:
        ``(fitness, anneal_list)`` where ``fitness`` is the sum of all
        per-hexagon scores and ``anneal_list`` is the same (mutated) list.
    """
    # NOTE(review): the ratio is named "jaccard" but divides by the summed
    # sizes rather than the size of the union; kept as-is to preserve scores.
    fitness = 0
    for i, entry in enumerate(anneal_list):
        own_genes = entry[1]
        # Build the set once per hexagon so each membership test is O(1)
        # instead of a scan of the whole gene list (assumes hashable genes).
        own_gene_set = set(own_genes)

        sum_neighbor_score = 0
        for index in find_neighbors(i, x_dimension, y_dimension):
            neighbor_genes = anneal_list[index][1]
            combined_size = len(neighbor_genes) + len(own_genes)
            if combined_size != 0:
                shared = sum(1 for value in neighbor_genes if value in own_gene_set)
                jaccard = shared / combined_size
            else:
                jaccard = 0.0
            sum_neighbor_score += jaccard

        # Tuples are immutable, so rebuild the entry with the new score.
        hex_list = list(entry)
        hex_list[2] = sum_neighbor_score
        anneal_list[i] = tuple(hex_list)
        fitness += sum_neighbor_score
    return fitness, anneal_list

# take indices of swapped hexagons
def find_swapped_fitness(anneal_list, swapped_a, swapped_b, old_fitness, x_dimension, y_dimension):
    """Update the total fitness after two hexagons have been swapped.

    Only the two swapped positions and their neighbors can change score, so
    their stored scores are subtracted from ``old_fitness`` and their
    recomputed scores added back.

    Args:
        anneal_list: List of ``(name, gene_list, neighbor_score)`` tuples with
            the swap already applied. It is not modified.
        swapped_a: Index of the first swapped hexagon.
        swapped_b: Index of the second swapped hexagon.
        old_fitness: Total fitness before the swap.
        x_dimension: Number of hexagons per row.
        y_dimension: Number of rows.

    Returns:
        ``(new_total_fitness, anneal_copy)`` where ``anneal_copy`` is a
        shallow copy of ``anneal_list`` whose affected entries carry their
        recomputed scores.
    """
    neighbors_a = find_neighbors(swapped_a, x_dimension, y_dimension)
    neighbors_b = find_neighbors(swapped_b, x_dimension, y_dimension)
    # The two neighborhoods can overlap (or a == b); keep each index once, in
    # first-seen order, so no hexagon is rescored more than once.
    hexagons_to_update = list(
        dict.fromkeys([swapped_a, swapped_b] + neighbors_a + neighbors_b)
    )
    anneal_copy = anneal_list.copy()

    new_fitness = 0
    # Recalculate scores for all hexagons that need updating
    for hex_index in hexagons_to_update:
        entry = anneal_copy[hex_index]

        # subtract out the stored score because it is about to change
        old_fitness -= entry[2]

        own_genes = entry[1]
        # One set per hexagon makes each membership test O(1)
        # (assumes hashable genes).
        own_gene_set = set(own_genes)

        sum_neighbor_score = 0
        for index in find_neighbors(hex_index, x_dimension, y_dimension):
            neighbor_genes = anneal_copy[index][1]
            combined_size = len(neighbor_genes) + len(own_genes)
            if combined_size != 0:
                shared = sum(1 for value in neighbor_genes if value in own_gene_set)
                jaccard = shared / combined_size
            else:
                jaccard = 0.0
            sum_neighbor_score += jaccard

        # Tuples are immutable, so rebuild the entry with the new score.
        hex_list = list(entry)
        hex_list[2] = sum_neighbor_score
        anneal_copy[hex_index] = tuple(hex_list)
        new_fitness += sum_neighbor_score

    return old_fitness + new_fitness, anneal_copy

In [70]:
def annealing(anneal_list, steps, old_fitness, x_dimension, y_dimension):
    """Improve the layout by repeatedly trying random pairwise swaps.

    Each step picks two random positions, swaps them, and asks
    ``find_swapped_fitness`` for the resulting total. The swap is undone
    when the total is less than or equal to the current one, and kept
    otherwise.

    Args:
        anneal_list: List of ``(name, gene_list, neighbor_score)`` tuples;
            swaps are applied to this list in place.
        steps: Number of swap attempts.
        old_fitness: Total fitness of ``anneal_list`` on entry.
        x_dimension: Number of hexagons per row.
        y_dimension: Number of rows.

    Returns:
        The list holding the final arrangement.
    """
    last_index = x_dimension * y_dimension - 1
    current_fitness = old_fitness

    for _ in range(steps):
        first = randint(0, last_index)
        second = randint(0, last_index)

        # Apply the trial swap before scoring it.
        anneal_list[first], anneal_list[second] = anneal_list[second], anneal_list[first]
        candidate_fitness, candidate_list = find_swapped_fitness(
            anneal_list, first, second, current_fitness, x_dimension, y_dimension
        )

        if candidate_fitness <= current_fitness:
            # No improvement: undo the swap and try again.
            anneal_list[first], anneal_list[second] = anneal_list[second], anneal_list[first]
            continue

        # Improvement: adopt the rescored copy and its fitness.
        current_fitness = candidate_fitness
        anneal_list = candidate_list

    return anneal_list

In [71]:
# Anneal every library in all_libraries, pickle the resulting
# (name, gene_list) layout, and record how long each library took.
times_list = []
for library_index, library_name in enumerate(all_libraries):
    t = time.time()

    anneal_list, x_dimension, y_dimension = library_processing(library_index)
    starting_fitness = find_fitness(anneal_list, x_dimension, y_dimension)[0]
    anneal_list = annealing(anneal_list, 100000, starting_fitness, x_dimension, y_dimension)

    # Keep only the names and gene lists; the neighbor scores are dropped.
    unzipped_anneal_list = unzip_list(anneal_list)
    names = unzipped_anneal_list[0]
    gene_lists = unzipped_anneal_list[1]
    processed_list = list(zip(names, gene_lists))

    output_path = 'Annealed-Libaries/' + library_name + '.txt'
    with open(output_path, 'wb+') as f:
        pickle.dump(processed_list, f)

    times_list.append((library_name, time.time() - t))

In [72]:
# Show the (library name, elapsed seconds) pairs collected by the loop above.
print(times_list)

[('Transcription_Factor_PPIs', 945.6568388938904), ('TRRUST_Transcription_Factors_2019', 93.70540809631348), ('HumanCyc_2016', 28.387732982635498), ('huMAP', 62.78419494628906), ('KEA_2015', 182.9156768321991), ('KEGG_2019_Human', 813.9302940368652), ('KEGG_2019_Mouse', 903.9501688480377)]
