In [4]:
from collections import Counter
import csv
import json
import time

import common

In [5]:
# Whether to sort all the sequences (nodes) after each round of pruning
SORT_SEQUENCES_AFTER_EACH_ROUND = True

OUTPUT_FILE = 'data/processed/final_paths.csv'
OUTPUT_FILE_ORDERED = 'data/processed/final_paths_ordered.csv'

# The maximum number of how many times is the sequence one level down in the tree as a child
MAX_COUNTS = {1: 6, 2: 5, 3: 4, 4: 3, 5: 2, 6: 1}

## Load the source data from EMOPEC

In [6]:
with open('data/raw/emopec_data.json', 'r') as file:
    EMOPEC_DATA = json.load(file)

## Generate all sequences for the tree creation

In [7]:
# Get all the specific sequences (all nucleotides are specific)
specific_sequences = EMOPEC_DATA.keys()

# Get all possible sequences except the specific ones
all_sequences = [common.RBSSequence(sequence) for sequence in common.generate_all_sequences(6) if
                 sequence not in specific_sequences]

In [8]:
# Get the tree root
root = None
for obj in all_sequences:
    if obj.sequence == "______":
        root = obj

In [9]:
# For the specific sequences, use the rel strength value from EMOPEC and add them to the all_sequences
for sequence in specific_sequences:
    all_sequences.append(common.RBSSequence(sequence, mean=EMOPEC_DATA[sequence]))

## Generate the tree and prune it level by level

In [10]:
# For sequences with different numbers of underscores, starting from 1
for underscores_count in range(1, 7):

    # Initialize a Counter object to count the number of times each sequence is one level down in the tree
    children_sequences_counter = Counter()

    # Start the round processing timer
    start = time.time()

    # Get the maximum number of how many times is the sequence one level down in the tree as a child
    current_max_count = MAX_COUNTS[underscores_count]

    print(f'Processing sequences with {underscores_count} underscore(s)...')

    # Get all sequences with the specified number of underscores
    current_sequences_objects = [sequence_object for sequence_object in all_sequences if
                                 sequence_object.sequence.count('_') == underscores_count]

    # For each sequence, get the possible children sequences and assign the objects from the all_sequences
    for sequence in current_sequences_objects:
        # Assign the children
        sequence.children = [child_object for child_object in all_sequences if
                             child_object.sequence in sequence.generate_children() and (
                                         child_object.children or child_object.is_specific())]

        # Count how many times each child is used
        for child in sequence.children:
            children_sequences_counter[child.sequence] += 1

        # Update the stats
        sequence.calculate_statistics()

    # Sort sequences by CV, descending order
    sorted_sequences = sorted(current_sequences_objects, key=lambda seq: seq.cv, reverse=True)

    # Use counter to track how many times is each child-sequence is used
    seq_counter = {sequence_object.sequence: current_max_count for sequence_object in all_sequences if
                   sequence_object.sequence.count('_') == underscores_count - 1 and (sequence_object.children or sequence_object.is_specific())}

    # Print the initial overall CV
    print(f'Initial CV for sequences with {underscores_count} underscore(s): {sum([seq.cv for seq in sorted_sequences])}')

    print('Prunning started...')

    loop_number = 0

    # # Loop until we decrease the count of each sequence to 1
    while True:

        loop_number += 1

        # For each sequence with an underscore
        for seq in sorted_sequences:
            # Remove the child with the mean furthest away from the mean of the parent
            child_to_remove = None
            max_difference = 0

            for child in seq.children:
                # Get the difference between the mean of the child and the mean of the parent
                difference = abs(child.mean - seq.mean)
                # If the difference is higher than the current max difference, select this child for removal
                if difference >= max_difference and seq_counter[child.sequence] != 1:
                    child_to_remove = child
                    max_difference = difference

            # If there is a child to remove, remove it and recalculate the statistics of the parent
            if child_to_remove:
                seq.children.remove(child_to_remove)
                seq.calculate_statistics()
                seq_counter[child_to_remove.sequence] -= 1

        stop = True
        for value in seq_counter.values():
            if value != 1:
                stop = False
                break

        print(seq_counter)

        if stop or loop_number > 30:
            if loop_number > 30:
                print('Pruning stopped after 30 loops!')
            break

        # Sort sequences again by CV, descending order
        if SORT_SEQUENCES_AFTER_EACH_ROUND:
            sorted_sequences = sorted(sorted_sequences, key=lambda seq: seq.cv, reverse=True)

    # After pruning, remove the used children objects
    for sequence in sorted_sequences:
        for child in sequence.children:
            try:
                all_sequences.remove(child)
            except ValueError:
                print(f'Error - the sequence {child.sequence} is not in the list of all sequences!')

    # Print the new overall CV
    print(
        f'New CV for sequences with {underscores_count} underscore(s): {sum([seq.cv for seq in sorted_sequences])}')

    # Print the time spent
    print(f'Time spent: {time.time() - start}')

Processing sequences with 1 underscore(s)...
Initial CV for sequences with 1 underscore(s): 2725.4171005429653
Prunning started...
{'AAAAAA': 6, 'AAAAAC': 6, 'AAAAAG': 5, 'AAAAAT': 6, 'AAAACA': 6, 'AAAACC': 6, 'AAAACG': 5, 'AAAACT': 6, 'AAAAGA': 5, 'AAAAGC': 5, 'AAAAGG': 1, 'AAAAGT': 5, 'AAAATA': 6, 'AAAATC': 6, 'AAAATG': 5, 'AAAATT': 6, 'AAACAA': 6, 'AAACAC': 5, 'AAACAG': 5, 'AAACAT': 6, 'AAACCA': 6, 'AAACCC': 6, 'AAACCG': 5, 'AAACCT': 6, 'AAACGA': 5, 'AAACGC': 6, 'AAACGG': 4, 'AAACGT': 5, 'AAACTA': 6, 'AAACTC': 6, 'AAACTG': 6, 'AAACTT': 5, 'AAAGAA': 4, 'AAAGAC': 5, 'AAAGAG': 4, 'AAAGAT': 4, 'AAAGCA': 5, 'AAAGCC': 5, 'AAAGCG': 3, 'AAAGCT': 5, 'AAAGGA': 3, 'AAAGGC': 3, 'AAAGGG': 4, 'AAAGGT': 4, 'AAAGTA': 5, 'AAAGTC': 6, 'AAAGTG': 4, 'AAAGTT': 5, 'AAATAA': 6, 'AAATAC': 6, 'AAATAG': 5, 'AAATAT': 6, 'AAATCA': 6, 'AAATCC': 5, 'AAATCG': 6, 'AAATCT': 6, 'AAATGA': 5, 'AAATGC': 5, 'AAATGG': 4, 'AAATGT': 5, 'AAATTA': 5, 'AAATTC': 5, 'AAATTG': 6, 'AAATTT': 6, 'AACAAA': 5, 'AACAAC': 6, 'AACAAG': 

In [11]:
# Get all sequences in the tree and their CV
cv_dict = common.collect_node_data(root)
# Save the dict as json - so we can use the file to check the final CV of each sequence after the pruning
with open('data/processed/cv_dict.json', 'w') as fp:
    json.dump(cv_dict, fp)

# Get all paths from the root up to the leaves
all_paths = common.get_all_paths(root)

csv_filename = OUTPUT_FILE_ORDERED if SORT_SEQUENCES_AFTER_EACH_ROUND else OUTPUT_FILE

with open(csv_filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)

    csvwriter.writerow(['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6', 'Level 7', 'Relative Strength'])

    # Write the paths to the CSV
    for path in all_paths:

        # Extend the path with the relative strength from EMOPEC
        rel_strength = EMOPEC_DATA[path[-1]]
        path.append(rel_strength)

        print(path)

        csvwriter.writerow(path)

['______', 'A_____', 'AA____', 'AAT___', 'AATT__', 'AATTT_', 'AATTTA', 0.119]
['______', 'A_____', 'AA____', 'AAT___', 'AATT__', 'AATT_T', 'AATTCT', 0.1175]
['______', 'A_____', 'AA____', 'AAT___', 'AAT_C_', 'AATTC_', 'AATTCA', 0.1068]
['______', 'A_____', 'AA____', 'AA_T__', 'AA_TT_', 'AA_TTA', 'AACTTA', 0.1186]
['______', 'A_____', 'AA____', 'AA__C_', 'AA__CC', 'AAA_CC', 'AAATCC', 0.1145]
['______', 'A_____', 'AA____', 'AA__T_', 'AAT_T_', 'AATAT_', 'AATATG', 0.1336]
['______', 'A_____', 'AA____', 'AA__T_', 'AAT_T_', 'AAT_TT', 'AATATT', 0.1334]
['______', 'A_____', 'AA____', 'AA__T_', 'AA_CT_', 'AA_CTC', 'AAACTC', 0.1208]
['______', 'A_____', 'AA____', 'AA__T_', 'AA__TA', 'AAC_TA', 'AACATA', 0.1273]
['______', 'A_____', 'AA____', 'AA__T_', 'AA__TC', 'AAA_TC', 'AAAATC', 0.1204]
['______', 'A_____', 'AA____', 'AA___C', 'AA_C_C', 'AA_CGC', 'AACCGC', 0.1177]
['______', 'A_____', 'AT____', 'ATC___', 'ATCC__', 'ATCC_C', 'ATCCGC', 0.229]
['______', 'A_____', 'AT____', 'AT_C__', 'AT_CA_', 'AT