# Project to AKS

## Reference 

- https://homel.vsb.cz/~vas218/pdf/acs/grammar.pdf
- https://homel.vsb.cz/~vas218/pdf/acs/vasinek-thesis.pdf
- https://homel.vsb.cz/~vas218/acs.html

## Chosen task

- Reduction paradox

## Description

- Try to reduce the message while maximizing its entropy at every step

## Official description 

- **RePair - maximal anticompression - reduction paradox**
    - Find the smallest possible representation (measured in the number of symbols) of a file using the reduction paradox which leads to the largest increase of the zero-order entropy of the representation.
    - Heuristics - largest first, random
    - Describe the algorithm and summarize results to a .doc(x) or .pdf report.
    - Prepare a presentation for 10 minutes about your method.
    - Literature: Vasinek, Dissertation thesis (Chapter 5)

## Steps

- Load dataset
- Can pick a subset for faster running (sample it)
- Find every bigram and try to create a rule for it
- Calculate the entropy of the new message
- Find the rule that reduces the size of the message but increases its entropy (making it harder to compress)
- Find extremes for every file
- Show graphs

In [1]:
import os
import sys

def adding_module_path():
    """Make the parent of the current working directory importable.

    The absolute path of ``..`` is appended to ``sys.path`` unless it is
    already listed there, so calling this repeatedly adds it only once.
    """
    parent_dir = os.path.abspath("..")
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
from src.get_probs import get_sorted_probs_as_df
import numpy as np
import pandas as pd
import time
from src.save import save_both
from enum import Enum
import re
from ast import literal_eval
import random
import plotly.express as px


In [3]:
# Number of characters sampled from each dataset for the quick test run
# (passed to get_datasets as the ``normalization`` argument).
TEST_NORMALIZATION_SIZE = 10000

In [4]:
def load_dataset(type, normalization=None):
    """Load a dataset and optionally reduce it to a random sample.

    Args:
        type: Dataset identifier forwarded unchanged to ``get_dataset``.
        normalization: Number of characters to sample, or ``None`` to
            return the loaded data as is.

    Returns:
        A ``(data, path)`` pair. When ``normalization`` is given, ``data``
        is a string of that many characters drawn with
        ``np.random.choice`` from the loaded data.

    NOTE(review): ``np.random.choice`` draws each character independently
    (with replacement by default), so the sample is not a contiguous
    excerpt and does not keep the original character order.
    """
    content, location = get_dataset(type)
    if normalization is not None:
        sampled = np.random.choice(list(content), normalization)
        content = "".join(sampled)
    return content, location

In [5]:
def get_datasets(normalization=None):
    """Load the datasets enabled for the quick test run.

    Args:
        normalization: Sample size forwarded to ``load_dataset``
            (``None`` loads the full data).

    Returns:
        A list of ``(data, path, dataset_type)`` tuples, one per enabled
        dataset.
    """
    # Only DNA is enabled here; english, proteins and sources are switched off.
    enabled_types = [DataSets.dna]
    return [
        (*load_dataset(dataset_type, normalization), dataset_type)
        for dataset_type in enabled_types
    ]


In [6]:
test_datasets = get_datasets(TEST_NORMALIZATION_SIZE)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [7]:
test_datasets

[('TTATGTTCCTATCAGTATAGACTGCAGCTGAAAGTTTAGCAGATGCCATCTGAACAGCTCTAATGCTATATAGAAGTAGCTACTCAAAACGTTAGTAATCACGGCTAGCTATACTTGGGAAAAACCTTGCTCGCCCCAGTTGTATTCGCATATGTGCTTCATGGACACATAGATCGACTGGAAACCGTTACTCTAAGTTGGATTCTTGGAAATACGAAATAGGTGATAATCTTGCACGAATCATGAGTCACAACGCAGTATGGAGTGTCGACTCAAGACCCTACATTCACTTACCGGTCCTTAAGTGTAAAAAACGCACATTGGACTAATCAGTGAACGACGTTACTTATGATCCACTGCGTTGAATGTCTGGTTAGACCCGTTATATGTAATTCCGTATTGGGGAATTATCTTAACTCCAGCCTTTTCAATCTTCGAGTGAGTTTGCTTACGGAAGAATGGTATTCTGACATATCGTTCGATATCCTCCCCGAAATAAGCGATTAAGTGGAGAGTAAAGTATCTTCTGGATAGCGGTAGCTGTCGTTCGTTTAAAAGTTGATAGGAAAGATACTTGCCTTCTATCAATGCCATATCAAAAGTGTCTCACAGGCAACGATATCAATAGTGGCCACGTCATCATTGCAATTCGCACAGGCGCATCCCCCTTTGTGTTTCTTGGAAAACTGTGTGCATATTAATAAGTGATTTTGCAGCTTAACCAAGGAACTAATGGATCTCCAGGCATACTGAGTCAGAGAAAAAAACCTGCGTCCCCGGAAGCGTAGCATTTATTTCACAGTTTAGTTTTTGTTGGCATCAAAAGTTTTTTGACTGAGATACAGATTTGGTCAGCGACCCGGACCTCCCGGGTCTGTTTGGTAGAGATTTGCTTACTGTACAAATAGTGATACTCCAACGAGGATTGCAAGATGGTGGTCGCTTCACACTCTTGTAGAGCGGACAGTAGACCTCGCAGAACGATCCTAGACTAATA

In [8]:
data_test, data_path, data_type = test_datasets[0]

In [9]:
def find_k_grams_freq(data, max_size_k=2):
    """Count every k-gram of ``data`` for ``2 <= k <= max_size_k``.

    Occurrences are counted with a sliding window, so overlapping
    occurrences are all counted (``"AAA"`` contains ``"AA"`` twice).

    Args:
        data: Sequence to scan (a string in this notebook).
        max_size_k: Largest k-gram length to count; values below 2 yield
            an empty result.

    Returns:
        A dict mapping each k-gram to its number of occurrences, in order
        of first appearance.
    """
    kgrams_dic = {}

    for k in range(2, max_size_k + 1):
        # The last window starts at len(data) - k, hence the "+ 1": without
        # it the final k-gram of the message is never counted.
        for start in range(len(data) - k + 1):
            n_gram = data[start:start + k]
            kgrams_dic[n_gram] = kgrams_dic.get(n_gram, 0) + 1

    return kgrams_dic

In [10]:
find_k_grams_freq(data_test)

{'TT': 854,
 'TA': 827,
 'AT': 849,
 'TG': 629,
 'GT': 636,
 'TC': 640,
 'CC': 422,
 'CT': 611,
 'CA': 580,
 'AG': 604,
 'GA': 619,
 'AC': 573,
 'GC': 440,
 'AA': 815,
 'CG': 462,
 'GG': 435,
 'T\n': 1,
 '\nA': 1}

In [11]:
import math
from collections import Counter

def calc_freq(content):
    """Return a ``Counter`` with the number of occurrences of each symbol in ``content``."""
    tally = Counter()
    for symbol in content:
        tally[symbol] += 1
    return tally

def calc_p(counter, n):
    """Turn symbol counts into relative frequencies.

    Args:
        counter: Mapping (or anything ``dict()`` accepts) of symbol -> count.
        n: Value every count is divided by (the total number of symbols).

    Returns:
        A dict mapping each symbol to ``count / n``.
    """
    return {symbol: count / n for symbol, count in dict(counter).items()}

def get_n(counter):
    """Return the sum of all counts in ``counter`` (computed with ``np.sum``)."""
    occurrences = list(dict(counter).values())
    return np.sum(occurrences)

def calc_H(p):
    """Compute the Shannon entropy (in bits) of a probability distribution.

    Args:
        p: Mapping of symbol -> probability.

    Returns:
        ``-sum(p_i * log2(p_i))`` over all symbols. Symbols with probability
        exactly 0 contribute nothing (the usual ``0 * log 0 = 0``
        convention) instead of raising a math domain error.

    Raises:
        ValueError: If a probability is negative (``math.log2`` rejects it).
    """
    H = 0
    for probability in p.values():
        if probability == 0:
            # log2(0) is undefined; the limit of x * log2(x) for x -> 0 is 0.
            continue
        # Shannon equation
        H += probability * math.log2(probability)
    return -H

def calc_entropy_for_message(message):
    """Return the zero-order entropy of ``message``.

    The symbol frequencies of the message are converted to probabilities
    and passed to ``calc_H``.
    """
    frequencies = calc_freq(message)
    total = get_n(frequencies)
    return calc_H(calc_p(frequencies, total))

In [12]:
calc_entropy_for_message(data_test)

1.9827831849450892

In [1]:
def diff_entropy(message1, message2):
    """Compare the entropy-weighted sizes of two messages.

    For each message the value ``entropy * length`` is computed; the
    comparison is printed and the difference returned.

    Returns:
        ``entropy(message1) * len(message1) - entropy(message2) * len(message2)``.
        A negative value means ``message2`` has the larger entropy-weighted
        size.
    """
    entropy_1 = calc_entropy_for_message(message1)
    entropy_2 = calc_entropy_for_message(message2)
    length_1 = len(message1)
    length_2 = len(message2)

    diff = entropy_1 * length_1 - entropy_2 * length_2
    print(f'{entropy_1} * {length_1} < {entropy_2} * {length_2} ... {diff}')
    return diff

In [2]:
diff_entropy(data_test, data_test)

NameError: name 'data_test' is not defined

In [15]:
# Every code point from U+0020 up to U+10FFFF for which str.isprintable() is true.
all_chars_uni = tuple(filter(str.isprintable, map(chr, range(32, 0x110000))))

In [16]:
# Characters for code points 0..255 (one per byte value).
all_chars_ascii = [chr(code_point) for code_point in range(256)]

In [17]:
def find_not_existing_character(current_alpahbet, gen_chars=all_chars_uni):
    """Pick a random character that does not occur in the current alphabet.

    Args:
        current_alpahbet: Iterable of the symbols already in use.
        gen_chars: Pool of characters to choose from.

    Returns:
        One character from ``gen_chars`` that is not in ``current_alpahbet``.

    Raises:
        IndexError: If every character of ``gen_chars`` is already in use.
    """
    used = set(current_alpahbet)
    # Keep the pool in its original order (de-duplicated) rather than relying
    # on set iteration order, which varies between interpreter runs for
    # strings; this makes the pick reproducible under random.seed().
    candidates = [char for char in dict.fromkeys(gen_chars) if char not in used]
    if not candidates:
        raise IndexError("no unused replacement character is left in gen_chars")
    return random.choice(candidates)

In [18]:
find_not_existing_character(['A', 'C', 'G', 'T'])

'©Ωó'

In [19]:
def transform_message(message, ngram_for_replace):
    """Replace an n-gram in ``message`` with a fresh single symbol.

    The replacement symbol is chosen by ``find_not_existing_character``
    from characters that do not occur in ``message``.

    Returns:
        A ``(new_message, replace_character)`` pair, where ``new_message``
        is ``message`` with the occurrences of ``ngram_for_replace``
        substituted via ``str.replace``.
    """
    alphabet = np.unique(list(message))
    replace_character = find_not_existing_character(alphabet)
    transformed = message.replace(ngram_for_replace, replace_character)
    return transformed, replace_character

In [53]:
def calculate_for_ngrams_diff(ngrams, message, method_entropy=diff_entropy, init_message=None):
    """Evaluate every candidate n-gram replacement on ``message``.

    Args:
        ngrams: Mapping of n-gram -> occurrence count.
        message: Current message the replacements are tried on.
        method_entropy: Callable ``(current, candidate) -> number`` used to
            score a replacement.
        init_message: Accepted for callers that pass the initial message;
            not used by the scoring, which always compares against the
            current ``message``.

    Returns:
        A dict mapping each n-gram to ``{"Counter": count, "Diff": score}``.
    """
    table = {}
    for n_gram, occurrences in ngrams.items():
        candidate, _ = transform_message(message, n_gram)
        table[n_gram] = {
            "Counter": occurrences,
            "Diff": method_entropy(message, candidate),
        }
    return table

In [54]:
# Score every bigram of the test sample: for each one, the entropy-size
# difference between the current message and the message with that bigram
# replaced by a new symbol.
r = calculate_for_ngrams_diff(
        find_k_grams_freq(data_test),
        data_test,
        diff_entropy,
        data_test
    )

# Show the scores as a table with one row per bigram.
pd.DataFrame.from_dict(
    r, 
    orient="index"
)

Unnamed: 0,Counter,Diff
TT,854,-729.2762
TA,827,-882.229748
AT,849,-863.358991
TG,629,-682.001728
GT,636,-676.604391
TC,640,-644.63445
CC,422,-422.363488
CT,611,-667.794745
CA,580,-654.192984
AG,604,-663.345945


In [55]:
r

{'TT': {'Counter': 854, 'Diff': -729.2761996854606},
 'TA': {'Counter': 827, 'Diff': -882.2297480514208},
 'AT': {'Counter': 849, 'Diff': -863.3589910351257},
 'TG': {'Counter': 629, 'Diff': -682.0017279815547},
 'GT': {'Counter': 636, 'Diff': -676.6043906316991},
 'TC': {'Counter': 640, 'Diff': -644.6344500735722},
 'CC': {'Counter': 422, 'Diff': -422.36348846448163},
 'CT': {'Counter': 611, 'Diff': -667.7947450954998},
 'CA': {'Counter': 580, 'Diff': -654.1929841770943},
 'AG': {'Counter': 604, 'Diff': -663.3459446156303},
 'GA': {'Counter': 619, 'Diff': -651.9364852830295},
 'AC': {'Counter': 573, 'Diff': -658.9366269316452},
 'GC': {'Counter': 440, 'Diff': -508.5517264855989},
 'AA': {'Counter': 815, 'Diff': -694.9585052221155},
 'CG': {'Counter': 462, 'Diff': -494.0477182414543},
 'GG': {'Counter': 435, 'Diff': -450.20117376424605},
 'T\n': {'Counter': 1, 'Diff': 1.7608965062972857},
 '\nA': {'Counter': 1, 'Diff': 1.8146956160526315}}

In [56]:
r.items()

dict_items([('TT', {'Counter': 854, 'Diff': -729.2761996854606}), ('TA', {'Counter': 827, 'Diff': -882.2297480514208}), ('AT', {'Counter': 849, 'Diff': -863.3589910351257}), ('TG', {'Counter': 629, 'Diff': -682.0017279815547}), ('GT', {'Counter': 636, 'Diff': -676.6043906316991}), ('TC', {'Counter': 640, 'Diff': -644.6344500735722}), ('CC', {'Counter': 422, 'Diff': -422.36348846448163}), ('CT', {'Counter': 611, 'Diff': -667.7947450954998}), ('CA', {'Counter': 580, 'Diff': -654.1929841770943}), ('AG', {'Counter': 604, 'Diff': -663.3459446156303}), ('GA', {'Counter': 619, 'Diff': -651.9364852830295}), ('AC', {'Counter': 573, 'Diff': -658.9366269316452}), ('GC', {'Counter': 440, 'Diff': -508.5517264855989}), ('AA', {'Counter': 815, 'Diff': -694.9585052221155}), ('CG', {'Counter': 462, 'Diff': -494.0477182414543}), ('GG', {'Counter': 435, 'Diff': -450.20117376424605}), ('T\n', {'Counter': 1, 'Diff': 1.7608965062972857}), ('\nA', {'Counter': 1, 'Diff': 1.8146956160526315})])

In [57]:
def pick_largest(items):
    """Return the entry with the smallest (most negative) ``'Diff'`` value.

    ``items`` is an iterable of ``(n_gram, stats)`` pairs where ``stats``
    has a ``'Diff'`` key. On ties the entry that comes first in ``items``
    wins.
    """
    def entropy_delta(entry):
        _, stats = entry
        return stats['Diff']

    ranked = sorted(items, key=entropy_delta)
    return ranked[0]

def pick_random(items):
    """Return one entry of the sequence ``items`` chosen by ``random.choice``."""
    chosen = random.choice(items)
    return chosen

def pick_only_decreasing(dic, pick_method=pick_largest):
    """Choose a rule among the n-grams whose ``'Diff'`` is negative.

    Args:
        dic: Mapping of n-gram -> stats dict containing a ``'Diff'`` key.
        pick_method: Callable that selects one entry from the list of
            ``(n_gram, stats)`` candidates.

    Returns:
        The entry selected by ``pick_method``, or ``None`` when no n-gram
        has a negative ``'Diff'``.
    """
    candidates = [entry for entry in dic.items() if entry[1]['Diff'] < 0]
    if not candidates:
        return None
    return pick_method(candidates)

In [58]:
pick_only_decreasing(r, pick_largest)

('TA', {'Counter': 827, 'Diff': -882.2297480514208})

# Algorithm implementation - Reduction paradox

In [59]:
class TableFields(Enum):
    """Column names of the per-step result record built by ``create_value``."""

    # Textual form of the applied rule: "<n-gram> -> <replacement character>".
    Rule = "Rule"
    # The 'Diff' score of the applied rule.
    EntropyMove = "EntropyMove"
    # Dataset type label supplied by the caller (may be None).
    DataType = "DataType"
    # Free-form dataset description supplied by the caller (may be None).
    DescriptionData = "DescriptionData"
    # Length of the message after applying the rule.
    MessageSize = "MessageSize"
    # Number of distinct symbols in the message after applying the rule.
    AlphabetSize = "AlphabetSize"
    # Number of rules collected in the grammar so far.
    GrammaticSize = "GrammaticSize"
    # Elapsed time.time() difference since the step's start timestamp.
    CalcTime = "CalcTime"


In [60]:
def create_value(message_0, message_1, diff, replace_character, n_gram, grammatic, tic, type_data=None, description_data=None):
    """Register the applied rule and build the result record for one step.

    Side effect: stores ``n_gram -> replace_character`` in ``grammatic``.

    Args:
        message_0: Message before the replacement (not used in the record).
        message_1: Message after the replacement.
        diff: Score of the applied rule.
        replace_character: Symbol that replaced ``n_gram``.
        n_gram: The replaced n-gram.
        grammatic: Dict of rules collected so far; updated in place.
        tic: ``time.time()`` timestamp taken when the step started.
        type_data: Optional dataset type label.
        description_data: Optional dataset description.

    Returns:
        A dict keyed by the ``TableFields`` values.
    """
    alphabet_size = len(set(message_1))
    message_size = len(message_1)
    rule_text = f"{n_gram} -> {replace_character}"

    grammatic[n_gram] = replace_character

    elapsed = time.time() - tic

    record = {}
    record[TableFields.Rule.value] = rule_text
    record[TableFields.EntropyMove.value] = diff
    record[TableFields.DataType.value] = type_data
    record[TableFields.DescriptionData.value] = description_data
    record[TableFields.MessageSize.value] = message_size
    record[TableFields.AlphabetSize.value] = alphabet_size
    record[TableFields.GrammaticSize.value] = len(grammatic)
    record[TableFields.CalcTime.value] = elapsed
    return record


In [66]:
def algorithm_step(init_message, message, grammatic, type_data=None, description_data=None, tic=None, heuristics_method=pick_largest):
    """Perform one reduction step: pick a bigram rule and apply it.

    Args:
        init_message: The original message (forwarded to the scoring).
        message: The current message.
        grammatic: Dict of rules collected so far; updated in place when a
            rule is applied.
        type_data: Optional dataset type label stored in the record.
        description_data: Optional dataset description stored in the record.
        tic: ``time.time()`` timestamp of the step start; when omitted the
            current time is used so the elapsed time can still be computed.
        heuristics_method: Callable selecting one entry among the
            candidates with a negative ``'Diff'``.

    Returns:
        ``(transformed_message, record)`` when a rule was applied, or
        ``None`` when no n-gram has a negative ``'Diff'``.
    """
    if tic is None:
        # create_value computes "now - tic"; a None timestamp would raise
        # TypeError there, so start timing from this point instead.
        tic = time.time()

    # Find ngrams
    n_grams = find_k_grams_freq(message)

    diff_table = calculate_for_ngrams_diff(
        n_grams,
        message,
        diff_entropy,
        init_message
    )

    picked = pick_only_decreasing(diff_table, heuristics_method)
    print(picked)

    if picked is None:
        return None

    # picked looks like ('AT', {'Counter': 867, 'Diff': -0.2814694848677286})
    n_gram, dic_values = picked
    transformed_message, replace_character = transform_message(message, n_gram)

    record = create_value(
        message,
        transformed_message,
        dic_values['Diff'],
        replace_character,
        n_gram,
        grammatic,
        tic,
        type_data,
        description_data,
    )
    return transformed_message, record


In [67]:
def algorithm(message, type_data=None, description_data=None, limit_step=None, heuristics_method=None):
    """Repeatedly apply reduction steps until no rule qualifies.

    Args:
        message: The message to reduce.
        type_data: Optional dataset type label stored in every record.
        description_data: Optional dataset description stored in every record.
        limit_step: Stop after this many applied rules; ``None`` means run
            until ``algorithm_step`` finds no applicable rule.
        heuristics_method: Optional callable used to pick a rule among the
            candidates (e.g. ``pick_largest`` or ``pick_random``). When
            ``None``, ``algorithm_step`` uses its own default.

    Returns:
        A dict mapping the 1-based step number to that step's record.
    """
    init_message = message

    res = {}
    grammatic = {}

    # Forward the heuristic only when one was given, so algorithm_step keeps
    # its own default otherwise.
    step_options = {}
    if heuristics_method is not None:
        step_options["heuristics_method"] = heuristics_method

    step = 0
    while limit_step is None or limit_step != step:
        tic = time.time()
        step_value = algorithm_step(
            init_message, message, grammatic, type_data, description_data, tic, **step_options
        )
        print(step, len(message))
        print('\n')
        if step_value is None:
            break

        message, value = step_value
        step += 1
        res[step] = value

    return res

## Testing data

In [68]:
test_result = algorithm(data_test, limit_step=20)

('TA', {'Counter': 827, 'Diff': -882.2297480514208})
0 10000


('GT', {'Counter': 464, 'Diff': -569.5022345748221})
1 9173


('AC', {'Counter': 400, 'Diff': -575.0995729161186})
2 8709


('CG', {'Counter': 299, 'Diff': -407.07225767593627})
3 8309


('AT', {'Counter': 414, 'Diff': -320.72796692238626})
4 8010


('GC', {'Counter': 299, 'Diff': -256.91591626573427})
5 7596


('TG', {'Counter': 211, 'Diff': -204.81810819133534})
6 7297


('CA', {'Counter': 212, 'Diff': -203.95397383572345})
7 7086


('T‚®ñ', {'Counter': 141, 'Diff': -147.2831482531983})
8 6874


('‚®ñA', {'Counter': 100, 'Diff': -127.98022970538295})
9 6733


('G‚®ñ', {'Counter': 89, 'Diff': -91.30783315588633})
10 6633


('CC', {'Counter': 175, 'Diff': -87.99567853513145})
11 6544


('AG', {'Counter': 167, 'Diff': -83.78315789410772})
12 6403


('Tòèø', {'Counter': 81, 'Diff': -78.29208661933808})
13 6236


('‡™≥T', {'Counter': 44, 'Diff': -69.94522600013806})
14 6155


('‚®ñT', {'Counter': 91, 'Diff': -62.1104991591928

In [64]:
df = pd.DataFrame.from_dict(test_result, orient="index")
df

Unnamed: 0,Rule,EntropyMove,DataType,DescriptionData,MessageSize,AlphabetSize,GrammaticSize,CalcTime
1,TA -> ÁÅü,-882.229748,,,9173,6,1,0.482255
2,GT -> Ê∏á,-569.502235,,,8709,7,2,0.647222
3,AC -> ò°º,-575.099573,,,8309,8,3,0.863668
4,CG -> ¢ß°,-407.072258,,,8010,9,4,1.096606
5,AT -> ´µè,-320.727967,,,7596,10,5,1.413561
6,GC -> Êù¶,-256.915916,,,7297,11,6,1.734922
7,TG -> £¥®,-204.818108,,,7086,12,7,2.029316
8,CA -> ¨¥∑,-203.953974,,,6874,13,8,2.468467
9,TÁÅü -> ©∑ü,-147.283148,,,6733,14,9,2.637513
10,ÁÅüA -> ©•∫,-127.98023,,,6633,15,10,3.274131


In [45]:
# Line chart of the EntropyMove value per step, labelled with 3 decimals.
fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.3f" % x) for x in df.EntropyMove.values], title='Entropy paradox')
fig.show()

In [46]:
# Line chart of the message size per step; also written to test.png.
fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
fig.show()
fig.write_image("test.png")

# Real experiment data

In [47]:
def get_datasets(normalization=None):
    """Load the datasets used for the real experiment.

    Args:
        normalization: Sample size forwarded to ``load_dataset``
            (``None`` loads the full data).

    Returns:
        A list of ``(data, path, dataset_type)`` tuples, one per enabled
        dataset, in the order DNA, proteins.
    """
    # english and sources are switched off for the experiment.
    enabled_types = [DataSets.dna, DataSets.proteins]
    return [
        (*load_dataset(dataset_type, normalization), dataset_type)
        for dataset_type in enabled_types
    ]

In [48]:
# Sample sizes the experiment is run for (default of run_algorithm_for_datasets).
NORM_VALUES = [10000, 20000]
# File name of the per-run step table written by save_dataframe.
CSV_NAME = "steps.csv"
# File names of the two charts written by write_images.
ENTROPY_GRAPH = "entropy_paradox.png"
MESSAGE_GRAPH = "message_size.png"

In [49]:
def write_images(df, path):
    """Write the entropy and message-size line charts of a run into ``path``.

    Args:
        df: Step table with ``EntropyMove`` and ``MessageSize`` columns.
        path: Directory the two image files are written to.
    """
    entropy_labels = [("%.3f" % move) for move in df.EntropyMove.values]
    entropy_fig = px.line(df, x=df.index, y=df.EntropyMove, text=entropy_labels, title='Entropy paradox')
    entropy_target = os.path.sep.join([path, ENTROPY_GRAPH])
    entropy_fig.write_image(entropy_target)

    size_fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
    size_target = os.path.sep.join([path, MESSAGE_GRAPH])
    size_fig.write_image(size_target)

In [50]:
def save_dataframe(df, path):
    """Write ``df`` as ``CSV_NAME`` inside the directory ``path``.

    The frame's index is not written (``index=False``).
    """
    target = os.path.sep.join((path, CSV_NAME))
    df.to_csv(target, index=False)

In [51]:
def run_algorithm_for_datasets(normalization_values=NORM_VALUES, limit_steps=None):
    """Run the reduction algorithm for every dataset and sample size.

    For each combination the step table and, when at least one step was
    performed, the two charts are written to the directory
    ``<dataset type>/<sample size>/<limit_steps>``.

    Args:
        normalization_values: Iterable of sample sizes to run.
        limit_steps: Maximum number of steps per run; ``None`` for no limit.
    """
    limit_steps_str = "None" if limit_steps is None else str(limit_steps)

    for n_v in normalization_values:
        for data, data_path, data_type in get_datasets(n_v):
            # presumably data_type is a DataSets enum member whose value is
            # a string usable as a directory name; verify against DataSets
            data_type_string = data_type.value
            steps_path = os.path.sep.join([data_type_string, str(n_v), limit_steps_str])

            # exist_ok avoids the check-then-create race of isdir + makedirs.
            os.makedirs(steps_path, exist_ok=True)

            res = algorithm(data, data_type_string, "", limit_step=limit_steps)
            df = pd.DataFrame.from_dict(res, orient="index")

            save_dataframe(df, steps_path)
            if df.empty:
                # No step was performed, so the frame has no columns to plot;
                # write_images would fail on df.EntropyMove.
                continue
            write_images(df, steps_path)

In [69]:
run_algorithm_for_datasets()

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB
Loading C://Users//proko//Desktop//University//iv//aks//datasets\proteins\proteins.50MB
('TA', {'Counter': 819, 'Diff': -873.8662290760512})
0 10000


('GC', {'Counter': 431, 'Diff': -579.1534863949419})
1 9181


('AT', {'Counter': 425, 'Diff': -574.2953257232366})
2 8750


('CG', {'Counter': 284, 'Diff': -420.0601849436498})
3 8325


('CT', {'Counter': 341, 'Diff': -295.3401374411915})
4 8041


('AG', {'Counter': 361, 'Diff': -298.18887423037086})
5 7700


('GA', {'Counter': 226, 'Diff': -190.41601745917433})
6 7339


('TC', {'Counter': 248, 'Diff': -193.4222293665116})
7 7113


('¢∂∫T', {'Counter': 141, 'Diff': -151.4200575779323})
8 6865


('A¢∂∫', {'Counter': 118, 'Diff': -124.80432680759259})
9 6724


('C¢∂∫', {'Counter': 72, 'Diff': -86.69197509943115})
10 6606


('AÈí¶', {'Counter': 55, 'Diff': -79.41877016116996})
11 6534


('¢∂∫G', {'Counter': 73, 'Diff': -76.50825390650061})
12 6479


('Èí¶T', {

In [None]:
10000 * 0.5

In [None]:
9500 * 1