# Project to AKS

## Reference 

- https://homel.vsb.cz/~vas218/pdf/acs/grammar.pdf
- https://homel.vsb.cz/~vas218/pdf/acs/vasinek-thesis.pdf
- https://homel.vsb.cz/~vas218/acs.html

## Choosen task

- Reduction paradox

## Description

- Try to reduce message and with every step maximaze entropy

## Official description 

- **RePair - maximal anticompression - reduction paradox**
    - Find the smallest possible representation(measured in the number of symbols) of file using the reduction paradox which leads to the largest increase of zero order entropy representation.
    - Heuristics - largest first, random
    - Describe the algorithm and summarize results to a .doc(x) or .pdf report.
    - Prepare a presentation for 10 minutes about your method.
    - Literature: Vasinek, Dissertation thesis (Chapter 5)

## Steps

- Load dataset
- Can pick a subset for faster running (sample it)
- Find every bigram and try to create rule
- Calculate Entropy for new message
- Need to find rule which reduces size of message but increases entropy (harder to compress)
- Find extremes for every file
- Show graphs

In [1]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*1))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
from src.get_probs import get_sorted_probs_as_df
import numpy as np
import pandas as pd
import time
from src.save import save_both
from enum import Enum
import re
from ast import literal_eval
import random
import plotly.express as px


In [3]:
TEST_NORMALIZATION_SIZE = 10000

In [4]:
def load_dataset(type, normalization=None):
    data_dna, path_dna = get_dataset(type)
    if normalization is None:
        return data_dna, path_dna
    return "".join(np.random.choice(list(data_dna), normalization)), path_dna

In [5]:
def get_datasets(normalization=None):
    return [
        #(*get_dataset(DataSets.english, normalization), DataSets.english), 
        (*load_dataset(DataSets.dna, normalization), DataSets.dna),
        #(*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        #(*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]


In [6]:
test_datasets = get_datasets(TEST_NORMALIZATION_SIZE)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [7]:
test_datasets

[('AACGTGTTGTACGCTTCCAGGGATTAGAATTTACTAAGCGTCAAATAAGTTAAACATGTTCACACTATGGCGCATTTATGCGTCAACTTATCTTTATTTCATGATAAAACTTGTTTCGCAGTCAACTAATATGATACTAATCCGTAACATGTTGGGAGAGTCTTTGCATCAGGGAGCTGCTCCGTTCAAATGTAGTGTACATGAGCTTGGCAGATCGGAGGCCCGTAAGGGTTAATCTTGACTAGGAATCAAAGTTTCTTTTTGAAGGATTGGCTCGATATTGTTGTGTAATACCCGTACCGGTTCAGGACGAGACCCGTGGAAAGTTACAGAGGTAAAACGATGTTTCTGGCAGTGAGGTCCACACAGGGCAACGTACTTCATAGGGAGTAGTAATTTCCATCTAAGGTGCTAAAGCAGCCCCTTAATCACGTTCGACGGGATACAATTTCGTCAACCTGTTAAAACATGCTGCGTAAGGACGTGTAAAGGGCAGATACACCTTGTGGTTGATGTCTCGGAAATATTAATACCAATTTGGCAATCCAGATCTATTTGTAGAGATGATGGGAGTTGACACAATTATGTTACCACTCACTGGAGCATTTAAATCTCAAGTCTTTTAGGAGCGGCCATGTATGTGAACAAGAAAGATCGTTGACTATCGTATGCGGCCCGTCATCAATGGTAGTTTAGATCGCATCTTGTAAGTTGATTTAACTCCAGATGGCCCTAACTGCCAATAGTCTTATCATAGTTTAACTGGCAGACCTCCCCCTGCTTTAGCTAAACGATAACAACTGCCCTCGGAAACTATGTAGAAAGAGGCCTGTTTATCACGACTATGATGATCCTATATCTATGGAGCAGTTAGGTGTGGCTCGGGCTCAAAATCAGCTGCCTAATAGCTGCTCAGTTGTCGGTAGACTAGTTTCAACTCCGATCAACATCATATACTGGGCCAAGTTATCAAAATGACCCTACACCTATTTCCATT

In [8]:
data_test, data_path, data_type = test_datasets[0]

In [9]:
def find_k_grams_freq(data, max_size_k=2):
    
    kgrams_dic = {}

    for k in range(2, max_size_k+1):

        for i in range(len(data) - k):

            n_gram = data[i:i+k]
            
            kgrams_dic[n_gram] = kgrams_dic.get(n_gram, 0) + 1

    return kgrams_dic

In [10]:
find_k_grams_freq(data_test)

{'AA': 862,
 'AC': 612,
 'CG': 397,
 'GT': 585,
 'TG': 609,
 'TT': 797,
 'TA': 856,
 'GC': 457,
 'CT': 648,
 'TC': 616,
 'CC': 476,
 'CA': 640,
 'AG': 604,
 'GG': 424,
 'GA': 567,
 'AT': 848}

In [11]:
import math
from collections import Counter

def calc_freq(content):
    c = Counter(list(content))
    return c

def calc_p(counter, n):
    counter = dict(counter)
    res = {}
    for k, v in counter.items():
        res[k] = v / n  
    return res

def get_n(counter):
    counter = dict(counter)
    return np.sum(list(counter.values()))

def calc_H(p):
    H = 0
    for k, v in p.items():
        #Shannon equation!
        H += p[k] * math.log2(p[k])
    return -H

def calc_entropy_for_message(message):
    counter = calc_freq(message)
    n = get_n(counter)
    p = calc_p(counter, n)
    H = calc_H(p)
    return H

In [12]:
calc_entropy_for_message(data_test)

1.9809154856636604

In [13]:
def diff_entropy(message1, message2):
    message1_entropy = calc_entropy_for_message(message1)
    message2_entropy = calc_entropy_for_message(message2)
    message_1_entropy_size = message1_entropy * len(message1)
    message_2_entropy_size = message2_entropy * len(message2)


    diff = message_1_entropy_size - message_2_entropy_size
    print(f'{round(message1_entropy, 2)} * {len(message1)} < {round(message2_entropy, 2)} * {len(message2)} ... {diff}')
    return diff

In [14]:
diff_entropy(data_test, data_test)

1.98 * 10000 < 1.98 * 10000 ... 0.0


0.0

In [15]:
all_chars_uni = tuple(chr(i) for i in range(32, 0x110000) if chr(i).isprintable())

In [16]:
all_chars_ascii = list(range(0, 256))
all_chars_ascii = [chr(ascii_char) for ascii_char in all_chars_ascii]

In [17]:
def find_not_existing_character(current_alpahbet, gen_chars=all_chars_uni):
    can_use = set(gen_chars).difference(set(current_alpahbet))
    ascii_picked_char = random.choice(list(can_use))
    return ascii_picked_char

In [18]:
find_not_existing_character(['A', 'C', 'G', 'T'])

'꺅'

In [19]:
def transform_message(message, ngram_for_replace):
    current_alphabet_size = np.unique(list(message))

    #print('Alphabet size', len(current_alphabet_size))

    replace_character = find_not_existing_character(current_alphabet_size)
    
    return message.replace(ngram_for_replace, replace_character), replace_character

In [20]:
def calculate_for_ngrams_diff(ngrams, message, method_entropy=diff_entropy, init_message=None):
    res = {}
    
    for k, v in ngrams.items():
        res[k] = {
            "Counter": v,
            #message - current
            #message - next message
            #"Diff": method_entropy(init_message, transform_message(message, k)[0])
            "Diff": method_entropy(message, transform_message(message, k)[0])
        }
    return res

In [21]:
r = calculate_for_ngrams_diff(
        find_k_grams_freq(data_test),
        data_test,
        diff_entropy,
        data_test
    )

pd.DataFrame.from_dict(
    r, 
    orient="index"
)

1.98 * 10000 < 2.2 * 9335 ... -711.28786290059
1.98 * 10000 < 2.18 * 9388 ... -700.9447049299597
1.98 * 10000 < 2.12 * 9603 ... -526.4840876851231
1.98 * 10000 < 2.17 * 9414 ... -642.013983240744
1.98 * 10000 < 2.18 * 9391 ... -624.1327148802011
1.98 * 10000 < 2.19 * 9380 ... -725.6473704782875
1.98 * 10000 < 2.26 * 9144 ... -863.1005739158027
1.98 * 10000 < 2.13 * 9543 ... -493.3778858426231
1.98 * 10000 < 2.19 * 9352 ... -656.895096152235
1.98 * 10000 < 2.18 * 9384 ... -682.0161422352794
1.98 * 10000 < 2.11 * 9619 ... -450.4012919118868
1.98 * 10000 < 2.19 * 9360 ... -680.4225267693218
1.98 * 10000 < 2.18 * 9396 ... -644.0158564472586
1.98 * 10000 < 2.09 * 9647 ... -397.45748738656766
1.98 * 10000 < 2.17 * 9433 ... -669.9551612689065
1.98 * 10000 < 2.26 * 9152 ... -870.2255035416347


Unnamed: 0,Counter,Diff
AA,862,-711.287863
AC,612,-700.944705
CG,397,-526.484088
GT,585,-642.013983
TG,609,-624.132715
TT,797,-725.64737
TA,856,-863.100574
GC,457,-493.377886
CT,648,-656.895096
TC,616,-682.016142


In [22]:
r

{'AA': {'Counter': 862, 'Diff': -711.28786290059},
 'AC': {'Counter': 612, 'Diff': -700.9447049299597},
 'CG': {'Counter': 397, 'Diff': -526.4840876851231},
 'GT': {'Counter': 585, 'Diff': -642.013983240744},
 'TG': {'Counter': 609, 'Diff': -624.1327148802011},
 'TT': {'Counter': 797, 'Diff': -725.6473704782875},
 'TA': {'Counter': 856, 'Diff': -863.1005739158027},
 'GC': {'Counter': 457, 'Diff': -493.3778858426231},
 'CT': {'Counter': 648, 'Diff': -656.895096152235},
 'TC': {'Counter': 616, 'Diff': -682.0161422352794},
 'CC': {'Counter': 476, 'Diff': -450.4012919118868},
 'CA': {'Counter': 640, 'Diff': -680.4225267693218},
 'AG': {'Counter': 604, 'Diff': -644.0158564472586},
 'GG': {'Counter': 424, 'Diff': -397.45748738656766},
 'GA': {'Counter': 567, 'Diff': -669.9551612689065},
 'AT': {'Counter': 848, 'Diff': -870.2255035416347}}

In [23]:
r.items()

dict_items([('AA', {'Counter': 862, 'Diff': -711.28786290059}), ('AC', {'Counter': 612, 'Diff': -700.9447049299597}), ('CG', {'Counter': 397, 'Diff': -526.4840876851231}), ('GT', {'Counter': 585, 'Diff': -642.013983240744}), ('TG', {'Counter': 609, 'Diff': -624.1327148802011}), ('TT', {'Counter': 797, 'Diff': -725.6473704782875}), ('TA', {'Counter': 856, 'Diff': -863.1005739158027}), ('GC', {'Counter': 457, 'Diff': -493.3778858426231}), ('CT', {'Counter': 648, 'Diff': -656.895096152235}), ('TC', {'Counter': 616, 'Diff': -682.0161422352794}), ('CC', {'Counter': 476, 'Diff': -450.4012919118868}), ('CA', {'Counter': 640, 'Diff': -680.4225267693218}), ('AG', {'Counter': 604, 'Diff': -644.0158564472586}), ('GG', {'Counter': 424, 'Diff': -397.45748738656766}), ('GA', {'Counter': 567, 'Diff': -669.9551612689065}), ('AT', {'Counter': 848, 'Diff': -870.2255035416347})])

In [24]:
def pick_largest(items):
    return list(sorted(items, key=lambda x: x[1]['Diff']))[0]

def pick_random(items):
    return random.choice(items)

def pick_only_decreasing(dic, pick_method=pick_largest):
    items = dic.items()

    decreasing_items = list(filter(lambda x: x[1]['Diff'] < 0, items))

    if len(decreasing_items) == 0:
        return None

    return pick_method(decreasing_items)    

In [25]:
pick_only_decreasing(r, pick_largest)

('AT', {'Counter': 848, 'Diff': -870.2255035416347})

# Algorithm implementation - Reduction paradox

In [26]:
class TableFields(Enum):
    Rule = "Rule"
    EntropyMove = "EntropyMove"
    CurrentEntropy = "CurrentEntropy"
    EntropySize = "EntropySize"
    DataType = "DataType"
    DescriptionData = "DescriptionData"
    MessageSize = "MessageSize"
    AlphabetSize = "AlphabetSize"
    GrammaticSize = "GrammaticSize"
    CalcTime = "CalcTime"


In [27]:
def create_value(message_0, message_1, diff, replace_character, n_gram, grammatic, tic, type_data=None, description_data=None):
    new_message_alphabet_size = len(np.unique(list(message_1)))
    new_message_size = len(message_1)
    new_message_entropy = calc_entropy_for_message(message_1)

    new_rule = f"{n_gram} -> {replace_character}"
    grammatic[n_gram] = replace_character

    tac = time.time()
    
    return {
        TableFields.Rule.value: new_rule,
        TableFields.EntropyMove.value: diff,
        TableFields.CurrentEntropy.value: new_message_entropy,
        TableFields.EntropySize.value: new_message_entropy * new_message_size,
        TableFields.DataType.value: type_data,
        TableFields.DescriptionData.value: description_data,
        TableFields.MessageSize.value: new_message_size,
        TableFields.AlphabetSize.value: new_message_alphabet_size,
        TableFields.GrammaticSize.value: len(list(grammatic.keys())),
        TableFields.CalcTime.value: tac - tic,
    }


In [28]:
def algorithm_step(init_message, message, grammatic, type_data=None, description_data=None, tic=None, heuristics_method=pick_largest):
    #Find ngrams
    n_grams = find_k_grams_freq(message)
    

    diff_table = calculate_for_ngrams_diff(
        n_grams,
        message,
        diff_entropy,
        init_message
    )

    picked = pick_only_decreasing(diff_table, heuristics_method)
    print(picked)

    if picked is None:
        return None

    #('AT', {'Counter': 867, 'Diff': -0.2814694848677286})
    n_gram, dic_values = picked
    transformed_message, replace_character = transform_message(message, n_gram)
    
    return transformed_message, create_value(message, transformed_message, dic_values['Diff'], replace_character, n_gram, grammatic, tic, type_data, description_data)


In [29]:
def algorithm(message, type_data=None, description_data=None, limit_step=None, heuristic_method=pick_largest):
    init_message = message

    res = {}
    grammatic = {}

    step = 0
    while True:
        tic = time.time()
        if limit_step is not None and limit_step == step:
            break
        step_value = algorithm_step(init_message, message, grammatic, type_data, description_data, tic, heuristic_method)
        print(step, len(message))
        print('\n')
        if step_value is None:
            break

        else:
            transformed_message, value = step_value
            message = transformed_message

            step += 1
            res[step] = value

    return res

## Testing data

In [30]:
test_result = algorithm(data_test, limit_step=20)

1.98 * 10000 < 2.2 * 9335 ... -711.28786290059
1.98 * 10000 < 2.18 * 9388 ... -700.9447049299597
1.98 * 10000 < 2.12 * 9603 ... -526.4840876851231
1.98 * 10000 < 2.17 * 9414 ... -642.013983240744
1.98 * 10000 < 2.18 * 9391 ... -624.1327148802011
1.98 * 10000 < 2.19 * 9380 ... -725.6473704782875
1.98 * 10000 < 2.26 * 9144 ... -863.1005739158027
1.98 * 10000 < 2.13 * 9543 ... -493.3778858426231
1.98 * 10000 < 2.19 * 9352 ... -656.895096152235
1.98 * 10000 < 2.18 * 9384 ... -682.0161422352794
1.98 * 10000 < 2.11 * 9619 ... -450.4012919118868
1.98 * 10000 < 2.19 * 9360 ... -680.4225267693218
1.98 * 10000 < 2.18 * 9396 ... -644.0158564472586
1.98 * 10000 < 2.09 * 9647 ... -397.45748738656766
1.98 * 10000 < 2.17 * 9433 ... -669.9551612689065
1.98 * 10000 < 2.26 * 9152 ... -870.2255035416347
('AT', {'Counter': 848, 'Diff': -870.2255035416347})
0 10000


2.26 * 9152 < 2.42 * 8681 ... -329.18421923216374
2.26 * 9152 < 2.47 * 8540 ... -434.1034245904775
2.26 * 9152 < 2.43 * 8755 ... -578.3205242

In [31]:
df = pd.DataFrame.from_dict(test_result, orient="index")
df

Unnamed: 0,Rule,EntropyMove,CurrentEntropy,EntropySize,DataType,DescriptionData,MessageSize,AlphabetSize,GrammaticSize,CalcTime
1,AT -> 𠖛,-870.225504,2.259548,20679.38036,,,9152,5,1,0.430863
2,CG -> 彖,-578.320524,2.428064,21257.700884,,,8755,6,2,0.605959
3,TA -> ꮽ,-570.099903,2.619441,21827.800787,,,8333,7,3,0.862512
4,GC -> 塹,-421.072625,2.770374,22248.873412,,,8031,8,4,1.050873
5,TC -> ꥉ,-314.249904,2.931799,22563.123316,,,7696,9,5,1.369953
6,GA -> Ⓗ,-295.128136,3.107852,22858.251452,,,7355,10,6,1.812332
7,A𠖛 -> 𡄁,-187.721638,3.198608,23045.973091,,,7205,11,7,2.046878
8,CT -> ̉,-189.736278,3.347603,23235.709368,,,6941,12,8,2.492519
9,AG -> 𤒞,-151.699203,3.485975,23387.408571,,,6709,13,9,3.123317
10,𠖛T -> 𖨧,-128.59157,3.565732,23516.000141,,,6595,14,10,3.246259


In [32]:
fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.1f" % x) for x in df.EntropyMove.values], title='Pohyb entropie')
fig.show()

In [33]:
fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Velikost zprávy')
fig.show()
fig.write_image("test.png")

In [34]:
fig = px.line(df, x=df.index, y=[df.MessageSize, df.EntropySize], title='Velikost zprávy proti aktuální entropii')
fig.show()

In [35]:
fig = px.line(df, x=df.index, y=df.EntropySize, title='H*len(m)', text=[("%.0f" % x) for x in df.EntropySize.values])
fig.update_traces(textposition='top center')
fig.show()

# Real experiment data

In [36]:
def get_datasets(normalization=None):
    return [
        (*load_dataset(DataSets.english, normalization), DataSets.english), 
        (*load_dataset(DataSets.dna, normalization), DataSets.dna),
        (*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        (*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]

In [37]:
NORM_VALUES = [10000, 20000]
CSV_NAME = "steps.csv"
ENTROPY_GRAPH = "entropy_paradox.png"
MESSAGE_GRAPH = "message_size.png"
MESSAGE_SIZE_ENTROPY_GRAPH = "message_entropy_size.png"
MESSAGE_ENTROPY = "message_entropy.png"

In [38]:
def write_images(df, path):
    fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.1f" % x) for x in df.EntropyMove.values], title='Pohyb entropie')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, ENTROPY_GRAPH]))
    
    fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Velikost zprávy')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, MESSAGE_GRAPH]))    

    fig = px.line(df, x=df.index, y=[df.MessageSize, df.EntropySize], title='Velikost zprávy proti aktuální entropii')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, MESSAGE_SIZE_ENTROPY_GRAPH]))

    fig = px.line(df, x=df.index, y=df.EntropySize, title='H*len(m)', text=[("%.0f" % x) for x in df.EntropySize.values])
    fig.update_traces(textposition='top center')   
    fig.write_image(os.path.sep.join([path, MESSAGE_ENTROPY]))

In [39]:
def save_dataframe(df, path):
    path = os.path.sep.join([path, CSV_NAME])
    df.to_csv(path, index=False)

In [44]:
def run_algorithm_for_datasets(normalization_values=NORM_VALUES, largest=True, limit_steps=None):
    for n_v in normalization_values:

        datasets = get_datasets(n_v)

        for data, data_path, data_type in datasets:
            data_type_string = data_type.value
            current_path = os.path.sep.join([data_type_string, str(n_v)])


            limit_steps_str = "None" if limit_steps is None else str(limit_steps)
            steps_path = os.path.sep.join([current_path, limit_steps_str, str(largest)])

            if not os.path.isdir(steps_path):
                os.makedirs(steps_path)

            heurestic_method = pick_largest if largest else pick_random

            res = algorithm(data, data_type_string, "", limit_step=limit_steps, heuristic_method=heurestic_method)
            df = pd.DataFrame.from_dict(res, orient="index")




            save_dataframe(df, steps_path)
            write_images(df, steps_path)

In [45]:
LIMIT_STEPS = [15]
LARGEST_METHOD = False #largest || random

In [46]:
for limit_steps in LIMIT_STEPS:
    run_algorithm_for_datasets(NORM_VALUES, LARGEST_METHOD, limit_steps)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\english\english.50MB
Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB
Loading C://Users//proko//Desktop//University//iv//aks//datasets\proteins\proteins.50MB
Loading C://Users//proko//Desktop//University//iv//aks//datasets\sources\sources.50MB
4.51 * 10000 < 4.53 * 9969 ... -44.61200524452579
4.51 * 10000 < 4.54 * 9946 ... -65.21820166247198
4.51 * 10000 < 4.53 * 9968 ... -58.28092556304182
4.51 * 10000 < 4.53 * 9959 ... -59.12748491251841
4.51 * 10000 < 4.56 * 9911 ... -116.45954069228173
4.51 * 10000 < 4.54 * 9946 ... -66.03313463924133
4.51 * 10000 < 4.51 * 9992 ... -8.315584094569203
4.51 * 10000 < 4.51 * 9999 ... -2.3274170734366635
4.51 * 10000 < 4.52 * 9991 ... -8.520840506760578
4.51 * 10000 < 4.51 * 9992 ... -6.721923205019266
4.51 * 10000 < 4.51 * 9999 ... 0.47044722042483045
4.51 * 10000 < 4.51 * 9998 ... -1.3575099045556271
4.51 * 10000 < 4.57 * 9904 ... -122.53308766170812
4.51 * 