# Project to AKS

## Reference 

- https://homel.vsb.cz/~vas218/pdf/acs/grammar.pdf
- https://homel.vsb.cz/~vas218/pdf/acs/vasinek-thesis.pdf
- https://homel.vsb.cz/~vas218/acs.html

## Choosen task

- Reduction paradox

## Description

- Try to reduce message and with every step maximaze entropy

## Official description 

- **RePair - maximal anticompression - reduction paradox**
    - Find the smallest possible representation(measured in the number of symbols) of file using the reduction paradox which leads to the largest increase of zero order entropy representation.
    - Heuristics - largest first, random
    - Describe the algorithm and summarize results to a .doc(x) or .pdf report.
    - Prepare a presentation for 10 minutes about your method.
    - Literature: Vasinek, Dissertation thesis (Chapter 5)

## Steps

- Load dataset
- Can pick a subset for faster running (sample it)
- Find every bigram and try to create rule
- Calculate Entropy for new message
- Need to find rule which reduces size of message but increases entropy (harder to compress)
- Find extremes for every file
- Show graphs

In [1]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*1))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
import numpy as np
import pandas as pd
import time
from enum import Enum
import random
import plotly.express as px

In [3]:
TEST_NORMALIZATION_SIZE = 10000

In [4]:
def load_dataset(type, normalization=None, random_pick=False):
    data_dna, path_dna = get_dataset(type)
    if normalization is None:
        return data_dna, path_dna
    if random_pick:
        return "".join(np.random.choice(list(data_dna), normalization)), path_dna
    return data_dna[0:normalization], path_dna 

In [5]:
def get_datasets(normalization=None):
    return [
        #(*get_dataset(DataSets.english, normalization), DataSets.english), 
        (*load_dataset(DataSets.dna, normalization), DataSets.dna),
        #(*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        #(*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]


In [6]:
test_datasets = get_datasets(TEST_NORMALIZATION_SIZE)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [7]:
test_datasets

[('GATCAATGAGGTGGACACCAGAGGCGGGGACTTGTAAATAACACTGGGCTGTAGGAGTGATGGGGTTCACCTCTAATTCTAAGATGGCTAGATAATGCATCTTTCAGGGTTGTGCTTCTATCTAGAAGGTAGAGCTGTGGTCGTTCAATAAAAGTCCTCAAGAGGTTGGTTAATACGCATGTTTAATAGTACAGTATGGTGACTATAGTCAACAATAATTTATTGTACATTTTTAAATAGCTAGAAGAAAAGCATTGGGAAGTTTCCAACATGAAGAAAAGATAAATGGTCAAGGGAATGGATATCCTAATTACCCTGATTTGATCATTATGCATTATATACATGAATCAAAATATCACACATACCTTCAAACTATGTACAAATATTATATACCAATAAAAAATCATCATCATCATCTCCATCATCACCACCCTCCTCCTCATCACCACCAGCATCACCACCATCATCACCACCACCATCATCACCACCACCACTGCCATCATCATCACCACCACTGTGCCATCATCATCACCACCACTGTCATTATCACCACCACCATCATCACCAACACCACTGCCATCGTCATCACCACCACTGTCATTATCACCACCACCATCACCAACATCACCACCACCATTATCACCACCATCAACACCACCACCCCCATCATCATCATCACTACTACCATCATTACCAGCACCACCACCACTATCACCACCACCACCACAATCACCATCACCACTATCATCAACATCATCACTACCACCATCACCAACACCACCATCATTATCACCACCACCACCATCACCAACATCACCACCATCATCATCACCACCATCACCAAGACCATCATCATCACCATCACCACCAACATCACCACCATCACCAACACCACCATCACCACCACCACCACCATCATCACCACCACCACCATCATCATCACCACCACCGCCATCATCATCGCCACCACCATGACCACCACCATCACAACCATCAC

In [8]:
data_test, data_path, data_type = test_datasets[0]

In [9]:
def filter_func(dic):
    items = list(filter(lambda x: x[1] > 1, dic.items()))
    res = dict(items)
    return res

In [10]:
def find_k_grams_freq(data, max_size_k=2, filter_func=filter_func):
    
    kgrams_dic = {}

    for k in range(2, max_size_k+1):

        for i in range(len(data) - k):

            n_gram = data[i:i+k]
            
            kgrams_dic[n_gram] = kgrams_dic.get(n_gram, 0) + 1
    
    return filter_func(kgrams_dic)

In [11]:
find_k_grams_freq(data_test)

{'GA': 586,
 'AT': 716,
 'TC': 616,
 'CA': 968,
 'AA': 879,
 'TG': 705,
 'AG': 702,
 'GG': 558,
 'GT': 475,
 'AC': 640,
 'CC': 751,
 'GC': 468,
 'CG': 122,
 'CT': 634,
 'TT': 674,
 'TA': 504}

In [12]:
import math
from collections import Counter

def calc_freq(content):
    c = Counter(list(content))
    return c

def calc_p(counter, n):
    counter = dict(counter)
    res = {}
    for k, v in counter.items():
        res[k] = v / n  
    return res

def get_n(counter):
    counter = dict(counter)
    return np.sum(list(counter.values()))

def calc_H(p):
    H = 0
    for k, v in p.items():
        #Shannon equation!
        H += p[k] * math.log2(p[k])
    return -H

def calc_entropy_for_message(message):
    counter = calc_freq(message)
    n = get_n(counter)
    p = calc_p(counter, n)
    H = calc_H(p)
    return H

In [13]:
all_chars_uni = tuple(chr(i) for i in range(32, 0x110000) if chr(i).isprintable())
all_chars_ascii = list(range(0, 256))
all_chars_ascii = [chr(ascii_char) for ascii_char in all_chars_ascii]
def find_not_existing_character(current_alpahbet, gen_chars=all_chars_uni):
    can_use = set(gen_chars).difference(set(current_alpahbet))
    ascii_picked_char = random.choice(list(can_use))
    return ascii_picked_char

In [14]:
find_not_existing_character(['A', 'C', 'G', 'T'])

'쎪'

In [15]:
calc_entropy_for_message(data_test)

1.9895772725667296

In [16]:
def diff_entropy(message1, message2):
    message1_entropy = calc_entropy_for_message(message1)
    message2_entropy = calc_entropy_for_message(message2)
    message_1_entropy_size = message1_entropy * len(message1)
    message_2_entropy_size = message2_entropy * len(message2)


    diff = message_1_entropy_size - message_2_entropy_size
    return diff

In [17]:
diff_entropy(data_test, data_test)

0.0

In [18]:
def transform_message(message, ngram_for_replace):
    current_alphabet_size = np.unique(list(message))

    replace_character = find_not_existing_character(current_alphabet_size)
    
    return message.replace(ngram_for_replace, replace_character), replace_character

In [19]:
def calculate_for_ngrams_diff(ngrams, message, method_entropy=diff_entropy, init_message=None):
    res = {}
    
    for k, v in ngrams.items():
        res[k] = {
            "Counter": v,
            "Diff": method_entropy(message, transform_message(message, k)[0])
        }
    return res

In [20]:
r = calculate_for_ngrams_diff(
        find_k_grams_freq(data_test),
        data_test,
        diff_entropy,
        data_test
    )

pd.DataFrame.from_dict(
    r, 
    orient="index"
)

Unnamed: 0,Counter,Diff
GA,586,-686.909129
AT,716,-794.27238
TC,616,-679.714642
CA,968,-488.571041
AA,879,-753.006409
TG,705,-393.334248
AG,702,-586.882909
GG,558,-314.800226
GT,475,-611.867113
AC,640,-831.446036


In [21]:
r

{'GA': {'Counter': 586, 'Diff': -686.9091292880221},
 'AT': {'Counter': 716, 'Diff': -794.2723799380256},
 'TC': {'Counter': 616, 'Diff': -679.7146417421354},
 'CA': {'Counter': 968, 'Diff': -488.5710412647459},
 'AA': {'Counter': 879, 'Diff': -753.0064090554151},
 'TG': {'Counter': 705, 'Diff': -393.33424757568355},
 'AG': {'Counter': 702, 'Diff': -586.882908578893},
 'GG': {'Counter': 558, 'Diff': -314.80022619213923},
 'GT': {'Counter': 475, 'Diff': -611.8671131922092},
 'AC': {'Counter': 640, 'Diff': -831.4460359297373},
 'CC': {'Counter': 751, 'Diff': -415.72576803416814},
 'GC': {'Counter': 468, 'Diff': -607.6780573412434},
 'CG': {'Counter': 122, 'Diff': -421.589100361467},
 'CT': {'Counter': 634, 'Diff': -666.1015642405691},
 'TT': {'Counter': 674, 'Diff': -552.6838623637159},
 'TA': {'Counter': 504, 'Diff': -873.9444092367921}}

In [22]:
r.items()

dict_items([('GA', {'Counter': 586, 'Diff': -686.9091292880221}), ('AT', {'Counter': 716, 'Diff': -794.2723799380256}), ('TC', {'Counter': 616, 'Diff': -679.7146417421354}), ('CA', {'Counter': 968, 'Diff': -488.5710412647459}), ('AA', {'Counter': 879, 'Diff': -753.0064090554151}), ('TG', {'Counter': 705, 'Diff': -393.33424757568355}), ('AG', {'Counter': 702, 'Diff': -586.882908578893}), ('GG', {'Counter': 558, 'Diff': -314.80022619213923}), ('GT', {'Counter': 475, 'Diff': -611.8671131922092}), ('AC', {'Counter': 640, 'Diff': -831.4460359297373}), ('CC', {'Counter': 751, 'Diff': -415.72576803416814}), ('GC', {'Counter': 468, 'Diff': -607.6780573412434}), ('CG', {'Counter': 122, 'Diff': -421.589100361467}), ('CT', {'Counter': 634, 'Diff': -666.1015642405691}), ('TT', {'Counter': 674, 'Diff': -552.6838623637159}), ('TA', {'Counter': 504, 'Diff': -873.9444092367921})])

In [23]:
def pick_largest(items):
    return list(sorted(items, key=lambda x: x[1]['Diff']))[0]

def pick_random(items):
    return random.choice(items)

def pick_only_decreasing(dic, pick_method=pick_largest):
    items = dic.items()

    decreasing_items = list(filter(lambda x: x[1]['Diff'] < 0, items))

    if len(decreasing_items) == 0:
        return None

    return pick_method(decreasing_items)    

In [24]:
pick_only_decreasing(r, pick_largest)

('TA', {'Counter': 504, 'Diff': -873.9444092367921})

# Algorithm implementation - Reduction paradox

In [25]:
class TableFields(Enum):
    Rule = "Rule"
    EntropyMove = "EntropyMove"
    CurrentEntropy = "CurrentEntropy"
    EntropySize = "EntropySize"
    DataType = "DataType"
    DescriptionData = "DescriptionData"
    MessageSize = "MessageSize"
    AlphabetSize = "AlphabetSize"
    GrammaticSize = "GrammaticSize"
    CalcTime = "CalcTime"


In [26]:
def create_value(message_0, message_1, diff, replace_character, n_gram, grammatic, tic, type_data=None, description_data=None):
    new_message_alphabet_size = len(np.unique(list(message_1)))
    new_message_size = len(message_1)
    new_message_entropy = calc_entropy_for_message(message_1)

    new_rule = f"{n_gram} -> {replace_character}"
    grammatic[n_gram] = replace_character

    tac = time.time()
    
    return {
        TableFields.Rule.value: new_rule,
        TableFields.EntropyMove.value: diff,
        TableFields.CurrentEntropy.value: new_message_entropy,
        TableFields.EntropySize.value: new_message_entropy * new_message_size,
        TableFields.DataType.value: type_data,
        TableFields.DescriptionData.value: description_data,
        TableFields.MessageSize.value: new_message_size,
        TableFields.AlphabetSize.value: new_message_alphabet_size,
        TableFields.GrammaticSize.value: len(list(grammatic.keys())),
        TableFields.CalcTime.value: tac - tic,
    }


In [27]:
def algorithm_step(init_message, message, grammatic, type_data=None, description_data=None, tic=None, heuristics_method=pick_largest, optimized=False):
    #Find ngrams
    n_grams = find_k_grams_freq(message)
    
    #optimized = diff_entropy change pls use formula

    diff_table = calculate_for_ngrams_diff(
        n_grams,
        message,
        diff_entropy,
        init_message
    )

    picked = pick_only_decreasing(diff_table, heuristics_method)

    if picked is None:
        return None

    #('AT', {'Counter': 867, 'Diff': -0.2814694848677286})
    n_gram, dic_values = picked
    transformed_message, replace_character = transform_message(message, n_gram)
    
    return transformed_message, create_value(message, transformed_message, dic_values['Diff'], replace_character, n_gram, grammatic, tic, type_data, description_data)


In [28]:
def algorithm(message, type_data=None, description_data=None, limit_step=None, heuristic_method=pick_largest, optimized=False):
    init_message = message

    res = {}
    grammatic = {}
    step = 0
    step_value = create_value(init_message, init_message, 0, "", "", grammatic, time.time(), type_data)

    res[step] = step_value
    step += 1

    while True:
        tic = time.time()
        if limit_step is not None and limit_step == step:
            break
        step_value = algorithm_step(init_message, message, grammatic, type_data, description_data, tic, heuristic_method, optimized)
        if step_value is None:
            break

        else:
            transformed_message, value = step_value
            message = transformed_message

            step += 1
            res[step] = value

    return res

## Testing data

In [29]:
test_result = algorithm(data_test, limit_step=20)

In [30]:
df = pd.DataFrame.from_dict(test_result, orient="index")
df

Unnamed: 0,Rule,EntropyMove,CurrentEntropy,EntropySize,DataType,DescriptionData,MessageSize,AlphabetSize,GrammaticSize,CalcTime
0,->,0.0,1.989577,19895.772726,,,10000,4,1,0.003999
2,TA -> 𥄼,-873.944409,2.187207,20769.717135,,,9496,5,2,0.654996
3,AC -> ằ,-739.921484,2.402775,21509.638619,,,8952,6,3,0.874931
4,GT -> 𣱪,-560.433418,2.576474,22070.072037,,,8566,7,4,0.942739
5,CA -> 㪖,-372.54555,2.796936,22442.617587,,,8024,8,5,1.207027
6,AT -> 𦻉,-317.468805,2.936785,22760.086392,,,7750,9,6,1.581773
7,GC -> 룩,-326.192303,3.117661,23086.278695,,,7405,10,7,1.814869
8,TG -> 𫉠,-266.603119,3.277597,23352.881814,,,7125,11,8,2.198708
9,CG -> 𭷞,-131.089595,3.313669,23483.971409,,,7087,12,9,2.636055
10,ằT -> 𬚇,-106.074175,3.359927,23590.045584,,,7021,13,10,2.88535


In [31]:
fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.1f" % x) for x in df.EntropyMove.values], title='Pohyb entropie')
fig.show()

In [32]:
fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Velikost zprávy')
fig.show()
fig.write_image("test.png")

In [33]:
fig = px.line(df, x=df.index, y=[df.MessageSize, df.EntropySize], title='Velikost zprávy proti aktuální entropii')
fig.show()

In [34]:
fig = px.line(df, x=df.index, y=df.EntropySize, title='H*len(m)', text=[("%.0f" % x) for x in df.EntropySize.values])
fig.update_traces(textposition='top center')
fig.show()

# Real experiment data

In [None]:
def get_datasets(normalization=None):
    return [
        #(*load_dataset(DataSets.english, normalization), DataSets.english),  done
        #(*load_dataset(DataSets.dna, normalization), DataSets.dna),
        #(*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        (*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]

In [None]:
NORM_VALUES = [10000]
CSV_NAME = "steps.csv"
ENTROPY_GRAPH = "entropy_paradox.png"
MESSAGE_GRAPH = "message_size.png"
MESSAGE_SIZE_ENTROPY_GRAPH = "message_entropy_size.png"
MESSAGE_ENTROPY = "message_entropy.png"

In [None]:
def write_images(df, path):
    fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.1f" % x) for x in df.EntropyMove.values], title='Pohyb entropie')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, ENTROPY_GRAPH]))
    
    fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Velikost zprávy')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, MESSAGE_GRAPH]))    

    fig = px.line(df, x=df.index, y=[df.MessageSize, df.EntropySize], title='Velikost zprávy proti aktuální entropii')
    fig.update_traces(textposition='top center')
    fig.write_image(os.path.sep.join([path, MESSAGE_SIZE_ENTROPY_GRAPH]))

    fig = px.line(df, x=df.index, y=df.EntropySize, title='H*len(m)', text=[("%.0f" % x) for x in df.EntropySize.values])
    fig.update_traces(textposition='top center')   
    fig.write_image(os.path.sep.join([path, MESSAGE_ENTROPY]))

In [None]:
def save_dataframe(df, path):
    path = os.path.sep.join([path, CSV_NAME])
    df.to_csv(path, index=False)

In [None]:
def run_algorithm_for_datasets(normalization_values=NORM_VALUES, largest=True, limit_steps=None, optimized=False):
    for n_v in normalization_values:

        datasets = get_datasets(n_v)

        for data, data_path, data_type in datasets:
            data_type_string = data_type.value
            current_path = os.path.sep.join([data_type_string, str(n_v)])

            limit_steps_str = "None" if limit_steps is None else str(limit_steps)
            optimized_string = "optimized" if optimized else "bruto"

            steps_path = os.path.sep.join([current_path, limit_steps_str, str(largest), optimized_string])

            if not os.path.isdir(steps_path):
                os.makedirs(steps_path)

            heurestic_method = pick_largest if largest else pick_random

            res = algorithm(data, data_type_string, "", limit_step=limit_steps, heuristic_method=heurestic_method, optimized=optimized)
            df = pd.DataFrame.from_dict(res, orient="index")


            save_dataframe(df, steps_path)
            write_images(df, steps_path)

In [None]:
LIMIT_STEPS = [None]
LARGEST_METHOD = [True] #largest || random
OPTIMIZED = [False] #not optimized (bruto) || optimized

In [None]:
import itertools

In [None]:
exps = list(itertools.product(LIMIT_STEPS, LARGEST_METHOD, OPTIMIZED))
exps

In [None]:
for limit_steps, largest_method, optimized in exps:
    run_algorithm_for_datasets(NORM_VALUES, largest_method, limit_steps, optimized)