# Project to AKS

## Reference 

- https://homel.vsb.cz/~vas218/pdf/acs/grammar.pdf
- https://homel.vsb.cz/~vas218/pdf/acs/vasinek-thesis.pdf
- https://homel.vsb.cz/~vas218/acs.html

## Chosen task

- Reduction paradox

## Description

- Try to reduce the message while maximizing its entropy at every step

## Official description 

- **RePair - maximal anticompression - reduction paradox**
    - Find the smallest possible representation (measured in the number of symbols) of a file using the reduction paradox which leads to the largest increase of the zero-order entropy of the representation.
    - Heuristics - largest first, random
    - Describe the algorithm and summarize results to a .doc(x) or .pdf report.
    - Prepare a presentation for 10 minutes about your method.
    - Literature: Vasinek, Dissertation thesis (Chapter 5)

## Steps

- Load dataset
- Can pick a subset for faster running (sample it)
- Find every bigram and try to create a rule for it
- Calculate Entropy for new message
- Need to find a rule which reduces the size of the message but increases its entropy (harder to compress)
- Find extremes for every file
- Show graphs

In [1]:
import os
import sys

def adding_module_path():
    """Append the absolute path of the parent directory to ``sys.path``.

    The path is added at most once: nothing happens when it is already
    listed. Presumably this lets the ``src`` package imported further down
    resolve when the notebook runs from a sub-directory.
    """
    parent_dir = os.path.abspath("..")
    if parent_dir in sys.path:
        return
    sys.path.append(parent_dir)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
from src.get_probs import get_sorted_probs_as_df
import numpy as np
import pandas as pd
import time
from src.save import save_both
from enum import Enum
import re
from ast import literal_eval
import random
import plotly.express as px


In [3]:
# Number of symbols sampled from each corpus for the quick test run.
TEST_NORMALIZATION_SIZE = 10000

In [4]:
def load_dataset(type, normalization=None):
    """Load one corpus and optionally shrink it to a fixed number of symbols.

    Args:
        type: Dataset selector forwarded unchanged to ``get_dataset``
            (the parameter name shadows the builtin ``type``; kept for
            backward compatibility).
        normalization: ``None`` to return the corpus as loaded, otherwise
            the number of symbols to sample.

    Returns:
        A ``(data, path)`` pair; ``path`` is whatever ``get_dataset`` returned.
    """
    content, source_path = get_dataset(type)
    if normalization is not None:
        # NOTE: np.random.choice draws each symbol independently (with
        # replacement by default), so the sample is not a contiguous slice
        # of the corpus and the original symbol order is not preserved.
        sampled_symbols = np.random.choice(list(content), normalization)
        content = "".join(sampled_symbols)
    return content, source_path

In [5]:
def get_datasets(normalization=None):
    """Return the corpora used for the quick test as ``(data, path, type)`` tuples.

    Args:
        normalization: Sample size forwarded to ``load_dataset``; ``None``
            keeps each corpus as loaded.
    """
    # Only the DNA corpus is enabled here; english, proteins and sources
    # are switched off.
    enabled_types = (DataSets.dna,)
    return [
        (*load_dataset(dataset_type, normalization), dataset_type)
        for dataset_type in enabled_types
    ]


In [6]:
# Load the enabled corpora, each sampled down to TEST_NORMALIZATION_SIZE symbols.
test_datasets = get_datasets(TEST_NORMALIZATION_SIZE)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB


In [7]:
test_datasets

[('CTATTAGAAGCATTGTACCTGCAATTGAAGTAAGGAATCATTTCTGCTCGATAAGTTCTTAAGATATGACAAGAGAAGGCCTGGTAACATTAGGTTCGGAGCTTAGTAATAATCATGATCCCGCTGCTGTGCTCATCGGCCCGATAAAATGGTGTGTACTGCCCGAGGCACTAGGGCGTATAAAACAGGTAATCTTAATTAACCCGGGTTATATATAGCTCAAGTAGACTCTTCTCGGACGTATTGTATCACTTACCTCCAAAATACAAATGCAACATAGACAGTAGTAGGACGGCGGTGTGGCTTGGGACCACCGCATGTGGAAAAATTCAGGTTATTAATCTCGAGCTCATGTGTCCTACCTAAGCGTACCGTTCTTAAAGTGATGTCAGTGCGATAATGGAATGATATCGTAATCCGTTGCAGAATTCTATCTAGCGCGGGTGTCAGAAGAATCATGGAGCTGTGGCTAAAGTGTATGGCATATTTATTAGTGCGAAGGTCATTAGATATTTACAGATGAATGTAGAACTAATGATCAACATATAAGACGGTCGATAACCTCTCTGAATAAGAGCCCTAGCATTTTTCATCACCGTCCAACATTACGAGACAAGTCCAGTCCTCATAGATACGTGTAACAATATGATATGAAATACCTTTGGATGGTTCAAGAAGGTACTCCATCAAACTATTTATCCTCCTCGAGGCAGTAACGGACCGATTGCTAGGTAACCAGCGTCAAAAAACGCCAGTTTTCGATAACCAGCAATCCATGTCAATGGTCAGAAGTATGTTCATCTTGGAATGTACTACAGACGAAATGATCCAAGGATCTGGCCCGTAGACAATGCCCAGTCAAGCTGCTCTAGGAAAGAATGTTTAGAAAGAAATGTGACTTATTTGTCTTTTATTCCTAATATAATAGTTTGTAGCTATACTAGTAAGTGAATGGCAACTGCTTTACAAACAACCCTCGTCAAACCAATGATAACTA

In [8]:
# Unpack the first (data, path, type) tuple; data_test is the message used in the cells below.
data_test, data_path, data_type = test_datasets[0]

In [9]:
def find_k_grams_freq(data, max_size_k=2):
    """Count every k-gram of ``data`` for k = 2 .. ``max_size_k``.

    Windows overlap: each start position contributes one k-gram, so
    ``"AAA"`` contains the bigram ``"AA"`` twice.

    Args:
        data: Sliceable sequence (normally a string) to scan.
        max_size_k: Largest window width to count; widths start at 2.

    Returns:
        A dict mapping each k-gram to its number of occurrences, in order
        of first appearance. Empty when ``data`` is shorter than 2.
    """
    kgrams_dic = {}

    for k in range(2, max_size_k + 1):
        # A sequence of length n holds n - k + 1 windows of width k.
        # Stopping at n - k silently dropped the final window.
        for start in range(len(data) - k + 1):
            n_gram = data[start:start + k]
            kgrams_dic[n_gram] = kgrams_dic.get(n_gram, 0) + 1

    return kgrams_dic

In [12]:
find_k_grams_freq(data_test)

{'CT': 597,
 'TA': 846,
 'AT': 842,
 'TT': 764,
 'AG': 650,
 'GA': 609,
 'AA': 837,
 'GC': 459,
 'CA': 646,
 'TG': 600,
 'GT': 613,
 'AC': 608,
 'CC': 455,
 'GG': 432,
 'TC': 607,
 'CG': 431,
 'CN': 1,
 'NT': 1}

In [13]:
import math
from collections import Counter

def calc_freq(content):
    """Return a ``Counter`` with the number of occurrences of each element of ``content``."""
    return Counter(symbol for symbol in content)

def calc_p(counter, n):
    """Turn occurrence counts into relative frequencies.

    Args:
        counter: Mapping (or anything ``dict()`` accepts) of symbol -> count.
        n: Divisor applied to every count, normally the total count.

    Returns:
        A new dict mapping each symbol to ``count / n``.
    """
    return {symbol: count / n for symbol, count in dict(counter).items()}

def get_n(counter):
    """Return the sum of all counts in ``counter`` (computed with ``np.sum``)."""
    counts = list(dict(counter).values())
    return np.sum(counts)

def calc_H(p):
    """Compute the Shannon entropy, in bits, of a probability table.

    Args:
        p: Mapping of symbol -> probability.

    Returns:
        ``-sum(p_i * log2(p_i))`` over all entries; ``0`` for an empty table.

    Raises:
        ValueError: If a probability is negative (from ``math.log2``).
    """
    H = 0
    for prob in p.values():
        # By convention 0 * log2(0) contributes nothing; evaluating it
        # would raise a math domain error.
        if prob == 0:
            continue
        # Shannon equation
        H += prob * math.log2(prob)
    return -H

def calc_entropy_for_message(message):
    """Return the zero-order entropy of ``message``.

    Symbol frequencies are counted, normalized by their total, and fed
    to ``calc_H``.
    """
    frequencies = calc_freq(message)
    probabilities = calc_p(frequencies, get_n(frequencies))
    return calc_H(probabilities)

In [14]:
calc_entropy_for_message(data_test)

1.9845498475854497

In [15]:
def diff_entropy(message1, message2):
    """Return ``entropy(message1) - entropy(message2)``.

    The result is negative when ``message2`` has the higher entropy.
    """
    return calc_entropy_for_message(message1) - calc_entropy_for_message(message2)

In [16]:
diff_entropy(data_test, data_test)

0.0

In [77]:
# Every printable character from U+0020 up to the end of the Unicode range;
# used as the pool of candidate replacement symbols.
all_chars_uni = tuple(
    symbol for symbol in map(chr, range(32, 0x110000)) if symbol.isprintable()
)

In [78]:
# Characters for code points 0..255 (despite the name this is the full
# 8-bit range, not only 7-bit ASCII).
all_chars_ascii = [chr(code_point) for code_point in range(0, 256)]

In [81]:
def find_not_existing_character(current_alpahbet, gen_chars=all_chars_uni):
    """Pick a random character from ``gen_chars`` that is not in the alphabet.

    Args:
        current_alpahbet: Iterable of symbols that are already in use.
        gen_chars: Pool of candidate characters.

    Returns:
        One character chosen uniformly at random from the unused candidates.

    Raises:
        IndexError: If every candidate is already in use (``random.choice``
            on an empty list).
    """
    # NOTE: both sets are rebuilt on every call.
    unused_symbols = set(gen_chars) - set(current_alpahbet)
    return random.choice(list(unused_symbols))

In [82]:
find_not_existing_character(['A', 'C', 'G', 'T'])

'愒'

In [87]:
def transform_message(message, ngram_for_replace):
    """Replace an n-gram in ``message`` with a fresh single symbol.

    Args:
        message: String to rewrite.
        ngram_for_replace: Substring whose occurrences are substituted
            (``str.replace``: non-overlapping, left to right).

    Returns:
        ``(new_message, replace_character)``, where ``replace_character``
        is a randomly chosen symbol that did not occur in ``message``.
    """
    alphabet = np.unique(list(message))
    fresh_symbol = find_not_existing_character(alphabet)
    rewritten = message.replace(ngram_for_replace, fresh_symbol)
    return rewritten, fresh_symbol

In [88]:
def calculate_for_ngrams_diff(ngrams, message, method_entropy=diff_entropy, init_message=None):
    """Score every candidate n-gram by the entropy change its replacement causes.

    Args:
        ngrams: Mapping of n-gram -> occurrence count.
        message: Current message in which each n-gram is tentatively replaced.
        method_entropy: Callable ``(reference, candidate) -> float``.
        init_message: Reference message handed to ``method_entropy``.
            Defaults to ``message``; previously ``None`` was passed through
            and made the entropy computation fail.

    Returns:
        A dict mapping each n-gram to ``{"Counter": count, "Diff": score}``.
        With ``diff_entropy`` a negative ``Diff`` means the rewritten message
        has a higher entropy than the reference.
    """
    if init_message is None:
        init_message = message

    res = {}
    for n_gram, count in ngrams.items():
        # Only the rewritten message matters here; the random replacement
        # symbol is discarded.
        candidate_message, _ = transform_message(message, n_gram)
        res[n_gram] = {
            "Counter": count,
            "Diff": method_entropy(init_message, candidate_message),
        }
    return res

In [89]:
# Score every bigram of the test message against the test message itself.
r = calculate_for_ngrams_diff(
        find_k_grams_freq(data_test),
        data_test,
        diff_entropy,
        data_test
    )

# Show the scores as a table with one row per bigram.
pd.DataFrame.from_dict(
    r, 
    orient="index"
)

Unnamed: 0,Counter,Diff
CT,597,-0.196186
TA,846,-0.275865
AT,842,-0.275273
TT,764,-0.200684
AG,650,-0.207655
GA,609,-0.201508
AA,837,-0.217441
GC,459,-0.148935
CA,646,-0.20809
TG,600,-0.195732


In [90]:
r

{'CT': {'Counter': 597, 'Diff': -0.19618629360066064},
 'TA': {'Counter': 846, 'Diff': -0.2758653005273093},
 'AT': {'Counter': 842, 'Diff': -0.2752725789397319},
 'TT': {'Counter': 764, 'Diff': -0.20068351128492745},
 'AG': {'Counter': 650, 'Diff': -0.2076548720955249},
 'GA': {'Counter': 609, 'Diff': -0.2015079176600456},
 'AA': {'Counter': 837, 'Diff': -0.21744114972683803},
 'GC': {'Counter': 459, 'Diff': -0.14893465699458508},
 'CA': {'Counter': 646, 'Diff': -0.20809006579600142},
 'TG': {'Counter': 600, 'Diff': -0.19573218696993777},
 'GT': {'Counter': 613, 'Diff': -0.19767884143496062},
 'AC': {'Counter': 608, 'Diff': -0.20244331360586454},
 'CC': {'Counter': 455, 'Diff': -0.1228539744510142},
 'GG': {'Counter': 432, 'Diff': -0.12082447445283462},
 'TC': {'Counter': 607, 'Diff': -0.19771983447771557},
 'CG': {'Counter': 431, 'Diff': -0.1444363972328082},
 'CN': {'Counter': 1, 'Diff': 2.461386857421921e-05},
 'NT': {'Counter': 1, 'Diff': -1.5661301515423887e-05}}

In [91]:
r.items()

dict_items([('CT', {'Counter': 597, 'Diff': -0.19618629360066064}), ('TA', {'Counter': 846, 'Diff': -0.2758653005273093}), ('AT', {'Counter': 842, 'Diff': -0.2752725789397319}), ('TT', {'Counter': 764, 'Diff': -0.20068351128492745}), ('AG', {'Counter': 650, 'Diff': -0.2076548720955249}), ('GA', {'Counter': 609, 'Diff': -0.2015079176600456}), ('AA', {'Counter': 837, 'Diff': -0.21744114972683803}), ('GC', {'Counter': 459, 'Diff': -0.14893465699458508}), ('CA', {'Counter': 646, 'Diff': -0.20809006579600142}), ('TG', {'Counter': 600, 'Diff': -0.19573218696993777}), ('GT', {'Counter': 613, 'Diff': -0.19767884143496062}), ('AC', {'Counter': 608, 'Diff': -0.20244331360586454}), ('CC', {'Counter': 455, 'Diff': -0.1228539744510142}), ('GG', {'Counter': 432, 'Diff': -0.12082447445283462}), ('TC', {'Counter': 607, 'Diff': -0.19771983447771557}), ('CG', {'Counter': 431, 'Diff': -0.1444363972328082}), ('CN', {'Counter': 1, 'Diff': 2.461386857421921e-05}), ('NT', {'Counter': 1, 'Diff': -1.5661301515

In [92]:
def pick_largest(items):
    """Return the ``(n_gram, values)`` entry with the smallest ``'Diff'``.

    With the negative scores used in this notebook, the smallest ``Diff``
    is the largest entropy increase. Ties keep the earliest entry.
    """
    ranked = sorted(items, key=lambda entry: entry[1]['Diff'])
    return ranked[0]

def pick_random(items):
    """Return one element of the sequence ``items`` chosen by ``random.choice``."""
    chosen = random.choice(items)
    return chosen

def pick_only_decreasing(dic, pick_method=pick_largest):
    """Choose one candidate among those with a negative ``'Diff'``.

    Args:
        dic: Mapping of n-gram -> dict containing a ``'Diff'`` score.
        pick_method: Callable that receives the list of qualifying
            ``(n_gram, values)`` pairs and returns one of them.

    Returns:
        The pair selected by ``pick_method``, or ``None`` when no entry
        has ``Diff < 0``.
    """
    candidates = [entry for entry in dic.items() if entry[1]['Diff'] < 0]
    if not candidates:
        return None
    return pick_method(candidates)

In [93]:
pick_only_decreasing(r, pick_largest)

('TA', {'Counter': 846, 'Diff': -0.2758653005273093})

# Algorithm implementation - Reduction paradox

In [94]:
class TableFields(Enum):
    """Keys of the per-step result record produced by ``create_value``."""
    # Textual rule "<n_gram> -> <replacement symbol>"
    Rule = "Rule"
    # Entropy difference recorded for the step
    EntropyMove = "EntropyMove"
    # Dataset type label passed in by the caller
    DataType = "DataType"
    # Free-form description passed in by the caller
    DescriptionData = "DescriptionData"
    # Length of the message after the step
    MessageSize = "MessageSize"
    # Number of distinct symbols in the message after the step
    AlphabetSize = "AlphabetSize"
    # Number of rules collected so far
    GrammaticSize = "GrammaticSize"
    # Seconds elapsed since the step's start timestamp
    CalcTime = "CalcTime"


In [95]:
def create_value(message_0, message_1, diff, replace_character, n_gram, grammatic, tic, type_data=None, description_data=None):
    """Register a new rule and build the result record for one step.

    Side effect: ``grammatic[n_gram]`` is set to ``replace_character``.

    Args:
        message_0: Message before the step (not used here).
        message_1: Message after the step.
        diff: Entropy difference to record for the step.
        replace_character: Symbol that replaced ``n_gram``.
        n_gram: The substring that was replaced.
        grammatic: Dict of rules collected so far; mutated in place.
        tic: ``time.time()`` timestamp taken when the step started.
        type_data: Dataset type label stored in the record.
        description_data: Free-form description stored in the record.

    Returns:
        A dict keyed by the ``TableFields`` values.
    """
    alphabet_size = np.unique(list(message_1)).size
    message_size = len(message_1)

    grammatic[n_gram] = replace_character

    # Measured after the rule is stored, so the elapsed time covers the
    # whole step.
    elapsed = time.time() - tic

    return {
        TableFields.Rule.value: f"{n_gram} -> {replace_character}",
        TableFields.EntropyMove.value: diff,
        TableFields.DataType.value: type_data,
        TableFields.DescriptionData.value: description_data,
        TableFields.MessageSize.value: message_size,
        TableFields.AlphabetSize.value: alphabet_size,
        TableFields.GrammaticSize.value: len(grammatic),
        TableFields.CalcTime.value: elapsed,
    }


In [96]:
def algorithm_step(init_message, message, grammatic, type_data=None, description_data=None, tic=None, heuristics_method=pick_largest):
    """Perform one reduction step: pick a bigram and replace it with a new symbol.

    Args:
        init_message: Reference message against which entropy is compared.
        message: Current message to reduce.
        grammatic: Dict of rules collected so far; the new rule is added to it.
        type_data: Dataset type label stored in the result record.
        description_data: Free-form description stored in the result record.
        tic: Start timestamp (``time.time()``) for the elapsed-time field.
            When omitted, timing starts at the beginning of this call;
            previously ``None`` caused a ``TypeError`` during subtraction.
        heuristics_method: Callable selecting one entry from the list of
            entropy-increasing candidates.

    Returns:
        ``(transformed_message, record)``, or ``None`` when no bigram
        replacement yields a negative ``Diff``.
    """
    if tic is None:
        tic = time.time()

    # Candidate bigrams of the current message
    n_grams = find_k_grams_freq(message)

    diff_table = calculate_for_ngrams_diff(
        n_grams,
        message,
        diff_entropy,
        init_message
    )

    picked = pick_only_decreasing(diff_table, heuristics_method)
    if picked is None:
        return None

    # picked looks like ('AT', {'Counter': 867, 'Diff': -0.28})
    n_gram, dic_values = picked

    # The scoring pass discarded its rewritten messages, so the chosen
    # replacement is applied again (with a newly drawn symbol).
    transformed_message, replace_character = transform_message(message, n_gram)

    record = create_value(
        message,
        transformed_message,
        dic_values['Diff'],
        replace_character,
        n_gram,
        grammatic,
        tic,
        type_data,
        description_data,
    )
    return transformed_message, record


In [97]:
def algorithm(message, type_data=None, description_data=None, limit_step=None, heuristics_method=None):
    """Repeatedly reduce ``message`` while each step increases its entropy.

    Each iteration delegates to ``algorithm_step`` and prints the step
    number together with the message length before that step.

    Args:
        message: Initial message; also the entropy reference for all steps.
        type_data: Dataset type label stored in every record.
        description_data: Free-form description stored in every record.
        limit_step: Maximum number of steps, or ``None`` for no limit.
        heuristics_method: Optional candidate-selection callable forwarded
            to ``algorithm_step`` (e.g. ``pick_random``). ``None`` keeps
            ``algorithm_step``'s own default.

    Returns:
        A dict mapping the 1-based step number to that step's record.
    """
    init_message = message

    res = {}
    grammatic = {}

    # Forward the heuristic only when given, so the default stays defined
    # in one place (algorithm_step).
    step_options = {}
    if heuristics_method is not None:
        step_options["heuristics_method"] = heuristics_method

    step = 0
    while limit_step is None or step != limit_step:
        tic = time.time()
        step_value = algorithm_step(
            init_message, message, grammatic, type_data, description_data, tic,
            **step_options
        )
        print(step, len(message))
        if step_value is None:
            # No remaining replacement increases entropy
            break

        message, value = step_value
        step += 1
        res[step] = value

    return res

## Testing data

In [98]:
# Trial run on the sampled test message, capped at 20 reduction steps.
test_result = algorithm(data_test, limit_step=20)

0 10000
1 9154
2 8508
3 7908
4 7477
5 7156
6 6858
7 6622
8 6388
9 6198
10 6060
11 5930
12 5784
13 5675
14 5580
15 5473
16 5386
17 5315
18 5219
19 5150


In [99]:
# One row per accepted step, indexed by step number.
df = pd.DataFrame.from_dict(test_result, orient="index")
df

Unnamed: 0,Rule,EntropyMove,DataType,DescriptionData,MessageSize,AlphabetSize,GrammaticSize,CalcTime
1,TA -> 𪷻,-0.275865,,,9154,6,1,0.654009
2,CA -> ◆,-0.492469,,,8508,7,2,1.005837
3,TG -> 𗤏,-0.73561,,,7908,8,3,1.267969
4,CG -> 𩹇,-0.916619,,,7477,9,4,1.797999
5,AA -> 𑒓,-1.069667,,,7156,10,5,2.141035
6,TC -> 󠅃,-1.22479,,,6858,11,6,2.508997
7,T𪷻 -> 𩼭,-1.347674,,,6622,12,7,3.081969
8,AG -> 縂,-1.478274,,,6388,13,8,4.064031
9,TT -> 𨿟,-1.580932,,,6198,14,9,4.347
10,GG -> 𥚏,-1.67756,,,6060,15,10,4.708434


In [100]:
# Entropy difference per step, each point labelled with its value to 3 decimals.
fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.3f" % x) for x in df.EntropyMove.values], title='Entropy paradox')
fig.show()

In [101]:
# Message length after each step; the figure is also exported to test.png.
fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
fig.show()
fig.write_image("test.png")

# Real experiment data

In [102]:
def get_datasets(normalization=None):
    """Return the corpora used for the real experiment as ``(data, path, type)`` tuples.

    Args:
        normalization: Sample size forwarded to ``load_dataset``; ``None``
            keeps each corpus as loaded.
    """
    # DNA and proteins are enabled; english and sources are switched off.
    enabled_types = (DataSets.dna, DataSets.proteins)
    return [
        (*load_dataset(dataset_type, normalization), dataset_type)
        for dataset_type in enabled_types
    ]

In [103]:
# Sample sizes (number of symbols) to run the experiment with.
NORM_VALUES = [10000, 20000]
# File names written into each experiment output directory.
CSV_NAME = "steps.csv"
ENTROPY_GRAPH = "entropy_paradox.png"
MESSAGE_GRAPH = "message_size.png"

In [67]:
def write_images(df, path):
    """Render the entropy and message-size line charts and save them under ``path``.

    Args:
        df: Step table with ``EntropyMove`` and ``MessageSize`` columns.
        path: Existing directory that receives ``ENTROPY_GRAPH`` and
            ``MESSAGE_GRAPH``.
    """
    entropy_labels = [("%.3f" % x) for x in df.EntropyMove.values]
    entropy_fig = px.line(df, x=df.index, y=df.EntropyMove, text=entropy_labels, title='Entropy paradox')
    entropy_fig.write_image(os.path.sep.join([path, ENTROPY_GRAPH]))

    size_fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
    size_fig.write_image(os.path.sep.join([path, MESSAGE_GRAPH]))

In [68]:
def save_dataframe(df, path):
    """Write ``df`` as ``CSV_NAME`` inside the directory ``path``.

    The index (the step number) is not written to the file.
    """
    csv_path = os.path.sep.join([path, CSV_NAME])
    df.to_csv(csv_path, index=False)

In [104]:
def run_algorithm_for_datasets(normalization_values=NORM_VALUES, limit_steps=None):
    """Run the reduction for every sample size and corpus and store the results.

    Output goes to ``<data type>/<sample size>/<limit_steps>/`` relative to
    the working directory: a CSV of the steps plus two PNG charts.

    Args:
        normalization_values: Iterable of sample sizes passed to ``get_datasets``.
        limit_steps: Maximum number of steps per run, or ``None`` for no limit
            (the directory is then literally named ``None``).
    """
    for n_v in normalization_values:
        for data, _data_path, data_type in get_datasets(n_v):
            data_type_string = data_type.value
            steps_path = os.path.sep.join([data_type_string, str(n_v), str(limit_steps)])

            # exist_ok avoids the check-then-create race of isdir() + makedirs().
            os.makedirs(steps_path, exist_ok=True)

            res = algorithm(data, data_type_string, "", limit_step=limit_steps)
            df = pd.DataFrame.from_dict(res, orient="index")

            save_dataframe(df, steps_path)

            if df.empty:
                # No accepted step (e.g. limit_steps=0): the frame has no
                # EntropyMove/MessageSize columns, so plotting would fail.
                continue
            write_images(df, steps_path)

In [105]:
# Full experiment with the defaults: every size in NORM_VALUES, no step limit.
run_algorithm_for_datasets()

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB
Loading C://Users//Vojta//Desktop//iv//AKS//datasets\proteins\proteins.50MB
0 10000
1 9150
2 8514
3 7945
4 7495
5 7071
6 6840
7 6588
8 6367
9 6180
10 6029
11 5888
12 5727
13 5590
14 5456
15 5373
16 5276
17 5199
18 5104
19 5044
20 4983
21 4925
22 4867
23 4813
24 4745
25 4695
26 4642
27 4603
28 4541
29 4499
30 4473
31 4445
32 4401
33 4370
34 4347
35 4317
36 4292
37 4257
38 4236
39 4218
40 4198
41 4157
42 4132
43 4107
44 4087
45 4069
46 4044
47 4027
48 4000
49 3984
50 3967
51 3951
52 3934
53 3914
54 3901
55 3887
56 3875
57 3855
58 3836
59 3823
60 3810
61 3798
62 3784
63 3772
64 3757
65 3746
66 3731
67 3720
68 3710
69 3700
70 3685
71 3671
72 3650
73 3640
74 3631
75 3617
76 3598
77 3588
78 3578
79 3569
80 3561
81 3552
82 3542
83 3532
84 3523
85 3515
86 3497
87 3487
88 3479
89 3471
90 3462
91 3451
92 3444
93 3437
94 3430
95 3422
96 3411
97 3405
98 3398
99 3391
100 3385
101 3374
102 3360
103 3354
104 3341
105 3335
106 3327
107 