# Project to AKS

## Reference 

- https://homel.vsb.cz/~vas218/pdf/acs/grammar.pdf
- https://homel.vsb.cz/~vas218/pdf/acs/vasinek-thesis.pdf
- https://homel.vsb.cz/~vas218/acs.html

## Choosen task

- Reduction paradox

## Description

- Try to reduce message and with every step maximaze entropy

## Official description 

- **RePair - maximal anticompression - reduction paradox**
    - Find the smallest possible representation(measured in the number of symbols) of file using the reduction paradox which leads to the largest increase of zero order entropy representation.
    - Heuristics - largest first, random
    - Describe the algorithm and summarize results to a .doc(x) or .pdf report.
    - Prepare a presentation for 10 minutes about your method.
    - Literature: Vasinek, Dissertation thesis (Chapter 5)

## Steps

- Load dataset
- Can pick a subset for faster running (sample it)
- Find every bigram and try to create rule
- Calculate Entropy for new message
- Need to find rule which reduces size of message but increases entropy (harder to compress)
- Find extremes for every file
- Show graphs

In [4]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*1))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [265]:
from src.load_data import get_dataset
from src.load_data import DataSets
from src.get_probs import get_sorted_probs_as_df
import numpy as np
import pandas as pd
import time
from src.save import save_both
from enum import Enum
import re
from ast import literal_eval
import random
import plotly.express as px


In [25]:
TEST_NORMALIZATION_SIZE = 10000

In [26]:
def load_dataset(type, normalization=None):
    data_dna, path_dna = get_dataset(type)
    if normalization is None:
        return data_dna, path_dna
    return "".join(np.random.choice(list(data_dna), normalization)), path_dna

In [27]:
def get_datasets(normalization=None):
    return [
        #(*get_dataset(DataSets.english, normalization), DataSets.english), 
        (*load_dataset(DataSets.dna, normalization), DataSets.dna),
        #(*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        #(*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]


In [28]:
test_datasets = get_datasets(TEST_NORMALIZATION_SIZE)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB


In [29]:
test_datasets

[('CGGTATTAAGATTATTATCATTGCAGAAAGACTTTACTAAGATAAGTATACATTCAGCTTTTGTTATAGATCGTGCATATTCCCGGGAAAGGATCGGTTTGCTACGATTGAATGGAAGGAACAAACCCTACAAGATTTCTATTCCAGTAATCTCGTGTGGAAGGTATTTGTTAATATAGACTGAGGGTCGTACGTGAGACTCCATAGGGTCCGGAAACTCCCAAACGTACTGGCCGGCGAACCCTTGGATTCATTGGTGGAGACCAGTATTTATTGACTCACGTCCGACAGATGACCGCAAGCCCATGCTTCCCGCATTGCGTTAGTCGACGGTCAGTTCTACAGTAATAAAGAGGCACTATTGGCACTCCAGGCTTCAGAAAAGTTCCATTTTAGTTTTATATCTATTAAGACTAGTAGAGCGTTGAAATATTTTACTCCATAAGTTCTCAAGCTCTTTCTTTCAATCAAGAGGGCCAAATATTGGGTGCCTAAAGTTAAACCTGCGTATACAATAAATCGGCTCGGCTCCTAAGGGTATACCCCTTGTTGTTAAGAAACTGACTTGTAGCCTCAAGATACGCATAAGTCCCGAGCGATCAATACCAGCTCGTCGTAGCCTAAGATGTTTGAACGTGCGAACATGGGAGGCATTATTTGTACTGAGCGCCACAAAAGGACGTGGAGCCGGTCAAAGTGAGAGCTTCTAATGGGAGATACTAAGGTTTGTGTCTCAATTCACAAAAGAATATAAATTAAATTAGACCATCAACGGTGTTGATATAATGTAACTTTTCCCCCCCTCGTAATAACTAATTCGTTGCACGGTTAGCTGCTCACTTTAATGCTTTGTAGCCCAAAAACACATTGGTGATGTCCTTAAAGCAATGACGCAATTCGGTGATTGAAAATGCATGTGGAATTCGTAAGAGTCCTAAGCTCATAGAGAACATATAAACACAGGATATGGAACCCAAACCTACTCCTTAACATGATG

In [203]:
data_test, data_path, data_type = test_datasets[0]

In [204]:
def find_k_grams_freq(data, max_size_k=2):
    
    kgrams_dic = {}

    for k in range(2, max_size_k+1):

        for i in range(len(data) - k):

            n_gram = data[i:i+k]
            
            kgrams_dic[n_gram] = kgrams_dic.get(n_gram, 0) + 1

    return kgrams_dic

In [205]:
find_k_grams_freq(data_test)

{'CG': 440,
 'GG': 423,
 'GT': 590,
 'TA': 832,
 'AT': 867,
 'TT': 842,
 'AA': 804,
 'AG': 579,
 'GA': 625,
 'TC': 627,
 'CA': 623,
 'TG': 618,
 'GC': 422,
 'AC': 634,
 'CT': 621,
 'CC': 451}

In [206]:
import math
from collections import Counter

def calc_freq(content):
    c = Counter(list(content))
    return c

def calc_p(counter, n):
    counter = dict(counter)
    res = {}
    for k, v in counter.items():
        res[k] = v / n  
    return res

def get_n(counter):
    counter = dict(counter)
    return np.sum(list(counter.values()))

def calc_H(p):
    H = 0
    for k, v in p.items():
        #Shannon equation!
        H += p[k] * math.log2(p[k])
    return -H

def calc_entropy_for_message(message):
    counter = calc_freq(message)
    n = get_n(counter)
    p = calc_p(counter, n)
    H = calc_H(p)
    return H

In [207]:
calc_entropy_for_message(data_test)

1.9811065817840268

In [208]:
def diff_entropy(message1, message2):
    message1_entropy = calc_entropy_for_message(message1)
    message2_entropy = calc_entropy_for_message(message2)
    diff = message1_entropy - message2_entropy
    return diff

In [209]:
diff_entropy(data_test, data_test)

0.0

In [210]:
def find_not_existing_character(current_alpahbet):
    ascii_values = list(range(0, 256))
    ascii_codes = [ord(char) for char in current_alpahbet]
    can_use = set(ascii_values).difference(set(ascii_codes))
    ascii_picked_char = random.choice(list(can_use))
    char = chr(ascii_picked_char)
    return char

In [211]:
find_not_existing_character(['A', 'C', 'G', 'T'])

'@'

In [237]:
def transform_message(message, ngram_for_replace):
    current_alphabet_size = np.unique(list(message))

    replace_character = find_not_existing_character(current_alphabet_size)
    
    return message.replace(ngram_for_replace, replace_character), replace_character

In [238]:
def calculate_for_ngrams_diff(ngrams, message, method_entropy=diff_entropy):
    res = {}
    
    for k, v in ngrams.items():
        res[k] = {
            "Counter": v,
            "Diff": method_entropy(message, transform_message(message, k)[0])
        }
    return res

In [239]:
r = calculate_for_ngrams_diff(
        find_k_grams_freq(data_test),
        data_test
    )

pd.DataFrame.from_dict(
    r, 
    orient="index"
)

Unnamed: 0,Counter,Diff
CG,440,-0.144009
GG,423,-0.116308
GT,590,-0.194946
TA,832,-0.276203
AT,867,-0.281469
TT,842,-0.216464
AA,804,-0.209085
AG,579,-0.191932
GA,625,-0.198838
TC,627,-0.204607


In [240]:
r

{'CG': {'Counter': 440, 'Diff': -0.14400911057088184},
 'GG': {'Counter': 423, 'Diff': -0.11630809042022605},
 'GT': {'Counter': 590, 'Diff': -0.1949460187176224},
 'TA': {'Counter': 832, 'Diff': -0.2762032304859767},
 'AT': {'Counter': 867, 'Diff': -0.2814694848677286},
 'TT': {'Counter': 842, 'Diff': -0.21646359726551978},
 'AA': {'Counter': 804, 'Diff': -0.209085389731968},
 'AG': {'Counter': 579, 'Diff': -0.19193238152330538},
 'GA': {'Counter': 625, 'Diff': -0.1988380616008576},
 'TC': {'Counter': 627, 'Diff': -0.20460664457236177},
 'CA': {'Counter': 623, 'Diff': -0.20262176743536653},
 'TG': {'Counter': 618, 'Diff': -0.19920108924319635},
 'GC': {'Counter': 422, 'Diff': -0.1410886853334885},
 'AC': {'Counter': 634, 'Diff': -0.20425441540731226},
 'CT': {'Counter': 621, 'Diff': -0.20369031140422456},
 'CC': {'Counter': 451, 'Diff': -0.12298011907015827}}

In [241]:
r.items()

dict_items([('CG', {'Counter': 440, 'Diff': -0.14400911057088184}), ('GG', {'Counter': 423, 'Diff': -0.11630809042022605}), ('GT', {'Counter': 590, 'Diff': -0.1949460187176224}), ('TA', {'Counter': 832, 'Diff': -0.2762032304859767}), ('AT', {'Counter': 867, 'Diff': -0.2814694848677286}), ('TT', {'Counter': 842, 'Diff': -0.21646359726551978}), ('AA', {'Counter': 804, 'Diff': -0.209085389731968}), ('AG', {'Counter': 579, 'Diff': -0.19193238152330538}), ('GA', {'Counter': 625, 'Diff': -0.1988380616008576}), ('TC', {'Counter': 627, 'Diff': -0.20460664457236177}), ('CA', {'Counter': 623, 'Diff': -0.20262176743536653}), ('TG', {'Counter': 618, 'Diff': -0.19920108924319635}), ('GC', {'Counter': 422, 'Diff': -0.1410886853334885}), ('AC', {'Counter': 634, 'Diff': -0.20425441540731226}), ('CT', {'Counter': 621, 'Diff': -0.20369031140422456}), ('CC', {'Counter': 451, 'Diff': -0.12298011907015827})])

In [242]:
def pick_largest(items):
    return list(sorted(items, key=lambda x: x[1]['Diff']))[0]

def pick_random(items):
    return random.choice(items)

def pick_only_decreasing(dic, pick_method=pick_largest):
    items = dic.items()

    decreasing_items = list(filter(lambda x: x[1]['Diff'] < 0, items))

    if len(decreasing_items) == 0:
        return None

    return pick_method(decreasing_items)    

In [243]:
pick_only_decreasing(r, pick_largest)

('AT', {'Counter': 867, 'Diff': -0.2814694848677286})

# Algorithm implementation - Reduction paradox

In [266]:
class TableFields(Enum):
    Rule = "Rule"
    EntropyMove = "EntropyMove"
    DataType = "DataType"
    DescriptionData = "DescriptionData"
    MessageSize = "MessageSize"
    AlphabetSize = "AlphabetSize"
    GrammaticSize = "GrammaticSize"
    CalcTime = "CalcTime"


In [267]:
def create_value(message_0, message_1, diff, replace_character, n_gram, grammatic, tic, type_data=None, description_data=None):
    new_message_alphabet_size = len(np.unique(list(message_1)))
    new_message_size = len(message_1)

    new_rule = f"{n_gram} -> {replace_character}"

    grammatic[n_gram] = replace_character

    tac = time.time()
    
    return {
        TableFields.Rule.value: new_rule,
        TableFields.EntropyMove.value: diff,
        TableFields.DataType.value: type_data,
        TableFields.DescriptionData.value: description_data,
        TableFields.MessageSize.value: new_message_size,
        TableFields.AlphabetSize.value: new_message_alphabet_size,
        TableFields.GrammaticSize.value: len(list(grammatic.keys())),
        TableFields.CalcTime.value: tac - tic,
    }


In [268]:
def algorithm_step(message, grammatic, type_data=None, description_data=None, tic=None, heuristics_method=pick_largest):
    #Find ngrams
    n_grams = find_k_grams_freq(message)
    

    diff_table = calculate_for_ngrams_diff(
        n_grams,
        message
    )

    picked = pick_only_decreasing(diff_table, heuristics_method)

    if picked is None:
        return None

    #('AT', {'Counter': 867, 'Diff': -0.2814694848677286})
    n_gram, dic_values = picked
    transformed_message, replace_character = transform_message(message, n_gram)
    
    return transformed_message, create_value(message, transformed_message, dic_values['Diff'], replace_character, n_gram, grammatic, tic, type_data, description_data)


In [269]:
def algorithm(message, type_data=None, description_data=None, limit_step=None):
    res = {}
    grammatic = {}

    step = 0
    while True:
        tic = time.time()
        if limit_step is not None and limit_step == step:
            break
        step_value = algorithm_step(message, grammatic, type_data, description_data, tic)
        print(step, len(message))
        if step_value is None:
            break

        else:
            transformed_message, value = step_value
            message = transformed_message

            step += 1
            res[step] = value

    return res

## Testing data

In [272]:
test_result = algorithm(data_test, limit_step=20)

0 10000
1 9133
2 8512
3 7933
4 7493
5 7157
6 6847
7 6600
8 6355
9 6182
10 6028
11 5890
12 5756
13 5633
14 5529
15 5430
16 5347
17 5264
18 5185
19 5109


In [276]:
df = pd.DataFrame.from_dict(test_result, orient="index")
df

Unnamed: 0,Rule,EntropyMove,DataType,DescriptionData,MessageSize,AlphabetSize,GrammaticSize,CalcTime
1,AT -> Ô,-0.281469,,,9133,5,1,0.069252
2,CT -> -,-0.212045,,,8512,6,2,0.089968
3,AG -> X,-0.23837,,,7933,7,3,0.116999
4,CG -> Ö,-0.181157,,,7493,8,4,0.148036
5,AA -> ö,-0.153603,,,7157,9,5,0.161363
6,TT -> ,-0.166145,,,6847,10,6,0.197517
7,TG -> H,-0.121708,,,6600,11,7,0.210859
8,AC -> m,-0.130753,,,6355,12,8,0.250985
9,ÔG -> P,-0.109946,,,6182,13,9,0.288847
10,CC -> ¾,-0.100875,,,6028,14,10,0.329587


In [278]:
fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.3f" % x) for x in df.EntropyMove.values], title='Entropy paradox')
fig.show()

In [283]:
fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
fig.show()
#fig.write_image("test.png")

# Real experiment data

In [284]:
def get_datasets(normalization=None):
    return [
        (*load_dataset(DataSets.english, normalization), DataSets.english), 
        (*load_dataset(DataSets.dna, normalization), DataSets.dna),
        (*load_dataset(DataSets.proteins, normalization), DataSets.proteins),
        (*load_dataset(DataSets.sources, normalization), DataSets.sources),
    ]

In [285]:
NORM_VALUES = [10000, 20000]
CSV_NAME = "steps.csv"
ENTROPY_GRAPH = "entropy_paradox.png"
MESSAGE_GRAPH = "message_size.png"

In [286]:
def write_images(df, path):
    fig = px.line(df, x=df.index, y=df.EntropyMove, text=[("%.3f" % x) for x in df.EntropyMove.values], title='Entropy paradox')
    fig.write_image(os.path.sep.join([path, ENTROPY_GRAPH]))
    fig = px.line(df, x=df.index, y=df.MessageSize, text=df.MessageSize, title='Message size')
    fig.write_image(os.path.sep.join([path, MESSAGE_GRAPH]))    

In [287]:
def save_dataframe(df, path):
    path = os.path.sep.join([path, CSV_NAME])
    df.to_csv(path, index=False)

In [292]:
def run_algorithm_for_datasets(normalization_values=NORM_VALUES, limit_steps=20):
    for n_v in normalization_values:

        datasets = get_datasets(n_v)

        for data, data_path, data_type in datasets:
            data_type_string = data_type.value
            current_path = os.path.sep.join([data_type_string, str(n_v)])

            if not os.path.isdir(current_path):
                os.makedirs(current_path)

            res = algorithm(data, data_type_string, "", limit_step=limit_steps)
            df = pd.DataFrame.from_dict(test_result, orient="index")

            save_dataframe(df, current_path)
            write_images(df, current_path)

In [293]:
run_algorithm_for_datasets()

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\english\english.50MB
Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB
Loading C://Users//Vojta//Desktop//iv//AKS//datasets\proteins\proteins.50MB
Loading C://Users//Vojta//Desktop//iv//AKS//datasets\sources\sources.50MB
0 10000
1 9734
2 9588
3 9493
4 9407
5 9330
6 9256
7 9185
8 9122
9 9067
10 9009
11 8950
