# Poznamky

- Singulární - jednoznačné
- Nesingulární - více výsledků
- Prefixové, okamžité kódy - kodová slova se sama oddělují

Ne všechny kódy jsme schopní dekodovat v jasném místě.
Není potřeba použít binárku, stačí text.

# Zadání

## 2.3. Variable-Length Codes (10 points)

- Implement Golomb coding/decoding algorithm.(7 pts)
- You should encode symbols with high probability by shorter codes. First compute frequency of symbols, sort them by probability and assign them a Golomb-code.
- You don't have to store bits in binary, it is sufficient to store them as textual values.
- Find the optimal M for the selected file.(2 pts)
- Compare your results with the entropy and compute redundancy of the code R = C - H, where C is an average number of bits per byte used in the compressed file for the original data: 8*|c(m)|/|m|(1 pt)

In [1]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*2))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
import numpy as np
import time
import pandas as pd

In [3]:
data_dna, path_dna = get_dataset(DataSets.dna)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB


In [139]:
def calc_freq(data):
    tic = time.time()
    print('Calculating freqs')
    res = {}
    for c in data:
        res[c] = res.get(c, 0) + 1
    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [149]:
def calc_probs(freq):
    res = {}    
    tic = time.time()
    print('Calculating probs')
    n = np.sum(list(freq.values()))
    for k, v in freq.items():
        res[k] = (v/n, v)
    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [150]:
def get_probs(data):
    freq = calc_freq(data)
    return calc_probs(freq)

In [151]:
p = get_probs(data_dna)

Calculating freqs
End of calculating 7.002310752868652
Calculating probs
End of calculating 0.0


In [152]:
p

{'G': (0.20997453689575196, 11008713),
 'A': (0.28997873306274413, 15203237),
 'T': (0.28972803115844725, 15190093),
 'C': (0.21029790878295898, 11025667),
 '\n': (6.40869140625e-06, 336),
 'N': (8.907318115234375e-06, 467),
 'W': (8.58306884765625e-07, 45),
 'S': (6.29425048828125e-07, 33),
 'R': (1.2969970703125e-06, 68),
 'K': (8.7738037109375e-07, 46),
 'Y': (1.087188720703125e-06, 57),
 'V': (5.7220458984375e-08, 3),
 'D': (1.9073486328125e-08, 1),
 'M': (5.91278076171875e-07, 31),
 'H': (3.814697265625e-08, 2),
 'B': (1.9073486328125e-08, 1)}

In [148]:
sorted(p, key=lambda x: p[x]['p'], reverse=True)

['A',
 'T',
 'C',
 'G',
 'N',
 '\n',
 'R',
 'Y',
 'K',
 'W',
 'S',
 'M',
 'V',
 'H',
 'D',
 'B']

In [153]:
def get_sorted_probs(data):
    p = get_probs(data)
    return dict(sorted(p.items(), key=lambda item: item[1][0], reverse=True))

In [154]:
pd.DataFrame.from_dict(get_sorted_probs(data_dna), orient='index')

Calculating freqs
End of calculating 6.821624755859375
Calculating probs
End of calculating 0.0


Unnamed: 0,0,1
A,0.2899787,15203237
T,0.289728,15190093
C,0.2102979,11025667
G,0.2099745,11008713
N,8.907318e-06,467
\n,6.408691e-06,336
R,1.296997e-06,68
Y,1.087189e-06,57
K,8.773804e-07,46
W,8.583069e-07,45


In [155]:
import math
import time

In [156]:
def unary_func(n):
    return n * '0' + '1'

In [157]:
def truncated(r, M):
    k = math.ceil(math.log2(M))

    if(r < pow(2, k) - M):
        code = bin(r)[2:]
        ln = len(code)
        if(ln < k):
            code = '0' * (k - ln - 1) + code

    else:
        code = bin(r + k)[2:]
        ln = len(code)
        if(ln < k + 1):
            code = '0' * (k - 1 - ln)  + code
    return code

In [158]:
def encode_number_func(number, M):
    q = number // M
    r = number % M
    
    quo_part = unary_func(q)
    truncated_part = truncated(r, M)

    return "".join([quo_part, truncated_part])


In [223]:
def build_encoding_table(sorted_table, M):
    res = {}
    print(f"Table for m = {M}")

    for i, value in enumerate(sorted_table.items()):
        k, v = value
        p, counter = v 

        res[i] = {
            "number": i,
            'char': k,
            "p": p,
            "golomb": encode_number_func(i, M),
            "counter": counter
        }

    return res

In [224]:
class Golomb:
    def __init__(self, M, prob_table, verbose=False):
        self.M = M
        self.prob_table = prob_table
        self.verbose = verbose

    def from_numbers_data(self, numbers, prob_table):
        keys = list(prob_table.keys())
        res = ""
        try:
            for number in numbers:
                res += keys[number]
            return res
        except Exception as e:
            print(f"Error occurred {str(e)}")

    def from_data_numbers(self, data, prob_table):
        keys = list(prob_table.keys())
        return [keys.index(c) for c in data]

    def encode_number(self, number):
        return encode_number_func(number, self.M)

    def unary(self, n):
        return unary_func(n)

    def encode_with_table(self, c, prop_table):
        keys = list(prop_table.keys())
        char_index = keys.index(c)
        return self.encode_number(char_index, len(keys))

    def encode_indicies(self, indicies, precalculated_table=None):
        encoded_string = ""
        for index in indicies:
            encoded_number = ""
            if precalculated_table is not None:
                encoded_number = precalculated_table[index]['golomb']
            else:
                encoded_number = self.encode_number(index)
            if self.verbose:
                print(encoded_number)
            encoded_string += encoded_number
        return encoded_string

    def encode_stream(self, stream):
        tic = time.time()
        encoded_string = ""
        for c in stream:
            enc = self.encode_with_table(c, self.prob_table)
            encoded_string += enc
        toc = time.time()
        return encoded_string, toc - tic

    def decode_number(self, q, M, r):
        return q * M + r

    def decode_stream(self, stream):
        res = []

        i = 0

        current_quotient = ""
        k = math.ceil(math.log2(self.M))
        c = math.ceil(math.log2(self.M)) - 1


        try:
            while i < len(stream) - c:
                current_char = stream[i]
                if current_char == '0':
                    current_quotient += current_char
                    i += 1

                else:
                    current_quotient += '1'
                    next_bits = stream[(i+1):(i+1+c)]
                    dec = int(next_bits, 2)

                    value = dec
                    right_side = math.pow(2, k) - self.M
                    is_smaller = value < right_side

                    q = current_quotient.count('0')
                    remainder = 0

                    if is_smaller:
                        remainder = int(next_bits, 2)
                        i += k
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
                    else:
                        bits = stream[(i+1):(i+1+c+1)]
                        remainder = int(bits, 2)
                        remainder = remainder - k
                        i += k + 1
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
        except Exception as e:
            print("Error occurred")
            print(f"{str(e)}")
            return None

        return res

    def decode_from_indicies(self, ints, keys):
        res = ""
        for index in ints:
            res += keys[index]
        return res

In [170]:
test_data = data_dna[0:10]
test_data

'GATCAATGAG'

# Tests

### Test conversion

In [171]:
sorted_table = get_sorted_probs(test_data)
gol = Golomb(3, sorted_table)

indicies = gol.from_data_numbers(test_data, sorted_table)
current_data = gol.from_numbers_data(indicies, sorted_table)

print(current_data)
print(current_data == test_data)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.0
GATCAATGAG
True


In [172]:
sorted_table

{'A': (0.4, 4), 'G': (0.3, 3), 'T': (0.2, 2), 'C': (0.1, 1)}

### Test of encoding

In [173]:
sorted_table = get_sorted_probs(test_data)

print(sorted_table)
gol = Golomb(17, sorted_table, True)

indicies = gol.from_data_numbers(test_data, sorted_table)
encoding_table = build_encoding_table(sorted_table, gol.M)


Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.001001119613647461
{'A': (0.4, 4), 'G': (0.3, 3), 'T': (0.2, 2), 'C': (0.1, 1)}


In [174]:
pd.DataFrame.from_dict(encoding_table, orient="index")

Unnamed: 0,number,char,p,golomb,counter
0,0,A,0.4,10000,4
1,1,G,0.3,10001,3
2,2,T,0.2,10010,2
3,3,C,0.1,10011,1


In [175]:
encoded_string = gol.encode_indicies(indicies, encoding_table)
print(encoded_string)


decoded_numbers = gol.decode_stream(encoded_string)

decoded_data = gol.from_numbers_data(decoded_numbers, sorted_table)

print(decoded_numbers, decoded_data)
print(indicies, test_data)
print(decoded_data == test_data)

10001
10000
10010
10011
10000
10000
10010
10001
10000
10001
10001100001001010011100001000010010100011000010001
[1, 0, 2, 3, 0, 0, 2, 1, 0, 1] GATCAATGAG
[1, 0, 2, 3, 0, 0, 2, 1, 0, 1] GATCAATGAG
True


### Test of specified number

In [176]:
gol = Golomb(5, sorted_table, True)

In [177]:
gol.decode_stream("001110")

[13]

In [178]:
data_english, path_english = get_dataset(DataSets.english)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\english\english.50MB


### Encoding table for English

In [239]:
sorted_table = get_sorted_probs(data_english)
gol = Golomb(50, sorted_table, True)

Calculating freqs
End of calculating 6.918617248535156
Calculating probs
End of calculating 0.0


In [240]:
encoding_table = build_encoding_table(sorted_table, gol.M)

Table for m = 50


In [241]:
eng = pd.DataFrame.from_dict(encoding_table, orient="index")

In [242]:
eng.sort_index()

Unnamed: 0,number,char,p,golomb,counter
0,0,,1.704416e-01,100000,8936048
1,1,e,9.593225e-02,100001,5029613
2,2,t,6.757509e-02,100010,3542881
3,3,a,6.149715e-02,100011,3224222
4,4,o,5.518679e-02,100100,2893377
...,...,...,...,...,...
170,170,,1.907349e-08,000111010,1
171,171,Ö,1.907349e-08,000111011,1
172,172,Î,1.907349e-08,000111100,1
173,173,¼,1.907349e-08,000111101,1


In [248]:
eng['len'] =  list(map(lambda x: len(x), eng['golomb'].values))

In [249]:
eng.head()

Unnamed: 0,number,char,p,golomb,counter,len
0,0,,0.170442,100000,8936048,6
1,1,e,0.095932,100001,5029613,6
2,2,t,0.067575,100010,3542881,6
3,3,a,0.061497,100011,3224222,6
4,4,o,0.055187,100100,2893377,6


In [250]:
size = eng['counter'] * eng['len']

In [251]:
np.sum(size.values)

317209747

### Optimal M

In [252]:
from src.types.table_types import Fields

In [253]:
def from_table_size(encoding_table):
    encoding_table['len'] =  list(map(lambda x: len(x), encoding_table['golomb'].values))
    size = encoding_table['counter'] * encoding_table['len']
    aprox = np.sum(size.values)
    return aprox

In [254]:
def run(ds, path, name, M, sorted_table):
    gol = Golomb(M, sorted_table, False)

    encoding_table = build_encoding_table(sorted_table, gol.M)
    aprox = from_table_size(pd.DataFrame.from_dict(encoding_table, orient="index"))



    # tic = time.time()
    # decoded_numbers = gol.decode_stream(encoded_string)
    # toc = time.time()
    # decoding_data = toc - tic

    # decoded_data = gol.from_numbers_data(decoded_numbers, sorted_table)

    # if not decoded_data == test_data:
    #     return None

    n = len(ds)

    return {
        #"encoding_data": encoding_data,
        #"decoding_data": decoding_data,
        Fields.Path.value: path,
        Fields.Parameter.value: M,
        Fields.Type.value: "Golomb",
        Fields.Original.value: n * 8,
        Fields.Aprox.value: aprox,
        Fields.Calculation.value: "",
        Fields.Name.value: name
    }

In [255]:
data_dna, path_dna = get_dataset(DataSets.dna)
data_english, path_english = get_dataset(DataSets.english)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB
Loading C://Users//Vojta//Desktop//iv//AKS//datasets\english\english.50MB


In [256]:
datasets = [(data_dna, path_dna, DataSets.dna.value), (data_english, path_english, DataSets.english.value)]

In [257]:
res = {}
counter = 0

for data, path, name in datasets:
    sorted_table = get_sorted_probs(data)
    alphabet_size = len(list(sorted_table.keys()))

    for M in range(3, int(alphabet_size / 2)):
        print(f'Current M = {M}')
        value = run(data, path, name, M, sorted_table)
        if value is not None:
            res[counter] = value
            counter += 1

Calculating freqs
End of calculating 6.28487753868103
Calculating probs
End of calculating 0.0
Current M = 3
Table for m = 3
Current M = 4
Table for m = 4
Current M = 5
Table for m = 5
Current M = 6
Table for m = 6
Current M = 7
Table for m = 7
Calculating freqs
End of calculating 7.065019130706787
Calculating probs
End of calculating 0.0
Current M = 3
Table for m = 3
Current M = 4
Table for m = 4
Current M = 5
Table for m = 5
Current M = 6
Table for m = 6
Current M = 7
Table for m = 7
Current M = 8
Table for m = 8
Current M = 9
Table for m = 9
Current M = 10
Table for m = 10
Current M = 11
Table for m = 11
Current M = 12
Table for m = 12
Current M = 13
Table for m = 13
Current M = 14
Table for m = 14
Current M = 15
Table for m = 15
Current M = 16
Table for m = 16
Current M = 17
Table for m = 17
Current M = 18
Table for m = 18
Current M = 19
Table for m = 19
Current M = 20
Table for m = 20
Current M = 21
Table for m = 21
Current M = 22
Table for m = 22
Current M = 23
Table for m = 23
C

In [258]:
result_dataframe = pd.DataFrame.from_dict(res, orient="index")

In [259]:
result_dataframe

Unnamed: 0,Path,Parameter,Type,Original,Aprox,Calculation,Name
0,C://Users//Vojta//Desktop//iv//AKS//datasets\d...,3,Golomb,419430400,153110628,,dna
1,C://Users//Vojta//Desktop//iv//AKS//datasets\d...,4,Golomb,419430400,179322230,,dna
2,C://Users//Vojta//Desktop//iv//AKS//datasets\d...,5,Golomb,419430400,168296369,,dna
3,C://Users//Vojta//Desktop//iv//AKS//datasets\d...,6,Golomb,419430400,179322401,,dna
4,C://Users//Vojta//Desktop//iv//AKS//datasets\d...,7,Golomb,419430400,194512535,,dna
...,...,...,...,...,...,...,...
84,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,82,Golomb,419430400,367390716,,english
85,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,83,Golomb,419430400,367390706,,english
86,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,84,Golomb,419430400,367390696,,english
87,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,85,Golomb,419430400,367390688,,english


In [261]:
english = result_dataframe[result_dataframe.Name == 'english']

In [263]:
english.sort_values(by='Aprox')

Unnamed: 0,Path,Parameter,Type,Original,Aprox,Calculation,Name
7,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,5,Golomb,419430400,243175932,,english
8,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,6,Golomb,419430400,246513444,,english
10,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,8,Golomb,419430400,247197762,,english
11,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,9,Golomb,419430400,247998064,,english
12,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,10,Golomb,419430400,249845412,,english
...,...,...,...,...,...,...,...
77,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,75,Golomb,419430400,367390822,,english
76,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,74,Golomb,419430400,367390841,,english
75,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,73,Golomb,419430400,367390861,,english
74,C://Users//Vojta//Desktop//iv//AKS//datasets\e...,72,Golomb,419430400,367390885,,english


In [264]:
from src.save import save_both

In [265]:
save_both(result_dataframe)