# Poznamky

- Singulární - jednoznačné
- Nesingulární - více výsledků
- Prefixové, okamžité kódy - kodová slova se sama oddělují

Ne všechny kódy jsme schopní dekodovat v jasném místě.
Není potřeba použít binárku, stačí text.

# Zadání

## 2.3. Variable-Length Codes (10 points)

- Implement Golomb coding/decoding algorithm.(7 pts)
- You should encode symbols with high probability by shorter codes. First compute frequency of symbols, sort them by probability and assign them a Golomb-code.
- You don't have to store bits in binary, it is sufficient to store them as textual values.
- Find the optimal M for the selected file.(2 pts)
- Compare your results with the entropy and compute redundancy of the code R = C - H, where C is an average number of bits per byte used in the compressed file for the original data: 8*|c(m)|/|m|(1 pt)

In [1]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*2))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
import numpy as np
import time
import pandas as pd

In [3]:
data_dna, path_dna = get_dataset(DataSets.dna)

Loading C://Users//Vojta//Desktop//iv//AKS//datasets\dna\dna.50MB


In [4]:
def calc_freq(data):
    tic = time.time()
    print('Calculating freqs')
    res = {}
    for c in data:
        res[c] = res.get(c, 0) + 1
    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [5]:
def calc_probs(freq):
    res = {}    
    tic = time.time()
    print('Calculating probs')
    n = np.sum(list(freq.values()))
    for k, v in freq.items():
        res[k] = v / n

    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [6]:
def get_probs(data):
    freq = calc_freq(data)
    return calc_probs(freq)

In [7]:
p = get_probs(data_dna)

Calculating freqs
End of calculating 6.977980852127075
Calculating probs
End of calculating 0.0


In [8]:
def get_sorted_probs(data):
    p = get_probs(data)
    return dict(sorted(p.items(), key=lambda item: item[1], reverse=True))

In [9]:
pd.DataFrame.from_dict(get_sorted_probs(data_dna), orient='index')

Calculating freqs
End of calculating 7.106724500656128
Calculating probs
End of calculating 0.0


Unnamed: 0,0
A,0.2899787
T,0.289728
C,0.2102979
G,0.2099745
N,8.907318e-06
\n,6.408691e-06
R,1.296997e-06
Y,1.087189e-06
K,8.773804e-07
W,8.583069e-07


In [50]:
import math
import time

In [58]:
def unary_func(n):
    return n * '0' + '1'

In [59]:
def encode_number_func(number, M):
    q = number // M
    r = number % M
    
    k = math.ceil(math.log2(M))
    quo_part = unary_func(q)

    c = 2**(k) - M

    truncated_part = None
    if r < c:
        b = c - 1
        a = "{0:0" + str(b) + "b}"
        truncated_part = a.format(r)
    else:
        b = c
        a = "{0:0" + str(b) + "b}"
        truncated_part = a.format(r+c)

    return "".join([quo_part, truncated_part])


In [60]:
def build_encoding_table(numbers, M):
    res = {}

    for current_number in numbers:

        value = encode_number_func(current_number, M)
        res[current_number] = value

    return res

In [79]:
class Golomb:
    def __init__(self, M, prob_table, verbose=False):
        self.M = M
        self.prob_table = prob_table
        self.verbose = verbose

    def from_numbers_data(self, numbers, prob_table):
        keys = list(prob_table.keys())
        res = ""
        try:
            for number in numbers:
                res += keys[number]
            return res
        except Exception as e:
            print(f"Error occurred {str(e)}")

    def from_data_numbers(self, data, prob_table):
        keys = list(prob_table.keys())
        return [keys.index(c) for c in data]

    def encode_number(self, number):
        return encode_number_func(number, self.M)

    def unary(self, n):
        return unary_func(n)

    def encode_with_table(self, c, prop_table):
        keys = list(prop_table.keys())
        char_index = keys.index(c)
        return self.encode_number(char_index, len(keys))

    def encode_indicies(self, indicies, precalculated_table=None):
        encoded_string = ""
        size = len(np.unique(indicies))
        for index in indicies:
            encoded_number = ""
            if precalculated_table is not None:
                encoded_number = precalculated_table[index]
            else:
                encoded_number = self.encode_number(index)
            if self.verbose:
                print(encoded_number)
            encoded_string += encoded_number
        return encoded_string

    def encode_stream(self, stream):
        tic = time.time()
        encoded_string = ""
        for c in stream:
            enc = self.encode_with_table(c, self.prob_table)
            encoded_string += enc
        toc = time.time()
        return encoded_string, toc - tic

    def decode_number(self, q, M, r):
        print(q, M, r)
        return q * M + r

    def decode_stream(self, stream):
        res = []

        i = 0

        current_quotient = ""
        k = math.ceil(math.log2(self.M))
        c = math.ceil(math.log2(self.M)) - 1

        print('k', k)
        print('c', c)
    

        try:
            while i < len(stream) - c:
                print('\n')
                current_char = stream[i]
                if current_char == '0':
                    current_quotient += current_char
                    i += 1

                else:
                    current_quotient += '1'
                    next_bits = stream[(i+1):(i+1+c)]
                    dec = int(next_bits, 2)

                    value = dec
                    right_side = math.pow(2, k) - self.M
                    is_smaller = value < right_side

                    q = current_quotient.count('0')
                    remainder = 0

                    if is_smaller:
                        if self.verbose:
                            print('1.', current_quotient + next_bits)
                        remainder = int(next_bits, 2)
                        i += k
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
                    else:
                        bits = stream[(i+1):(i+1+c+1)]
                        if self.verbose:
                            print('2.', current_quotient + bits, 'only_bits', bits)
                        remainder = int(bits, 2)
                        remainder = remainder - k
                        i += k + 1
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
        except Exception as e:
            print("Error occurred")
            print(f"{str(e)}")
            return None

        return res

    def decode_from_indicies(self, ints, keys):
        res = ""
        for index in ints:
            res += keys[index]
        return res

In [80]:
test_data = data_dna[0:10]
test_data

'GATCAATGAG'

# Tests

### Test conversion

In [81]:
sorted_table = get_sorted_probs(test_data)
gol = Golomb(10, sorted_table)

indicies = gol.from_data_numbers(test_data, sorted_table)
current_data = gol.from_numbers_data(indicies, sorted_table)

print(current_data)
print(current_data == test_data)

Calculating freqs
End of calculating 0.0009987354278564453
Calculating probs
End of calculating 0.0
GATCAATGAG
True


### Test of encoding

In [82]:
sorted_table = get_sorted_probs(test_data)

print(sorted_table)
gol = Golomb(4, sorted_table, True)

indicies = gol.from_data_numbers(test_data, sorted_table)
encoding_table = build_encoding_table(indicies, gol.M)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.001001596450805664
{'A': 0.4, 'G': 0.3, 'T': 0.2, 'C': 0.1}


In [83]:
pd.DataFrame.from_dict(encoding_table, orient="index")

Unnamed: 0,0
1,11
0,10
2,110
3,111


In [85]:
encoded_string = gol.encode_indicies(indicies, encoding_table)

print(encoded_string)

print()

decoded_numbers = gol.decode_stream(encoded_string)

decoded_data = gol.from_numbers_data(decoded_numbers, sorted_table)

print(decoded_numbers, decoded_data)
print(indicies, test_data)

print(decoded_data == test_data)

11
10
110
111
10
10
110
11
10
11
11101101111010110111011

k 2
c 1


2. 111 only_bits 11
0 4 1




2. 0110 only_bits 10
1 4 0


2. 111 only_bits 11
0 4 1


2. 101 only_bits 01
0 4 -1




2. 0110 only_bits 10
1 4 0


2. 111 only_bits 11
0 4 1




2. 011 only_bits 1
1 4 -1
Error occurred list index out of range
[1, 4, 1, -1, 4, 1, 3] None
[1, 0, 2, 3, 0, 0, 2, 1, 0, 1] GATCAATGAG
False


# Test of specified number

In [86]:
gol = Golomb(5, sorted_table, True)

In [87]:
gol.decode_stream("001110")

k 3
c 2






2. 001110 only_bits 110
2 5 3


[13]

https://homel.vsb.cz/~vas218/acs.html