# Poznamky

- Singulární - jednoznačné
- Nesingulární - více výsledků
- Prefixové, okamžité kódy - kodová slova se sama oddělují

Ne všechny kódy jsme schopní dekodovat v jasném místě.
Není potřeba použít binárku, stačí text.

# Zadání

## 2.3. Variable-Length Codes (10 points)

- Implement Golomb coding/decoding algorithm.(7 pts)
- You should encode symbols with high probability by shorter codes. First compute frequency of symbols, sort them by probability and assign them a Golomb-code.
- You don't have to store bits in binary, it is sufficient to store them as textual values.
- Find the optimal M for the selected file.(2 pts)
- Compare your results with the entropy and compute redundancy of the code R = C - H, where C is an average number of bits per byte used in the compressed file for the original data: 8*|c(m)|/|m|(1 pt)

In [38]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*2))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [39]:
from src.load_data import get_dataset
from src.load_data import DataSets
import numpy as np
import time
import pandas as pd

In [40]:
data = get_dataset(DataSets.dna)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [41]:
def calc_freq(data):

    tic = time.time()
    print('Calculating freqs')

    res = {}
    for c in data:
        res[c] = res.get(c, 0) + 1

    toc = time.time()
    print(f'End of calculating {toc - tic}')
    
    return res

In [42]:
def calc_probs(freq):
    res = {}
    
    tic = time.time()
    print('Calculating probs')

    n = np.sum(list(freq.values()))
    for k, v in freq.items():
        res[k] = v / n

    toc = time.time()
    print(f'End of calculating {toc - tic}')
    
    return res

In [43]:
def get_probs(data):
    freq = calc_freq(data)
    return calc_probs(freq)

In [44]:
p = get_probs(data)

Calculating freqs
End of calculating 9.874698877334595
Calculating probs
End of calculating 0.0


In [519]:
def get_sorted_probs(data):
    p = get_probs(data)
    return dict(sorted(p.items(), key=lambda item: item[1], reverse=True))

In [520]:
pd.DataFrame.from_dict(get_sorted_probs(data), orient='index')

Calculating freqs
End of calculating 9.800301313400269
Calculating probs
End of calculating 0.0


Unnamed: 0,0
A,0.2899787
T,0.289728
C,0.2102979
G,0.2099745
N,8.907318e-06
\n,6.408691e-06
R,1.296997e-06
Y,1.087189e-06
K,8.773804e-07
W,8.583069e-07


In [None]:
import math
import time

In [531]:
class Golomb:
    def __init__(self, M, sorted_table):
        self.M = M
        self.alphabet = sorted_table

    def from_numbers_data(self, numbers, prob_table):
        keys = list(prob_table.keys())
        res = ""
        for number in numbers:
            res += keys[number]
        return res

    def from_data_numbers(self, data, prob_table):
        keys = list(prob_table.keys())
        return [keys.index(c) for c in data]

    def encode_number(self, number):
        q = number // self.M
        r = number % self.M
        k = math.ceil(math.log2(self.M))
        quo_part = '0' * q + '1'
        c = 2**(k) - self.M
        truncated_part = None
        if r < c:
            truncated_part ="{0:b}".format(r).zfill(k-1)[0:k]
            #print(len(truncated_part), k - 1)
        else:
            truncated_part = "{0:b}".format(r + c).zfill(k)[0:k+1]
            #print(len(truncated_part), k)

        return "".join([quo_part, truncated_part])

    def encode(self, c):
        char_index = self.keys.index(c)
        #print(f'index {char_index}')
        return self.encode_number(char_index)

    def decode_number(self, q, M, r):
        return q * M + r

    def encode_stream(self, stream):
        tic = time.time()
        res = ""
        for c in stream:
            enc = self.encode(c)
            #print(enc, c)
            res += enc
        toc = time.time()
        return res, toc - tic

    def decode_stream(self, stream):
        res = []
        i = 0
        current_quotient = ""
        while i < len(stream):
            current_char = stream[i]
            if current_char == '0':
                current_quotient += current_char
                i += 1
            else:
                current_quotient += '1'       
                k = math.ceil(math.log2(self.M))
                move = k - 1


                next_bits = stream[i+1:i+1+move]
                dec = int(current_quotient + next_bits, 2)

                #print(dec, 2**(k - 1))
                remainder = 0

                if dec < 2**(k - 1):
                    #print('here', current_quotient, next_bits)
                    remainder = int(next_bits, 2)
                    i += k
                else:
                    bits = stream[i+1:i+1+k]
                    remainder = int(bits, 2)
                    #print('here2', current_quotient, bits)
                    remainder = remainder - k
                    i += k + 1
                res.append(self.decode_number(current_quotient.count('0'), self.M, remainder))
                current_quotient = ""
            #print(i)

        return res

    def decode_from_indicies(self, ints, keys):
        res = ""
        for index in ints:
            res += keys[index]
        return res

In [532]:
test_data = data[0:10]
test_data

'GATCAATGAG'

# Test conversion

In [533]:
sorted_table = get_sorted_probs(test_data)
gol = Golomb(5, sorted_table)

indicies = gol.from_data_numbers(test_data, sorted_table)
current_data = gol.from_numbers_data(indicies, sorted_table)

print(current_data)
print(current_data == test_data)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.0
GATCAATGAG
True


In [527]:
sorted_table = get_sorted_probs(test_data)
gol = Golomb(5, sorted_table)

print(sorted_table)
indicies = gol.from_data_numbers(test_data, sorted_table)

print(test_data)
print(indicies)
# encoded_data, time_for_encoding = gol.encode_stream(test_data)
# decoded_data = gol.decode_stream(encoded_data)
# print(decoded_data)
# predicted = gol.decode_from_indicies(decoded_data, gol.keys)
# print(test_data, predicted)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.0
{'A': 0.4, 'G': 0.3, 'T': 0.2, 'C': 0.1}
GATCAATGAG
[1, 0, 2, 3, 0, 0, 2, 1, 0, 1]


In [518]:
for M in range(2, 20):
    gol = Golomb(M, p)
    encoded_data, time_for_encoding = gol.encode_stream(test_data)
    decoded_data = gol.decode_stream(encoded_data)
    print(decoded_data)
    predicted = gol.decode_from_indicies(decoded_data, gol.keys)
    print(test_data, predicted)

[2, -1, 0, 1, -1, -1, 0, 2, -1, 2]
GATCAATGAG CBATBBACBC
[2, 3, 1, -1, 3, 2, 4]
GATCAATGAG CGTBGCN
[1, -2, -1, 0, -2, -2, -1, 1, -2, 1]
GATCAATGAG TDBADDBTDT
[3, -2, 8, -2, 10, 2, 13]
GATCAATGAG GDKDSCH
[2, -2, 9, 4, 12, 0, 14]
GATCAATGAG CDWNVAD
[1, -2, 6, 3, 5, 7, 12, -3]
GATCAATGAG TDRG
YVH
[0, -3, -2, -1, -3, -3, -2, 0, -3, 0]
GATCAATGAG AHDBHHDAHA
[3, 26, 9, 6, 25, -3, 10]


IndexError: list index out of range