# Poznamky

- Singulární - jednoznačné
- Nesingulární - více výsledků
- Prefixové, okamžité kódy - kodová slova se sama oddělují

Ne všechny kódy jsme schopní dekodovat v jasném místě.
Není potřeba použít binárku, stačí text.

# Zadání

## 2.3. Variable-Length Codes (10 points)

- Implement Golomb coding/decoding algorithm.(7 pts)
- You should encode symbols with high probability by shorter codes. First compute frequency of symbols, sort them by probability and assign them a Golomb-code.
- You don't have to store bits in binary, it is sufficient to store them as textual values.
- Find the optimal M for the selected file.(2 pts)
- Compare your results with the entropy and compute redundancy of the code R = C - H, where C is an average number of bits per byte used in the compressed file for the original data: 8*|c(m)|/|m|(1 pt)

In [31]:
import os
import sys

def adding_module_path():
    module_path = os.path.abspath(os.path.sep.join([".."]*2))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

In [32]:
from src.load_data import get_dataset
from src.load_data import DataSets
import numpy as np
import time
import pandas as pd

In [33]:
data = get_dataset(DataSets.dna)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [34]:
def calc_freq(data):
    tic = time.time()
    print('Calculating freqs')
    res = {}
    for c in data:
        res[c] = res.get(c, 0) + 1
    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [35]:
def calc_probs(freq):
    res = {}    
    tic = time.time()
    print('Calculating probs')
    n = np.sum(list(freq.values()))
    for k, v in freq.items():
        res[k] = v / n

    toc = time.time()
    print(f'End of calculating {toc - tic}')
    return res

In [36]:
def get_probs(data):
    freq = calc_freq(data)
    return calc_probs(freq)

In [37]:
p = get_probs(data)

Calculating freqs
End of calculating 8.862469673156738
Calculating probs
End of calculating 0.0


In [38]:
def get_sorted_probs(data):
    p = get_probs(data)
    return dict(sorted(p.items(), key=lambda item: item[1], reverse=True))

In [39]:
pd.DataFrame.from_dict(get_sorted_probs(data), orient='index')

Calculating freqs
End of calculating 9.398788213729858
Calculating probs
End of calculating 0.0010020732879638672


Unnamed: 0,0
A,0.2899787
T,0.289728
C,0.2102979
G,0.2099745
N,8.907318e-06
\n,6.408691e-06
R,1.296997e-06
Y,1.087189e-06
K,8.773804e-07
W,8.583069e-07


In [40]:
import math
import time

In [45]:
class Golomb:
    def __init__(self, M, prob_table, verbose=False):
        self.M = M
        self.prob_table = prob_table
        self.verbose = verbose

    def from_numbers_data(self, numbers, prob_table):
        keys = list(prob_table.keys())
        res = ""
        try:
            for number in numbers:
                res += keys[number]
            return res
        except Exception as e:
            print(f"Error occurred {str(e)}")

    def from_data_numbers(self, data, prob_table):
        keys = list(prob_table.keys())
        return [keys.index(c) for c in data]

    def encode_number(self, number, size_of_alphabet=0):
        q = number // self.M
        r = number % self.M
        
        k = math.ceil(math.log2(self.M))
        quo_part = self.unary(q)

        c = 2**(k) - self.M

        truncated_part = None
        if r < c:
            b = c - 1
            a = "{0:0" + str(b) + "b}"
            truncated_part = a.format(r)
        else:
            b = c
            a = "{0:0" + str(b) + "b}"
            truncated_part = a.format(r+c)

        return "".join([quo_part, truncated_part])

    def unary(self, n):
        return n * '0' + '1'

    def encode_with_table(self, c, prop_table):
        keys = list(prop_table.keys())
        char_index = keys.index(c)
        return self.encode_number(char_index, len(keys))

    def encode_indicies(self, indicies):
        encoded_string = ""
        size = len(np.unique(indicies))
        for index in indicies:
            encoded_number = self.encode_number(index, size)
            if self.verbose:
                print(encoded_number)
            encoded_string += encoded_number
        return encoded_string

    def encode_stream(self, stream):
        tic = time.time()
        encoded_string = ""
        for c in stream:
            enc = self.encode_with_table(c, self.prob_table)
            encoded_string += enc
        toc = time.time()
        return encoded_string, toc - tic

    def decode_number(self, q, M, r):
        return q * M + r

    def decode_stream(self, stream):
        res = []

        i = 0

        current_quotient = ""

        k = math.ceil(math.log2(self.M))

        
        move = k - 1


        try:
            while i < len(stream) - move:
                current_char = stream[i]
                if current_char == '0':
                    current_quotient += current_char
                    i += 1

                else:
                    current_quotient += '1'
                    next_bits = stream[(i+1):(i+1+move)]
                    dec = int(next_bits, 2)
                    q = current_quotient.count('0')

                    print(dec, next_bits, 2**(k - 1), k)
                    remainder = 0

                    if dec <= 2**(k - 2):
                        if self.verbose:
                            print('1.', current_quotient + next_bits)
                        remainder = int(next_bits, 2)
                        i += k
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
                    else:
                        bits = stream[(i+1):(i+1+k)]
                        if self.verbose:
                            print('2.', current_quotient + bits)
                        remainder = int(bits, 2)
                        remainder = remainder - k
                        i += k + 1
                        res.append(self.decode_number(q, self.M, remainder))
                        current_quotient = ""
                        continue
                #print(i)
        except Exception as e:
            print("Error occurred")
            print(f"{str(e)}")
            return None

        return res

    def decode_from_indicies(self, ints, keys):
        res = ""
        for index in ints:
            res += keys[index]
        return res

In [46]:
test_data = data[0:10]
test_data

'GATCAATGAG'

# Tests

### Test conversion

In [47]:
sorted_table = get_sorted_probs(test_data)
gol = Golomb(10, sorted_table)

indicies = gol.from_data_numbers(test_data, sorted_table)
current_data = gol.from_numbers_data(indicies, sorted_table)

print(current_data)
print(current_data == test_data)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.0
GATCAATGAG
True


### Test of encoding

In [55]:
sorted_table = get_sorted_probs(test_data)
print(sorted_table)
gol = Golomb(8, sorted_table, True)


indicies = gol.from_data_numbers(test_data, sorted_table)
encoded_string = gol.encode_indicies(indicies)

print(encoded_string)

decoded_numbers = gol.decode_stream(encoded_string)

decoded_data = gol.from_numbers_data(decoded_numbers, sorted_table)

print(decoded_numbers, decoded_data)
print(indicies, test_data)

print(decoded_data == test_data)

Calculating freqs
End of calculating 0.0
Calculating probs
End of calculating 0.0
{'A': 0.4, 'G': 0.3, 'T': 0.2, 'C': 0.1}
11
10
110
111
10
10
110
11
10
11
11101101111010110111011
3 11 4 3
2. 1110
2 10 4 3
1. 110
3 11 4 3
2. 1111
1 01 4 3
1. 0101
1 01 4 3
1. 101
2 10 4 3
1. 110
Error occurred list index out of range
[3, 2, 4, 9, 1, 2] None
[1, 0, 2, 3, 0, 0, 2, 1, 0, 1] GATCAATGAG
False


# Test of specified number

In [None]:
gol = Golomb(5, sorted_table, True)

In [None]:
gol.decode_stream("001110")

https://homel.vsb.cz/~vas218/acs.html