# 23.3. Dictionary coding I - 10 points

- Compress and decompress files from Lab 1. using LZ77 or LZSS (7pts)
- Try different sizes of the sliding window: 4kB, 16kB, 32kB and lengths of the uncompressed part. (1pt)
- Try to compute zero order entropy of each field in a token and use it to compute approximate final compressed file size. (1pt)
- Compute the length of the output file and prepare a report summarizing the results. (1pt)

## Compress and decompress files from Lab 1. using LZ77 or LZSS (7pts)

In [1]:
import os
import sys

def adding_module_path():
    """Make the directory two levels above the working directory importable.

    The absolute form of ``../..`` is appended to ``sys.path`` unless it is
    already listed, so calling this repeatedly never adds duplicates.
    """
    parent_levels = [".."] * 2
    module_path = os.path.abspath(os.path.sep.join(parent_levels))

    if module_path in sys.path:
        return
    sys.path.append(module_path)


adding_module_path()

In [2]:
from src.load_data import get_dataset
from src.load_data import DataSets
from src.get_probs import get_sorted_probs_as_df
import numpy as np
import pandas as pd
import time
from src.types.table_types import Fields
from src.save import save_both
from enum import Enum
import re
from ast import literal_eval


In [3]:
class LZType(Enum):
    """Identifiers of the dictionary coders implemented in this notebook.

    Each member's value is the same string as its name.
    """
    # Triplet-based coder: every token is (pointer, length, next symbol).
    LZ77 = "LZ77"
    # Flag-based coder: every token is either a literal or a (pointer, length) match.
    LZSS = "LZSS"

In [4]:
data_dna, path_dna = get_dataset(DataSets.dna)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [5]:
test_text = "abbabbabbbaab"

### LZ77

In [6]:
class LZ77:
    """Didactic LZ77 coder producing (pointer, length, next symbol) triplets.

    The whole already-encoded text acts as the search buffer and the whole
    remaining text as the lookahead buffer; callers that want a bounded
    window are expected to split the input themselves before encoding.

    ``pointer`` is the distance, counted backwards from the end of the
    encoded text, at which the copy starts. ``length`` may exceed ``pointer``,
    in which case the copied symbols repeat periodically (overlapping copy).
    """

    def __init__(self):
        pass

    def encode(self, text):
        """Encode ``text`` into a list of token dictionaries.

        Parameters
        ----------
        text : str
            Text to compress. An empty string yields an empty list.

        Returns
        -------
        list of dict
            One dictionary per token with the keys ``"Processed"`` and
            ``"Unprocessed"`` (snapshots of both buffers after the token) and
            ``"P"``, ``"L"``, ``"N"`` (pointer, match length, next symbol).
        """
        items = []
        processed = ""
        unprocessed = text

        while unprocessed:
            pointer, match_length, match_text = self.find_longest_match(processed, unprocessed)

            # Every triplet has to carry an explicit next symbol, so a match
            # may cover at most all but the last remaining symbol. A prefix of
            # a valid match is still a valid match at the same pointer.
            if match_length >= len(unprocessed):
                match_length = len(unprocessed) - 1
                match_text = match_text[:match_length]
                if match_length == 0:
                    pointer = 0

            next_symbol = unprocessed[match_length]
            processed += match_text + next_symbol
            unprocessed = unprocessed[match_length + 1:]

            items.append({
                "Processed": processed,
                "Unprocessed": unprocessed,
                "P": pointer,
                "L": match_length,
                "N": next_symbol
            })

        return items

    def find_longest_match(self, text_processed, text_unprocessed):
        """Find the longest prefix of the lookahead that can be copied from history.

        For every prefix length the last non-overlapping occurrence of that
        prefix in ``text_processed`` is examined. When that occurrence ends
        exactly at the end of the history, the match is extended periodically
        into the lookahead for as long as the symbols keep agreeing.

        Parameters
        ----------
        text_processed : str or None
            Already encoded text (search buffer). ``None`` or ``""`` means
            there is nothing to match against.
        text_unprocessed : str
            Text that still has to be encoded (lookahead buffer).

        Returns
        -------
        tuple
            ``(pointer, match_length, match_text)``; ``(0, 0, "")`` when no
            match exists. ``match_length`` never exceeds
            ``len(text_unprocessed)``.
        """
        # start of match, counted backwards from the end of the history
        pointer = 0

        # match length
        match_length = 0

        # matched text, including any replicated part
        match_text = ""

        if not text_processed or not text_unprocessed:
            return pointer, match_length, match_text

        history_size = len(text_processed)
        lookahead_size = len(text_unprocessed)

        for prefix_size in range(1, min(history_size, lookahead_size) + 1):
            prefix = text_unprocessed[:prefix_size]

            # re.escape: the prefix is literal text, not a regular expression.
            # finditer yields matches left to right, so the last one yielded
            # is the one with the greatest start position.
            last_match = None
            for last_match in re.finditer(re.escape(prefix), text_processed):
                pass

            if last_match is None:
                # A longer prefix cannot occur when this shorter one does not.
                break

            start, end = last_match.span()
            candidate_length = prefix_size

            if end == history_size:
                # The occurrence touches the end of the history: try to
                # replicate it, never reading past the end of the lookahead.
                while (candidate_length < lookahead_size
                       and text_unprocessed[candidate_length] == prefix[candidate_length % prefix_size]):
                    candidate_length += 1

            # Strict comparison keeps the earliest-found match on ties.
            if candidate_length > match_length:
                match_length = candidate_length
                pointer = history_size - start
                match_text = text_unprocessed[:candidate_length]

        return pointer, match_length, match_text

    def create_from_triplets_code(self, triplets):
        """Serialize token dictionaries into ``"(P, L, 'N');(P, L, 'N');..."``."""
        return ";".join(str((x["P"], x["L"], x["N"])) for x in triplets)

    def create_from_code_triplets(self, code):
        """Split a serialized code back into one string per triplet.

        Only a ``;`` standing between ``)`` and ``(`` is treated as a
        separator, so a literal ``;`` symbol inside a triplet is preserved.
        An empty code yields an empty list.
        """
        if not code:
            return []
        return re.split(r"(?<=\));(?=\()", code)

    def tiplets_to_table(self, triplets):
        """Return the token dictionaries as a DataFrame, one row per token."""
        return pd.DataFrame(triplets)

    def decode(self, triplets):
        """Rebuild the original text from a sequence of triplets.

        Parameters
        ----------
        triplets : iterable
            Triplets either as strings produced by
            ``create_from_code_triplets`` or as ``(pointer, length, symbol)``
            tuples.

        Returns
        -------
        str
            The decoded text.

        Raises
        ------
        ValueError
            If a triplet asks for a copy that starts outside the text decoded
            so far.
        """
        output = []

        for triplet in triplets:
            if isinstance(triplet, str):
                triplet = literal_eval(triplet)
            pointer, sequence_length, next_symbol = triplet

            if sequence_length > 0:
                if not 0 < pointer <= len(output):
                    raise ValueError(f"Invalid pointer {pointer} for {len(output)} decoded symbols")

                # Copy symbol by symbol so that an overlapping copy
                # (length > pointer) re-reads symbols it has just written.
                start = len(output) - pointer
                for offset in range(sequence_length):
                    output.append(output[start + offset])

            output.append(next_symbol)

        return "".join(output)

In [7]:
instance = LZ77()

In [8]:
instance.find_longest_match("abba", "bbabbbaab")

(3, 5, 'bbabb')

In [9]:
triplets = instance.encode(test_text)

In [10]:
instance.tiplets_to_table(triplets)

Unnamed: 0,Processed,Unprocessed,P,L,N
0,a,bbabbabbbaab,0,0,a
1,ab,babbabbbaab,0,0,b
2,abba,bbabbbaab,1,1,a
3,abbabbabbb,aab,3,5,b
4,abbabbabbbaa,b,4,1,a
5,abbabbabbbaab,,0,0,b


In [11]:
code = instance.create_from_triplets_code(triplets)

In [12]:
code

"(0, 0, 'a');(0, 0, 'b');(1, 1, 'a');(3, 5, 'b');(4, 1, 'a');(0, 0, 'b')"

In [13]:
triplets = instance.create_from_code_triplets(code)

In [14]:
decoded = instance.decode(triplets)

#### Test if works correctly

In [15]:
decoded == test_text

True

### LZSS

In [16]:
test_text = "abbabbabbbaab"

In [17]:
class LZSS:
    """Didactic LZSS coder producing flagged literal and match tokens.

    A token is either ``(0, symbol)`` for a literal or
    ``(1, pointer, length)`` for a copy. The whole already-encoded text acts
    as the search buffer and the whole remaining text as the lookahead
    buffer; callers that want a bounded window are expected to split the
    input themselves before encoding.

    ``pointer`` is the distance, counted backwards from the end of the
    encoded text, at which the copy starts. ``length`` may exceed ``pointer``,
    in which case the copied symbols repeat periodically (overlapping copy).
    """

    def __init__(self):
        pass

    def encode(self, text):
        """Encode ``text`` into a list of token dictionaries.

        Parameters
        ----------
        text : str
            Text to compress. An empty string yields an empty list.

        Returns
        -------
        list of dict
            One dictionary per token with the keys ``"Processed"`` and
            ``"Unprocessed"`` (snapshots of both buffers after the token) and
            ``"Output"`` (the token tuple itself).
        """
        items = []
        processed = ""
        unprocessed = text

        while unprocessed:
            if len(unprocessed) == 1:
                # A single trailing symbol is always emitted as a literal.
                items.append({
                    "Processed": processed + unprocessed,
                    "Unprocessed": "",
                    "Output": (0, unprocessed)
                })
                break

            pointer, match_length, match_text = self.find_longest_match(processed, unprocessed)

            if match_length == 0:
                # Nothing to copy: emit the next symbol as a literal.
                next_symbol = unprocessed[0]
                processed += next_symbol
                unprocessed = unprocessed[1:]
                output = (0, next_symbol)
            else:
                processed += match_text
                unprocessed = unprocessed[match_length:]
                output = (1, pointer, match_length)

            items.append({
                "Processed": processed,
                "Unprocessed": unprocessed,
                "Output": output
            })

        return items

    def find_longest_match(self, text_processed, text_unprocessed):
        """Find the longest prefix of the lookahead that can be copied from history.

        For every prefix length the last non-overlapping occurrence of that
        prefix in ``text_processed`` is examined. When that occurrence ends
        exactly at the end of the history, the match is extended periodically
        into the lookahead for as long as the symbols keep agreeing.

        Parameters
        ----------
        text_processed : str or None
            Already encoded text (search buffer). ``None`` or ``""`` means
            there is nothing to match against.
        text_unprocessed : str
            Text that still has to be encoded (lookahead buffer).

        Returns
        -------
        tuple
            ``(pointer, match_length, match_text)``; ``(0, 0, "")`` when no
            match exists. ``match_length`` never exceeds
            ``len(text_unprocessed)``.
        """
        # start of match, counted backwards from the end of the history
        pointer = 0

        # match length
        match_length = 0

        # matched text, including any replicated part
        match_text = ""

        if not text_processed or not text_unprocessed:
            return pointer, match_length, match_text

        history_size = len(text_processed)
        lookahead_size = len(text_unprocessed)

        for prefix_size in range(1, min(history_size, lookahead_size) + 1):
            prefix = text_unprocessed[:prefix_size]

            # re.escape: the prefix is literal text, not a regular expression.
            # finditer yields matches left to right, so the last one yielded
            # is the one with the greatest start position.
            last_match = None
            for last_match in re.finditer(re.escape(prefix), text_processed):
                pass

            if last_match is None:
                # A longer prefix cannot occur when this shorter one does not.
                break

            start, end = last_match.span()
            candidate_length = prefix_size

            if end == history_size:
                # The occurrence touches the end of the history: try to
                # replicate it, never reading past the end of the lookahead.
                while (candidate_length < lookahead_size
                       and text_unprocessed[candidate_length] == prefix[candidate_length % prefix_size]):
                    candidate_length += 1

            # Strict comparison keeps the earliest-found match on ties.
            if candidate_length > match_length:
                match_length = candidate_length
                pointer = history_size - start
                match_text = text_unprocessed[:candidate_length]

        return pointer, match_length, match_text

    def create_from_triplets_code(self, triplets):
        """Serialize token dictionaries into ``"(0, 'a');(1, 3, 6);..."``."""
        return ";".join(str((x["Output"])) for x in triplets)

    def create_from_code_triplets(self, code):
        """Split a serialized code back into one string per token.

        Only a ``;`` standing between ``)`` and ``(`` is treated as a
        separator, so a literal ``;`` symbol inside a token is preserved.
        An empty code yields an empty list.
        """
        if not code:
            return []
        return re.split(r"(?<=\));(?=\()", code)

    def tiplets_to_table(self, triplets):
        """Return the token dictionaries as a DataFrame, one row per token."""
        return pd.DataFrame(triplets)

    def decode(self, triplets):
        """Rebuild the original text from a sequence of tokens.

        Parameters
        ----------
        triplets : iterable
            Tokens either as strings produced by ``create_from_code_triplets``
            or as ``(0, symbol)`` / ``(1, pointer, length)`` tuples.

        Returns
        -------
        str
            The decoded text.

        Raises
        ------
        ValueError
            If a match token asks for a copy that starts outside the text
            decoded so far.
        """
        output = []

        for token in triplets:
            if isinstance(token, str):
                token = literal_eval(token)

            if token[0] == 0:
                _, symbol = token
                output.append(symbol)
                continue

            _, pointer, sequence_length = token
            if sequence_length > 0:
                if not 0 < pointer <= len(output):
                    raise ValueError(f"Invalid pointer {pointer} for {len(output)} decoded symbols")

                # Copy symbol by symbol so that an overlapping copy
                # (length > pointer) re-reads symbols it has just written.
                start = len(output) - pointer
                for offset in range(sequence_length):
                    output.append(output[start + offset])

        return "".join(output)

In [18]:
instance = LZSS()

In [19]:
instance.find_longest_match("abbabbabbbba", "ab")

(6, 2, 'ab')

In [20]:
triplets = instance.encode(test_text)

In [21]:
# Note: the presentation has an error in its last row :((

instance.tiplets_to_table(triplets)

Unnamed: 0,Processed,Unprocessed,Output
0,a,bbabbabbbaab,"(0, a)"
1,ab,babbabbbaab,"(0, b)"
2,abb,abbabbbaab,"(1, 1, 1)"
3,abbabbabb,baab,"(1, 3, 6)"
4,abbabbabbba,ab,"(1, 4, 2)"
5,abbabbabbbaab,,"(1, 5, 2)"


In [22]:
code = instance.create_from_triplets_code(triplets)

In [23]:
code

"(0, 'a');(0, 'b');(1, 1, 1);(1, 3, 6);(1, 4, 2);(1, 5, 2)"

In [24]:
triplets = instance.create_from_code_triplets(code)

In [25]:
decoded = instance.decode(triplets)

#### Test if works correctly

In [26]:
decoded == test_text

True

## Data to use

In [34]:
import random

In [27]:
data_dna, path_dna = get_dataset(DataSets.dna)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [38]:
def pick_size(data, k, metric=10**6):
    """Return a random contiguous sample of ``k * metric`` symbols from ``data``.

    The sample is one slice of the input, so repeated substrings of the
    source survive in the sample. Drawing every symbol independently would
    remove exactly the redundancy that dictionary coders exploit and make
    the benchmark meaningless.

    Parameters
    ----------
    data : str or sequence of str
        Source text.
    k : int
        Number of ``metric``-sized units to take.
    metric : int, optional
        Size of one unit in symbols; the default corresponds to 10**6 symbols.

    Returns
    -------
    str
        A slice of ``data`` with ``k * metric`` symbols. When ``data`` is not
        longer than that, the whole of ``data`` is returned; a non-positive
        size yields ``""``.
    """
    sample_size = k * metric
    if sample_size <= 0:
        return ""

    last_start = len(data) - sample_size
    if last_start <= 0:
        return "".join(data)

    # randrange's upper bound is exclusive, hence the + 1.
    start = random.randrange(last_start + 1)
    return "".join(data[start:start + sample_size])

In [40]:
len(data_dna) / (10**6)

52.4288

In [41]:
data_english, path_english = get_dataset(DataSets.english)

Loading C://Users//proko//Desktop//University//iv//aks//datasets\english\english.50MB


# Zero order entropy

In [42]:
import math
from collections import Counter

def calc_freq(content):
    """Count how many times each symbol occurs in ``content``.

    Returns a ``Counter`` mapping every distinct symbol to its frequency.
    """
    symbols = list(content)
    return Counter(symbols)

def calc_p(counter, n):
    """Turn symbol counts into relative frequencies.

    Every count in ``counter`` is divided by ``n``; the result is a plain
    dictionary with the same keys in the same order.
    """
    return {symbol: count / n for symbol, count in dict(counter).items()}

def get_n(counter):
    """Return the total number of counted symbols (sum of all counts)."""
    counts = [count for count in dict(counter).values()]
    return np.sum(counts)

def calc_H(p):
    """Compute the Shannon entropy, in bits, of a probability table.

    ``p`` maps symbols to probabilities; the result is
    ``-sum(p_i * log2(p_i))`` accumulated in the table's iteration order.
    """
    weighted_logs = 0
    for probability in p.values():
        # Shannon equation: each symbol contributes p * log2(p).
        weighted_logs += probability * math.log2(probability)
    return -weighted_logs

def zero_order_entropy(data):
    """Return the zero-order entropy of ``data`` in bits per symbol.

    Symbol frequencies are counted, normalised by the total number of
    symbols, and fed into the Shannon entropy formula.
    """
    frequencies = calc_freq(data)
    probabilities = calc_p(frequencies, get_n(frequencies))
    return calc_H(probabilities)

## Try different sizes of the sliding window: 4kB, 16kB, 32kB and lengths of the uncompressed part. (1pt)

In [43]:
# Block sizes to benchmark, expressed in kB.
defined_sizes = [4, 16, 32]

# One coder instance per algorithm, keyed by the label printed during the run.
instances = dict(LZ77=LZ77(), LZSS=LZSS())

# Each entry is (data, path, dataset identifier); the English corpus is
# currently switched off.
datasets = [
    #(*get_dataset(DataSets.english), DataSets.english),
    (*get_dataset(DataSets.dna), DataSets.dna),
]

Loading C://Users//proko//Desktop//University//iv//aks//datasets\dna\dna.50MB


In [44]:
def create_windows(data, size, metric=10**3):
    """Cut ``data`` into consecutive, non-overlapping blocks.

    Every block holds ``size * metric`` symbols except possibly the last one,
    which holds whatever remains. Empty input yields an empty list.
    """
    block_length = size * metric
    return [
        data[start:start + block_length]
        for start in range(0, len(data), block_length)
    ]

In [45]:
windows = create_windows(data_dna, 4)
windows[0:3]

['GATCAATGAGGTGGACACCAGAGGCGGGGACTTGTAAATAACACTGGGCTGTAGGAGTGATGGGGTTCACCTCTAATTCTAAGATGGCTAGATAATGCATCTTTCAGGGTTGTGCTTCTATCTAGAAGGTAGAGCTGTGGTCGTTCAATAAAAGTCCTCAAGAGGTTGGTTAATACGCATGTTTAATAGTACAGTATGGTGACTATAGTCAACAATAATTTATTGTACATTTTTAAATAGCTAGAAGAAAAGCATTGGGAAGTTTCCAACATGAAGAAAAGATAAATGGTCAAGGGAATGGATATCCTAATTACCCTGATTTGATCATTATGCATTATATACATGAATCAAAATATCACACATACCTTCAAACTATGTACAAATATTATATACCAATAAAAAATCATCATCATCATCTCCATCATCACCACCCTCCTCCTCATCACCACCAGCATCACCACCATCATCACCACCACCATCATCACCACCACCACTGCCATCATCATCACCACCACTGTGCCATCATCATCACCACCACTGTCATTATCACCACCACCATCATCACCAACACCACTGCCATCGTCATCACCACCACTGTCATTATCACCACCACCATCACCAACATCACCACCACCATTATCACCACCATCAACACCACCACCCCCATCATCATCATCACTACTACCATCATTACCAGCACCACCACCACTATCACCACCACCACCACAATCACCATCACCACTATCATCAACATCATCACTACCACCATCACCAACACCACCATCATTATCACCACCACCACCATCACCAACATCACCACCATCATCATCACCACCATCACCAAGACCATCATCATCACCATCACCACCAACATCACCACCATCACCAACACCACCATCACCACCACCACCACCATCATCACCACCACCACCATCATCATCACCACCACCGCCATCATCATCGCCACCACCATGACCACCACCATCACAACCATCACC

In [53]:
def run(instance, size, data, path_data, name_data):
    """Encode ``data`` block by block and summarise the outcome in one record.

    Parameters
    ----------
    instance : LZ77 or LZSS
        Coder providing ``encode`` and ``tiplets_to_table``.
    size : int
        Block size in kB, forwarded to ``create_windows``.
    data : str
        Text to compress.
    path_data
        Path of the dataset, stored in the record as is.
    name_data
        Dataset identifier, stored in the record as is.

    Returns
    -------
    dict
        The descriptive fields (path, parameter, coder type, original size in
        bits, encoding time in seconds, dataset name) merged with the
        statistics returned by ``compute_aprox_size``.
    """
    instance_type = type(instance).__name__
    windows = create_windows(data, size)
    print(f"Number of windows {len(windows)}")
    n = len(data)

    # Collect the per-block tables and concatenate once afterwards; growing a
    # DataFrame inside the loop copies all earlier rows on every iteration.
    tables = []

    tic = time.perf_counter()
    for window_index, window in enumerate(windows):

        print(f'Window index {window_index}')

        current_triplets = instance.encode(window)
        tables.append(instance.tiplets_to_table(current_triplets))
    toc = time.perf_counter()

    # pd.concat refuses an empty list, so fall back to an empty frame.
    all_tables = pd.concat(tables) if tables else pd.DataFrame()

    dic = compute_aprox_size(all_tables, instance_type)

    des = {
        Fields.Path.value: path_data,
        Fields.Parameter.value: f"{size}kB",
        Fields.Type.value: instance_type,
        Fields.Original.value: n * 8,
        # Placeholder that fixes the column position; the real estimate
        # coming from ``dic`` overwrites it in the merge below.
        Fields.Aprox.value: 0,
        Fields.Calculation.value: toc - tic,
        Fields.Name.value: name_data
    }

    return {
        **des,
        **dic,
    }


## Try to compute zero order entropy of each field in a token and use it to compute approximate final compressed file size. (1pt)

In [54]:
def compute_aprox_size(table, current_type):
    """Estimate the compressed size from the zero-order entropy of each token field.

    For ``current_type == "LZ77"`` the fields are the ``P``, ``L`` and ``N``
    columns of ``table``. For any other type the ``Output`` column is split
    into flags, literal symbols, pointers and lengths: two-element tokens are
    literals, all others are unpacked as ``(flag, pointer, length)``.

    Returns a dictionary holding, per field, its entropy, its number of
    values and their product, plus the sum of the products under
    ``Fields.Aprox.value``.
    """
    if current_type == "LZ77":
        # (entropy-key prefix, size-key suffix, values of the field)
        streams = [
            ("p", "P", table.P.values),
            ("l", "L", table.L.values),
            ("n", "N", table.N.values),
        ]
    else:
        flags, nexts, pointers, lengths = [], [], [], []

        for token in table.Output.values:
            if len(token) == 2:
                flag, symbol = token
                nexts.append(symbol)
            else:
                flag, pointer, length = token
                pointers.append(pointer)
                lengths.append(length)
            flags.append(flag)

        streams = [
            ("flags", "Flags", flags),
            ("nexts", "Nexts", nexts),
            ("pointers", "Pointers", pointers),
            ("lengths", "Lengths", lengths),
        ]

    entropies = {
        f"{prefix}_H": zero_order_entropy(values)
        for prefix, _, values in streams
    }
    sizes = {
        f"size_{suffix}": len(values)
        for _, suffix, values in streams
    }
    estimates = {
        f"aprox_size_{suffix}": entropies[f"{prefix}_H"] * sizes[f"size_{suffix}"]
        for prefix, suffix, _ in streams
    }

    # Add the per-field estimates strictly left to right.
    parts = list(estimates.values())
    total = parts[0]
    for part in parts[1:]:
        total = total + part

    return {
        **entropies,
        **sizes,
        **estimates,
        Fields.Aprox.value: total
    }


In [55]:
instance = LZ77()
triplets = instance.encode(test_text)
table = instance.tiplets_to_table(triplets)
compute_aprox_size(table, type(instance).__name__)

{'p_H': 1.7924812503605778,
 'l_H': 1.4591479170272448,
 'n_H': 1.0,
 'size_P': 6,
 'size_L': 6,
 'size_N': 6,
 'aprox_size_P': 10.754887502163466,
 'aprox_size_L': 8.754887502163468,
 'aprox_size_N': 6.0,
 'Aprox': 25.509775004326933}

In [56]:
instance = LZSS()
triplets = instance.encode(test_text)
table = instance.tiplets_to_table(triplets)
compute_aprox_size(table, type(instance).__name__)

{'flags_H': 0.9182958340544896,
 'nexts_H': 1.0,
 'pointers_H': 2.0,
 'lengths_H': 1.5,
 'size_Flags': 6,
 'size_Nexts': 2,
 'size_Pointers': 4,
 'size_Lengths': 4,
 'aprox_size_Flags': 5.509775004326937,
 'aprox_size_Nexts': 2.0,
 'aprox_size_Pointers': 8.0,
 'aprox_size_Lengths': 6.0,
 'Aprox': 21.509775004326936}

In [57]:
# Sample size handed to pick_size by run_all; with pick_size's default
# metric of 10**6 a value of 1 means 10**6 symbols per dataset.
NORMALIZATION_MB = 1

In [58]:
def run_all(instances, sizes, datasets):
    """Benchmark every coder with every block size on every dataset.

    Each dataset is first reduced with ``pick_size``; the reduced sample is
    then shared by all coder/size combinations of that dataset.

    Returns a dictionary that maps a running index (starting at 0) to the
    record produced by ``run`` for one combination.
    """
    outcomes = []
    for data, path_data, name_data in datasets:
        sample = pick_size(data, NORMALIZATION_MB)
        for label, coder in instances.items():
            for size in sizes:
                print(name_data, label, size)
                outcomes.append(run(coder, size, sample, path_data, name_data))

    return dict(enumerate(outcomes))

In [59]:
result = run_all(instances, defined_sizes, datasets)

DataSets.dna LZ77 4
Number of windows 250
Window index 0


KeyboardInterrupt: 

In [None]:
result_dataframe = pd.DataFrame.from_dict(result, orient="index")

## Compute the length of the output file and prepare a report summarizing the results. (1pt)

### Graf odsud


### Uložení do souboru pro načtení v results

In [None]:
save_both(result_dataframe)