In [1]:
from utils import transform

import itertools
from math import log2, ceil

def bin(x, n=0):
#     res = '{:0' + str(n) + 'b}'
    return "{:b}".format(x).zfill(n)
#     return res.format(x)

def int2elias(x):
    L = len(bin(x))
    M = len(bin(L))
    return '0' * (M-1) + bin(L) + bin(x)[1:]

def elias2int(c):
    M = c.index('1')
    L = int(c[M:2*M+1] or '0', 2)
    x = c[2*M+1:2*M+L]
    assert 2*M+L <= len(c)
    return int(x or '0', 2) + 2 ** (L-1)

def elias2list(c):
    while c:
        M = c.index('1')
        L = int(c[M:2*M+1] or '0', 2)
        x = c[2*M+1:2*M+L]
        assert 2*M+L <= len(c)
        c = c[2*M+L:]
        yield int(x or '0', 2) + 2 ** (L-1)

In [2]:
codes = [
    (1, '1'),
    (2, '0100'),
    (3, '0101'),
    (4, '01100'),
    (5, '01101'),
    (6, '01110'),
    (7, '01111'),
    (8, '00100000'),
    (9, '00100001'),
    (10, '00100010'),
    (11, '00100011'),
    (12, '00100100'),
    (13, '00100101'),
    (14, '00100110'),
]
for x, c in codes:
    assert int2elias(x) == c
    assert elias2int(c) == x, "{} -> {}".format(c, x)

In [3]:
for x in range(10, 1000000, 50):
    assert x == elias2int(int2elias(x)), "{} {}".format(x, elias2int(int2elias(x)))

In [4]:
with open('gore.txt') as f:
    doc = ''.join([transform(line) for line in f])

In [5]:
alphabet = set(doc)

In [6]:
def split(line, n):
    return [line[i:i+n] for i in range(0, len(line), n)]

In [7]:
class RyabkoElias:
    def __init__(self, k, alphabet):
        self.k = k
        self.alphabet = alphabet

    @property
    def table(self):
        return [''.join(i) for i in itertools.product(alphabet, repeat=self.k)]
    
    def encode(self, seq):
        if len(seq) % self.k:
            seq += ' ' * (self.k - len(seq) % self.k)
        table = self.table
        result = ''
        for chunk in split(seq, self.k):
            index = table.index(chunk)
            v = table.pop(index)
            table.insert(0, v)
            result += int2elias(index)
            
        return result
    
    def decode(self, seq):
        table = self.table
        result = ''
        for i in elias2list(seq):
            chunk = table.pop(i)
            table.insert(0, chunk)
            result += chunk
        return result

In [8]:
lengths = []
for k in range(2, 5):
    print(k)
    re = RyabkoElias(k, alphabet)
    e = re.encode(doc[:5000])
    lengths.append(len(e))

2
3
4


In [9]:
lengths

[28594, 28159, 30412]

In [10]:
re = RyabkoElias(3, alphabet)

In [11]:
len(re.encode(doc))

486903

[rossetacode](https://rosettacode.org/wiki/LZW_compression#Python)

In [12]:
from io import StringIO

In [13]:
class LZW:
    def __init__(self, alphabet):
        self.alphabet = alphabet
        self.n = n = ceil(log2(len(alphabet))) 
        
    def compress(self, uncompressed):
        """Compress a string to a list of output symbols."""

        # Build the dictionary.
        dictionary = {c: bin(i, self.n) for i, c in enumerate(self.alphabet)}
        dict_size = len(dictionary)

        w = ""
        result = []
        for c in uncompressed:
            wc = w + c
            if wc in dictionary:
                w = wc
            else:
                result.append(dictionary[w])
                # Add wc to the dictionary.
                dictionary[wc] = bin(dict_size, self.n)
                dict_size += 1
                w = c

        # Output the code for w.
        if w:
            result.append(dictionary[w])
        return result


    def decompress(self, compressed):
        """Decompress a list of output ks to a string."""
        
        # Build the dictionary.
        dictionary = {bin(i, self.n): c for i, c in enumerate(self.alphabet)}
        dict_size = len(dictionary)

        result = StringIO()
        w = dictionary[compressed[0]]
        result.write(w)
        for k in compressed[1:]:
            if k in dictionary:
                entry = dictionary[k]
            elif int(k, 2) == dict_size:
                entry = w + w[0]
            else:
                print(int(k, 2), dict_size)
                print(dictionary)
                raise ValueError('Bad compressed k: %s' % k)
            result.write(entry)

            # Add w+entry[0] to the dictionary.
            dictionary[bin(dict_size, self.n)] = w + entry[0]
            dict_size += 1

            w = entry
            
#         print(dictionary)
        return result.getvalue()

In [14]:
lzw = LZW(alphabet)
lzw2 = LZW('01')

In [15]:
compressed = lzw.compress(doc) 
doc == lzw.decompress(compressed)

True

In [16]:
j = ''.join(compressed)
l = len(j)

In [17]:
len(compressed), l / log2(l)

(25811, 16627.367347255247)

In [18]:
1 - l / log2(l) / len(compressed)

0.3558030550054144