In [1]:
from heapq import heappush, heappop, heapify
import collections
import numpy as np

def entropy(seq):
    return sum(-p * np.log2(p) for p in seq)


def count_freq(text):
    symb2freq = collections.Counter(text)
    norm = sum(symb2freq.values())
    return {k: v/norm for k, v in symb2freq.items()}

In [2]:
class Huffman:
    """Huffman encode the given dict mapping symbols to weights"""
    
    def __init__(self, freq):
        if isinstance(freq, (str, list)):
            freq = count_freq(freq)
        
        heap = [[wt, [sym, '']] for sym, wt in freq.items()]
        heapify(heap)

        while len(heap) > 1:
            lo, hi = heappop(heap), heappop(heap)

            for pair in lo[1:]:
                pair[1] = '0' + pair[1]
            for pair in hi[1:]:
                pair[1] = '1' + pair[1]

            heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
        
        self.symbols = list(freq.keys())
        self.freq = freq
        self.codes = dict(heappop(heap)[1:])
        
    def encode(self, string):
        return ''.join([self.codes[c] for c in string])
    
    def entropy(self):
        return entropy(self.freq.values())
    
    def __str__(self):
        res = "Symbol\tP\tHuffman Code\n"
        res += '\n'.join([
                "'%s'\t%.4f\t%s" % (c, f, self.codes[c]) 
                for c, f in sorted(self.freq.items(), key=lambda x: -x[1])
            ])
        return res
            
    def mean_length(self):
        return np.mean([len(c) for c in self.codes.values()])
    
    def __getitem__(self, key):
        return self.freq[key]

In [3]:
huff = Huffman({
    'a': .1,
    'b': .4,
    'c': .25,
    'd': .25
})

print(huff)

print()
print('mean length =', huff.mean_length())
print('entropy =', huff.entropy())

Symbol	P	Huffman Code
'b'	0.4000	0
'c'	0.2500	111
'd'	0.2500	10
'a'	0.1000	110

mean length = 2.25
entropy = 1.86096404744


In [4]:
huff = Huffman("this is an example for huffman encoding")

print(huff)

print()
print('mean length =', huff.mean_length())
print('entropy =', huff.entropy())

Symbol	P	Huffman Code
' '	0.1538	101
'n'	0.1026	010
'i'	0.0769	1110
'e'	0.0769	1100
'a'	0.0769	1001
'f'	0.0769	1101
'm'	0.0513	0010
's'	0.0513	0111
'h'	0.0513	0001
'o'	0.0513	0011
'u'	0.0256	10001
'd'	0.0256	111111
't'	0.0256	10000
'p'	0.0256	01100
'x'	0.0256	11110
'g'	0.0256	00000
'r'	0.0256	01101
'c'	0.0256	111110
'l'	0.0256	00001

mean length = 4.47368421053
entropy = 3.98977850058


# Lab 4

In [5]:
%%writefile utils.py
import collections

trans = collections.defaultdict(str)

for s in 'йцукенгшщзхфывапролджэячсмитьбю ':
    trans[s] = s

trans['ъ'] = 'ь'
trans['ё'] = 'е'
trans['\n'] = ' '

def transform(string):
    return ''.join([trans[s] for s in string.lower()])


Overwriting utils.py


In [6]:
from utils import transform

In [7]:
rusfreq = {
    ' ': .174,
    'о': .090,
    'е': .072,
    'а': .062,
    'и': .062,
    'т': .053,
    'н': .053,
    'с': .045,
    'р': .040,
    'в': .038,
    'л': .035,
    'к': .028,
    'м': .026,
    'д': .025,
    'п': .023,
    'у': .021,
    'я': .018,
    'ы': .016,
    'з': .016,
    'ь': .014,
    'б': .014,
    'г': .013,
    'ч': .012,
    'й': .010,
    'х': .009,
    'ж': .007,
    'ю': .006,
    'ш': .006,
    'ц': .004,
    'щ': .003,
    'э': .003,
    'ф': .002,
}

assert len(rusfreq) == 32
assert abs(sum(rusfreq.values()) - 1) < 1e-6, str(sum(rusfreq.values()))

In [8]:
with open('gore.txt') as f:
    doc = ''.join([transform(line) for line in f])

In [9]:
len(doc)

95336

In [10]:
enc = Huffman(rusfreq)
encoded = enc.encode(doc)
print('mean length:', len(encoded) / len(doc))
print('entropy:', enc.entropy())

freq_enc = enc

mean length: 4.40620542082739
entropy: 4.35586052699


In [11]:
enc = Huffman(doc)
encoded = enc.encode(doc)
print('mean length:', len(encoded) / len(doc))
print('entropy:', enc.entropy())

count_enc = enc

mean length: 4.367730972560208
entropy: 4.33518726706


$\mathrm{H}(p, q) = -\sum_x p(x)\, \log q(x).$

In [12]:
cross_entropy = -sum(
    count_enc[k] * np.log(freq_enc[k]) for k in count_enc.symbols
)
print("Cross entropy:", cross_entropy)

Cross entropy: 3.01650875963


In [13]:
pdoc = [doc[2*i:2*i+2] for i in range(len(doc) // 2)]

In [14]:
double_enc = Huffman(pdoc)

encoded = double_enc.encode(pdoc)

In [15]:
print('mean length:', len(encoded) / len(pdoc))
print('mean length (per symbol):', len(encoded) / len(doc))
print('entropy:', double_enc.entropy())

mean length: 7.945225308382982
mean length (per symbol): 3.972612654191491
entropy: 7.91870362066
