In [1]:
import numpy as np
import matplotlib.pyplot as plt
from dahuffman import HuffmanCodec

In [2]:
def bin_2_symbole(word, table):
    w = [table[int("".join(map(str, word[i])), base=2)] for i in range(word.shape[0])]  
    return "".join(map(str, w))

def word_2_number(word): 
    return [int("".join(map(str, word[i])), base=2) for i in range(word.shape[0])]

def generate_sequ(nb_symb, long, proba):
    q=np.int(np.round(1 / proba))
    x = np.random.randint(0, q, nb_symb * long)
    x[x < 9] = 0
    x[x == 9] = 1
    x = x.reshape((nb_symb, long))
    return x

def vec_bin_array(arr, m): 
    to_str_func = np.vectorize(lambda x: np.binary_repr(x).zfill(m))
    strs = to_str_func(arr)
    ret = np.zeros(list(arr.shape) + [m], dtype=np.int8)
    for bit_ix in range(0, m):
        fetch_bit_func = np.vectorize(lambda x: x[bit_ix] == '1')
        ret[...,bit_ix] = fetch_bit_func(strs).astype("int8")
    return ret 

def FT(word,table):
    word=word_2_number(word)
    len_t = len(table)
    h = np.histogram(word, bins=np.arange(len_t + 1))
    temp = {}
    for i in range(len_t):
        temp[table[i]] = h[0][i]
    return temp

In [46]:
nb_symb = 2000
length = 5
prob=0.1
#table = ['1_','2_','3_','4_', '5_', '6_', '7_', '8_', '9_', '10_', '11_', '12_', '13_', '14_', '15_', '16_', '17_', '18_', '19_', '20_', '21_', '22_','23_', '24_', '25', '26_', '27_', '28_', '29_', '30_', '31_','32_']
table = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'é', 'à', 'ö', 'ä', 'è','ü']

matrice_binaire=generate_sequ(nb_symb, length, prob)
#print('matrice binaire\n' ,matrice_binaire)

liste_symboles=bin_2_symbole(matrice_binaire,table)
#print(bin_2_symbole(matrice_binaire,table))

In [47]:
table_frequ=FT(matrice_binaire,table)
#print('table des frequences:\n' ,table_frequ)

codec = HuffmanCodec.from_frequencies(table_frequ)
print('\ncodec Huffman selon frequence : \n')
codec.print_code_table()


codec Huffman selon frequence : 

Bits Code             Value Symbol
   3 000                  0 'e'
   6 001000               8 'u'
   6 001001               9 'k'
   6 001010              10 'r'
   6 001011              11 's'
  10 0011000000         192 't'
  10 0011000001         193 'w'
  10 0011000010         194 'z'
  10 0011000011         195 'à'
   8 00110001            49 'v'
  10 0011001000         200 'é'
  10 0011001001         201 'ö'
   9 001100101          101 'o'
   9 001100110          102 'x'
  10 0011001110         206 'h'
  16 0011001111000000 13248 'l'
  16 0011001111000001 13249 'n'
  15 001100111100001   6625 'ä'
  14 00110011110001    3313 'è'
  13 0011001111001     1657 'ü'
  12 001100111101       829 _EOF
  11 00110011111        415 'p'
   7 0011010             26 'm'
   7 0011011             27 'f'
   7 0011100             28 'j'
   7 0011101             29 'y'
   7 0011110             30 'g'
   7 0011111             31 'd'
   4 0100                 4 'b'
 

In [48]:
res = [] #sauf les vals égales à 0
for key in table_frequ.keys() : 
    if table_frequ[key]!=0:
        res.append(table_frequ[key]/nb_symb) 

print(np.sum(res))
print(res)
print(len(res))

my_entr=-np.sum(res*np.log2(res))
print("l'entropie pour notre exemple est de: ",np.round(my_entr,2))
max_entr=-23*(1/23)*np.log2(1/23)
print("l'entropie maximale pour notre exemple est de: ",np.round(max_entr,2))

temp = codec.encode(liste_symboles)
ww = []
for i in temp:
    ww.append(np.str(bin(i)[2:]))
ww = "".join(map(str, ww))
print('nb bits codage de huffman:',len(ww))

0.9999999999999998
[0.592, 0.057, 0.0655, 0.008, 0.071, 0.007, 0.0075, 0.001, 0.0645, 0.007, 0.0085, 0.0065, 0.0015, 0.0005, 0.062, 0.009, 0.01, 0.0005, 0.008, 0.002, 0.0005, 0.0015, 0.007, 0.0005, 0.0005, 0.0005, 0.0005]
27
l'entropie pour notre exemple est de:  2.36
l'entropie maximale pour notre exemple est de:  4.52
nb bits codage de huffman: 4357
