# Huffman algorithm with compression ratio

# Load file -> Huffman algorithm -> print compression rate

## Load file

In [1]:
# Load text from the file; the context manager closes the handle once the
# content has been read (the bare open() call never closed it).
with open("test.txt", "r") as file:
  # Newline characters are dropped, so they are never counted as symbols later on.
  text = file.read().replace('\n', '')

text

'Oi, eu sou Goku!'

## Huffman algorithm

In [2]:
# Count how many times each character occurs in the text
my_dict = {}
for char in text:
  # A character seen for the first time starts from a count of zero
  my_dict[char] = my_dict.get(char, 0) + 1

my_dict

{'O': 1,
 'i': 1,
 ',': 1,
 ' ': 3,
 'e': 1,
 'u': 3,
 's': 1,
 'o': 2,
 'G': 1,
 'k': 1,
 '!': 1}

In [3]:
# Node class
class Node:
  """A node of the Huffman tree: a leaf for one symbol or a merged inner node."""

  def __init__(self, frequency, symbol, left=None, right=None):
    # How often the symbol occurs, and the symbol name (character) itself
    self.frequency, self.symbol = frequency, symbol

    # Left and right child nodes of the current node (None when not given)
    self.left, self.right = left, right

    # Bit label of this node; starts empty and is assigned by the tree-building code
    self.binary = ''

In [4]:
# Create tree: start with one leaf per distinct symbol
nodes = [Node(count, char) for char, count in my_dict.items()]

# Repeatedly merge the two least frequent nodes until only the root is left
while len(nodes) > 1:
  # list.sort is stable, so nodes with equal frequency keep their relative order
  nodes.sort(key=lambda item: item.frequency)

  left, right = nodes[0], nodes[1]

  # Label the two branches below the new parent
  left.binary = '0'
  right.binary = '1'

  new_node = Node(left.frequency + right.frequency, left.symbol + right.symbol, left, right)

  # Swap the merged pair (the first two entries) for their parent
  del nodes[:2]
  nodes.append(new_node)

In [5]:
# Create huffman table
def huffman_table(node, binary='', table=None):
    """Build the symbol -> code table for the tree rooted at ``node``.

    Args:
        node: Tree node exposing ``binary``, ``left``, ``right`` and ``symbol``.
        binary: Code accumulated on the path above ``node``.
        table: Dict to fill in. A new dict is created when it is omitted.

    Returns:
        The dict mapping every leaf symbol to its code string.
    """
    # A ``{}`` default is created once and shared by every call, so a second
    # call would still contain the codes of the first tree. Build it per call.
    if table is None:
        table = {}

    binary_word = binary + node.binary

    if node.left:
        huffman_table(node.left, binary_word, table)

    if node.right:
        huffman_table(node.right, binary_word, table)

    if (not node.left) and (not node.right):
        # A leaf reached with an empty path would get a zero-length code
        # (this happens when the tree is a single leaf); fall back to '0'
        # so every symbol is encoded with at least one bit.
        table[node.symbol] = binary_word or '0'
        # print(f"{node.symbol} -> {binary_word}")

    return table

# nodes[0] is the root of the tree once the merge loop has finished
h_table = huffman_table(node=nodes[0])

### Create auxiliary (fixed-length) table

In [6]:
# Fixed-length reference table, derived from the symbols actually present
# instead of being hard-coded for one sentence. Every symbol gets a code of
# the same width, numbered in the order the symbols were first counted.
# For "Oi, eu sou Goku!" (11 symbols -> 4 bits) this yields
# 'O' -> '0000', 'i' -> '0001', ..., '!' -> '1010'.
# (n - 1).bit_length() is the number of bits needed to number n symbols;
# at least one bit is used even when there is a single symbol.
code_width = max(1, (len(my_dict) - 1).bit_length())
table = {symbol: format(index, f'0{code_width}b') for index, symbol in enumerate(my_dict)}

print('Huffman table:\n', h_table, '\n\nTable:\n', table)

Huffman table:
 {'u': '00', 'o': '010', 'O': '0110', 'i': '0111', ',': '1000', 'e': '1001', 's': '1010', 'G': '1011', 'k': '1100', '!': '1101', ' ': '111'} 

Table:
 {'O': '0000', 'i': '0001', ',': '0010', ' ': '0011', 'e': '0100', 'u': '0101', 's': '0110', 'o': '0111', 'G': '1000', 'k': '1001', '!': '1010'}


### Binary word and binary compressed word

In [7]:
# Compression: encode the text once with each table
print(text)

word_pieces = []
compressed_pieces = []

for char in text:
  # Fixed-length code first, then the Huffman code of the same character
  word_pieces.append(table[char])
  compressed_pieces.append(h_table[char])

word = ''.join(word_pieces)
compressed_word = ''.join(compressed_pieces)

print('\nWord:\n', word, '\n\nCompressed word:\n', compressed_word)

Oi, eu sou Goku!

Word:
 0000000100100011010001010011011001110101001110000111100101011010 

Compressed word:
 01100111100011110010011110100100011110110101100001101


### Compression rate

In [8]:
# Compression rate: percentage of bits saved relative to the fixed-length word
compressed_share = (len(compressed_word) * 100) / len(word)
compression_rate = 100 - compressed_share

print(f'Compression rate = {compression_rate:,.2f}%')

Compression rate = 17.19%
