In [6]:
class HuffmanNode:
    def __init__(self, char, freq, left, right):
        self.left = left
        self.right = right
        self.char = char
        self.freq = freq
    def get_nodes(self):
        return self.left, self.right
    def __str__(self):
            return "(" + str(self.char) + "," + str(self.freq) + ")"

In [7]:
import sys
class HuffmanCoding:
    
    # steps to create huffman code
    def huffman_code(self, input):
        codes = self.build_char_freq_map(input)
        node_queue = self.sort_by_freq(codes)
        self.root = self.build_tree(node_queue)
        huffman_code_map = self.assign_huffman_code(self.root)
        return huffman_code_map
    
    # create char frequency map from input string.
    def build_char_freq_map(self, input):
        codes = {}
        for c in input:
            codes[c] = codes.get(c, 0) + 1
        return codes
    
    def sort_by_freq(self, codes):
        node_queue = []
        for k, v in codes.items():
            node_queue.append(HuffmanNode(k, v, None, None))
        node_queue.sort(key=lambda x: x.freq)
        return node_queue
    
    def build_tree(self, node_queue):
        while len(node_queue)>1:
            node1 = node_queue.pop(0)
            node2 = node_queue.pop(0)
            node = HuffmanNode('', node1.freq + node2.freq, node1, node2)
            node_queue.append(node)
        return node_queue.pop(0)
    
    def assign_huffman_code(self, node, bin_str=''):
        code_map = {}
        self.assign_code_rec(node, code_map, bin_str='')
        return code_map
        
    def assign_code_rec(self, node, code_map, bin_str=''):
        if node.left == None and node.right == None:
            code_map[node.char] = bin_str
            return 
        self.assign_code_rec(node.left, code_map, bin_str+'0')
        self.assign_code_rec(node.right, code_map, bin_str+'1')
        
    def huffman_encoding(self, code_map, input_str):
        encoded_data = ''
        for i in range(len(input_str)):
            encoded_data += code_map.get(input_str[i])
        return encoded_data
        
    def huffman_decoding(self, encoded_data):
        decoded_data = ''
        curr_node = self.root
        for i in range(len(encoded_data)):
            if encoded_data[i] == '1' :
                curr_node = curr_node.right
            else:
                curr_node = curr_node.left
            if curr_node.left == None and curr_node.right == None:
                decoded_data += curr_node.char
                curr_node = self.root
        return decoded_data

In [8]:
def test(data):
    print ("The content of the data is: {}".format(data))
    print("The size of the data is:{}".format(sys.getsizeof(data)))
    huffman = HuffmanCoding()
    code_map = huffman.huffman_code(data)
    # print("code:" + str(code_map))
    encoded_data = huffman.huffman_encoding(code_map, data)
    decoded_data = huffman.huffman_decoding(encoded_data)
    
    if data != None:
        print ("The size of the encoded data is: {}".format(sys.getsizeof(int(encoded_data, base=2))))
        print ("The content of the encoded data is: {}".format(encoded_data))
    
        for char in str(encoded_data):
            if char != '0' and char != '1':
                print('The encoded data is not a binary code:',encoded_data)
                break
    
        print ("The size of the decoded data is: {}".format(sys.getsizeof(decoded_data)))
        print ("The content of the decoded data is: {}".format(decoded_data))
        if decoded_data != data:
            print('Fail: Decoded data:', decoded_data)
            print('is different from original input:',data)
        else:
            print('Pass: Decoded data matches original input')
    
        if int(sys.getsizeof(int(encoded_data, base=2))) >= int(sys.getsizeof(data)):
            print("Fail: There is no gain in size with the encoded data: {}, Vs original data: {}".format(sys.getsizeof(int(encoded_data, base=2)),sys.getsizeof(data)))
        else:
            print("Pass: Compression is working!")
        if sys.getsizeof(data) == sys.getsizeof(decoded_data):
            print("Pass: Size of data is equal to size of decoded data!\n")
        else:
            print("Fail: Size of data: {} not equal to size of decoded data: {}!\n".format(sys.getsizeof(data),sys.getsizeof(decoded_data)))
    else:
        if encoded_data == None and decoded_data == None:
            print("None input produces None output\n")
        else:
            print("None input not handled\n")


In [11]:
test("The bird is the word") # Expected output: Pass
test("I love Udacity, the problems are great and challenging!")# Expected output: Pass
test("AAAAAABBCCDDEEFFFFF") # Expected output: Pass


The content of the data is: The bird is the word
The size of the data is:69
The size of the encoded data is: 36
The content of the encoded data is: 1000111011110111001000001010011000101001110111110111101111001101001010
The size of the decoded data is: 69
The content of the decoded data is: The bird is the word
Pass: Decoded data matches original input
Pass: Compression is working!
Pass: Size of data is equal to size of decoded data!

The content of the data is: I love Udacity, the problems are great and challenging!
The size of the data is:104
The size of the encoded data is: 56
The content of the encoded data is: 1001010000101111001001101111000101001110101101111011111000110101101101000000100000111100010111001011100110000101011111001110101000011000100111100000110010011101100001100001100100111011000111100000011001010101011101000011111110100001111011
The size of the decoded data is: 104
The content of the decoded data is: I love Udacity, the problems are great and challenging!
Pass: Deco

In [3]:
input_str = "AAAAAABBCCDDEEFFFFF"
huffman = HuffmanCoding()
code_map = huffman.huffman_code(input_str)
print("code:" + str(code_map))
encoded_data = huffman.huffman_encoding(code_map, input_str)
decoded_data = huffman.huffman_decoding(encoded_data)
print(decoded_data)

code:{'F': '00', 'A': '01', 'B': '100', 'C': '101', 'D': '110', 'E': '111'}
AAAAAABBCCDDEEFFFFF
