# Huffman Coding

In [69]:
import heapq
import os
class BinaryTreeNode:
    def __init__(self, value, freq):
        self.value = value
        self.freq = freq
        self.left = None
        self.right = None
    
    def __lt__(self, other):
        return self.freq < other.freq
    
    def __eq__(self, other):
        return self.freq == other.freq
        
class HuffmanCoding:
    def __init__(self,path):
        self.path = path
        self.__heap = []
        self.__codes = {}
        self.__reverse_Codes = {}
        
    def __make_freq_dict(self, text):
        freq_dict = {}
        for char in text:
            freq_dict[char] = freq_dict.get(char, 0) + 1
        print(freq_dict)
        return freq_dict
    
    def __buildHeap(self, freq_dict):
        for key in freq_dict:
            freq = freq_dict[key]
            binaryTreeNode = BinaryTreeNode(key, freq)
            heapq.heappush(self.__heap, binaryTreeNode)
            
    def __makeBinaryTree(self):
        while len(self.__heap) > 1:
            min1 = heapq.heappop(self.__heap)
            min2 = heapq.heappop(self.__heap)
            freq_sum = min1.freq + min2.freq
            
            binTreeNode = BinaryTreeNode(None , freq_sum)
            binTreeNode.left = min1
            binTreeNode.right = min2
            heapq.heappush(self.__heap, binTreeNode)
        return
    
    
    def __constructCodesHelp(self, root, curr_bits):
        if root is None:
            return
        if root.value is not None:
            self.__codes[root.value] = curr_bits
            self.__reverse_Codes[curr_bits] = root.value
            return
        self.__constructCodesHelp(root.left, curr_bits + "0")
        self.__constructCodesHelp(root.right, curr_bits + "1")
    
    def __constructCodes(self):
        root = heapq.heappop(self.__heap)
        self.__constructCodesHelp(root, "")
        
    def __encodeText(self, text):
        encoded_text = ""
        for char in text:
            encoded_text += self.__codes[char]
        return encoded_text
    
    def __paddingText(self, encoded_text):
        to_pad = 8 - (len(self.__codes) % 8)
        for i in range(to_pad):
            encoded_text += "0"
        padded_info = "{0:08b}".format(to_pad)
        padded_encoded_text = padded_info + encoded_text
        return padded_encoded_text
    
    def __getBytesArray(self, padded_Encoded_text):
        array = []
        for i in range(0, len(padded_Encoded_text), 8):
            byte = padded_Encoded_text[i:i+8]
            array.append(int(byte, 2))
        return array
            
        
        
    
    def compress(self):
        file_name , file_extension = os.path.splitext(self.path)
        output_path = file_name + ".bin"
        # get file from path 
        with open(self.path, "r+") as f:
            text = f.read()
            text.rstrip()
        # read text from file
        
        # make frequency dictionay using the text
        freq_dict = self.__make_freq_dict(text)
        
        
        # construct the heap from the freq dictionary
        self.__buildHeap(freq_dict)
        
        # construct the binary tree from the heap
        self.__makeBinaryTree()
        
        
        # Construct the codes from the binary tree
        self.__constructCodes()
        
        # Create the encoded text
        
        encoded_text = self.__encodeText(text)
        
        # padding the Encoded Text
        padded_Encoded_text = self.__paddingText(encoded_text)
        
        # Put the encoded text into the binary file 
        bytes_array = self.__getBytesArray(padded_Encoded_text)
        
        final_bytes = bytes(bytes_array)
        
        # return the binary file as output
        with open(output_path, "wb") as output:
            output.write(final_bytes)
        return output_path
    
    def __removePadding(self, bit_string):
        padded_info = bit_string[:8]
        padded_info = int(padded_info, 2)
        
        text_removed_padding = bit_string[8: -1*padded_info]
        return text_removed_padding
    
    def __decode_Text(self, actual_text):
        curr_str = ""
        word_str = ""
        for bit in actual_text:
            curr_str += bit
            if curr_str in self.__reverse_Codes:
                word_str += self.__reverse_Codes[curr_str]
                curr_str = ""
        return word_str
        
    
    def decompress(self, input_path):
        file_name, file_extension = os.path.splitext(self.path)
        output_path = file_name + "_decompressed" + ".txt"
        print(output_path)
        with open(input_path, "rb") as file:
            bit_string = ""
            byte = file.read(1)
            while byte:
                byte = ord(byte)
                bits = bin(byte)[2:].rjust(8,'0')
                bit_string += bits
                byte = file.read(1)
        actual_text = self.__removePadding(bit_string)
        decoded_text = self.__decode_Text(actual_text)
        print(decoded_text)
        with open(output_path, "w") as output:
            output.write(decoded_text)
        return
            

In [70]:
h = HuffmanCoding("huffman.txt")

In [71]:
output_path =   h.compress()

{'a': 9, 'b': 5, 'c': 5, 'd': 5, 'e': 6, 's': 16, 'f': 8, 'r': 2, 'v': 1, 'g': 1}


In [72]:
h.decompress(output_path)

huffman_decompressed.txt
abcdedsafrssefffsaaabbbbccssssessaasefescfddasecvrffsgsasdb


In [68]:
print(output_path)

huffman.bin


In [36]:
a = 'abcabcttt'
d = {}
for i in a:
    d[i] = d.get(i, 0) + 1

In [37]:
d

{'a': 2, 'b': 2, 'c': 2, 't': 3}