# Huffman coding combines several data structures (a min-heap, a binary tree and dictionaries) in one program.
# It reads the text file at the given path and encodes it in a compact binary form so that space can be saved.

# In [30]:
import heapq
import os

class BinaryTreeNode:
    """Node of a Huffman tree, ordered by ``frequency`` so it can sit in a heap.

    Attributes:
        value: The symbol stored in the node. ``HuffManCoding`` creates its
            internal (merged) nodes with ``value=None``.
        frequency: Weight used for ordering and equality.
        left: Left child, ``None`` until assigned.
        right: Right child, ``None`` until assigned.
    """

    def __init__(self, value, frequency):
        self.value = value
        self.frequency = frequency
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Return True when this node's frequency is lower than ``other``'s."""
        # Defer to Python's fallback instead of crashing on foreign operands.
        if not isinstance(other, BinaryTreeNode):
            return NotImplemented
        return self.frequency < other.frequency

    def __eq__(self, other):
        """Return True when both nodes carry the same frequency."""
        # ``node == None`` must evaluate to False, not raise AttributeError.
        if not isinstance(other, BinaryTreeNode):
            return NotImplemented
        return self.frequency == other.frequency

    # Equality is frequency-based rather than identity-based, so nodes stay
    # unhashable (this is what defining __eq__ alone already implied).
    __hash__ = None
    
class HuffManCoding:
    """Huffman-compress the text file at ``path`` and restore it again.

    The code table is kept only in memory on the instance; it is not written
    into the ``.bin`` file. ``decompress`` therefore only yields the original
    text when called on the same instance that produced the file through
    ``compress``.
    """

    def __init__(self, path):
        self.path = path
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

    def __make_frequency_dictionary(self, text):
        """Return a dict mapping every character of ``text`` to its count."""
        freq = {}
        for char in text:
            freq[char] = freq.get(char, 0) + 1
        return freq

    def __buildHeap(self, freq_dictionary):
        """Push one leaf node per (character, frequency) pair onto the heap."""
        for key, frequency in freq_dictionary.items():
            heapq.heappush(self.__heap, BinaryTreeNode(key, frequency))

    def __buildTree(self):
        """Merge the two lightest nodes repeatedly until one root remains."""
        while len(self.__heap) > 1:
            lighter = heapq.heappop(self.__heap)
            heavier = heapq.heappop(self.__heap)
            # Internal nodes carry no symbol, only the combined weight.
            parent = BinaryTreeNode(None, lighter.frequency + heavier.frequency)
            parent.left = lighter
            parent.right = heavier
            heapq.heappush(self.__heap, parent)

    def __buildCodesHelper(self, root, curr_bits):
        """Record the bit string of every leaf below ``root``.

        ``curr_bits`` is the prefix accumulated on the way down to ``root``;
        a left edge appends "0" and a right edge appends "1". The walk uses an
        explicit stack so a very deep tree cannot hit the recursion limit.
        """
        pending = [(root, curr_bits)]
        while pending:
            node, bits = pending.pop()
            if node is None:
                continue
            if node.value is not None:
                # Leaf: register the code in both lookup directions.
                self.__codes[node.value] = bits
                self.__reverseCodes[bits] = node.value
                continue
            pending.append((node.right, bits + "1"))
            pending.append((node.left, bits + "0"))

    def __buildCodes(self):
        """Pop the tree root off the heap and fill both code tables."""
        if not self.__heap:
            # Empty input: there is no tree and therefore nothing to encode.
            return
        root = heapq.heappop(self.__heap)
        # With a single distinct character the root is itself a leaf; it would
        # otherwise receive the empty code and the text would encode to nothing.
        start_bits = "0" if root.value is not None else ""
        self.__buildCodesHelper(root, start_bits)

    def __getEncodedText(self, text):
        """Return ``text`` as the concatenation of its characters' codes."""
        return "".join(self.__codes[char] for char in text)

    def __getPaddedEncodedText(self, encoded_text):
        """Pad ``encoded_text`` to whole bytes and prefix the padding length.

        The first 8 bits of the result hold the number of "0" bits appended at
        the end (0-7), so the decoder can strip them again.
        """
        padded_amount = (8 - len(encoded_text) % 8) % 8
        padded_info = "{0:08b}".format(padded_amount)
        return padded_info + encoded_text + "0" * padded_amount

    def __getBytesArray(self, papped_encoded_text):
        """Convert a bit string into a list of ints, one per 8-bit group."""
        return [
            int(papped_encoded_text[i:i + 8], 2)
            for i in range(0, len(papped_encoded_text), 8)
        ]

    def compress(self):
        """Compress the file at ``self.path`` into a sibling ``.bin`` file.

        Trailing whitespace of the input is stripped before encoding, so it is
        not part of the compressed data. Prints "Compressed" when done.

        Returns:
            The path of the written ``.bin`` file.
        """
        file_name, _ = os.path.splitext(self.path)
        output_path = file_name + '.bin'
        # The input is only read, so plain read mode is enough.
        with open(self.path, 'r') as f:
            text = f.read()
        text = text.rstrip()

        # Start from a clean state so a repeated call cannot reuse stale codes.
        self.__heap = []
        self.__codes = {}
        self.__reverseCodes = {}

        freq_dictionary = self.__make_frequency_dictionary(text)
        self.__buildHeap(freq_dictionary)
        self.__buildTree()
        self.__buildCodes()
        encoded_text = self.__getEncodedText(text)
        padded_encoded_text = self.__getPaddedEncodedText(encoded_text)
        bytes_array = self.__getBytesArray(padded_encoded_text)
        with open(output_path, 'wb') as output:
            output.write(bytes(bytes_array))
        print("Compressed")
        return output_path

    def __removePadding(self, text):
        """Strip the 8-bit padding header and the trailing padding bits.

        Raises:
            ValueError: If ``text`` is too short to contain the header.
        """
        if len(text) < 8:
            raise ValueError("compressed data is missing the padding header")
        extra_padding = int(text[:8], 2)
        body = text[8:]
        # Slice by an explicit end index: ``body[:-0]`` would drop everything.
        return body[:max(len(body) - extra_padding, 0)]

    def __decodedText(self, text):
        """Translate a bit string back to characters using the code table."""
        decoded = []
        current_bits = ""
        for bit in text:
            current_bits += bit
            if current_bits in self.__reverseCodes:
                decoded.append(self.__reverseCodes[current_bits])
                current_bits = ""
        return "".join(decoded)

    def decompress(self, input_file):
        """Decode ``input_file`` into ``<path stem>_decompressed.txt``.

        Relies on the code table built by a previous ``compress`` call on this
        instance.

        Returns:
            The path of the written text file.
        """
        filename, _ = os.path.splitext(self.path)
        output_path = filename + "_decompressed" + ".txt"
        with open(input_file, 'rb') as file:
            data = file.read()
        bit_string = "".join("{0:08b}".format(byte) for byte in data)
        actual_text = self.__removePadding(bit_string)
        decompressed_text = self.__decodedText(actual_text)
        with open(output_path, 'w') as output:
            output.write(decompressed_text)
        return output_path

if __name__ == "__main__":
    # Demo run: compress the sample file, then decompress the result with the
    # same instance (the code table lives only in memory on that instance).
    # Guarded so that importing this module does not touch the file system.
    path = 'C:\\Users\\sahil\\Desktop\\sample.txt'
    h = HuffManCoding(path)
    output_path = h.compress()
    h.decompress(output_path)

# Output: Compressed
