In [6]:
# Compare two text files line by line and print every line that differs; if the files are identical, print a "no difference" message.
def compareFile(path1="midsummer.txt",path2="recon.txt"):
    """Compare two text files line by line and report the differences.

    For every line number at which the two files differ, the line number and
    the corresponding line from each file are printed. If one file is longer
    than the other, the surplus lines are reported as differences too. The
    "no difference" message is printed only when no differing line was found.

    Args:
        path1: Path of the first text file.
        path2: Path of the second text file.

    Returns:
        None. All results are written to standard output.

    Raises:
        OSError: If either file cannot be opened for reading.
    """
    # Local import so this definition does not depend on another cell's imports.
    from itertools import zip_longest

    identical = True
    # `with` guarantees both handles are closed even if reading fails midway.
    with open(path1, "r") as f1, open(path2, "r") as f2:
        # zip_longest pads the shorter file with None so that a length
        # mismatch is reported instead of being silently ignored.
        for i, (line1, line2) in enumerate(zip_longest(f1, f2), start=1):
            if line1 == line2:
                continue
            identical = False
            print("Differs at line", i, ":")
            for label, line in (("\tFile 1:", line1), ("\tFile 2:", line2)):
                if line is None:
                    line = "<no such line>\n"
                elif not line.endswith("\n"):
                    # A final line without a newline would otherwise run
                    # into the next piece of output.
                    line += "\n"
                print(label, line, end='')

    if identical:
        print("The input file and reconstructed file have no difference")

In [7]:
# huffman code 
import heapq
# binary tree with parent node value always smaller than the child value.
class huffmancoding:
    """Huffman coder that round-trips a text file through a string of bits.

    ``compress`` reads ``original_path``, derives a Huffman code from the
    character frequencies and writes the encoded text to ``compressed_path``
    as literal '0' and '1' characters (one character per bit; the file is
    not byte-packed). ``recon`` decodes that file into ``recon_path`` using
    the code table kept in memory, so it has to be called on the same
    instance after ``compress``.
    """

    def __init__(self,original_path,compressed_path,recon_path):
        """Store the three file paths and create empty coding state.

        Args:
            original_path: Text file to encode.
            compressed_path: File that receives the '0'/'1' encoded text.
            recon_path: File that receives the decoded text.
        """
        self.original_path = original_path
        self.compressed_path = compressed_path
        self.recon_path = recon_path
        self.heap = []             # min-heap of HeapNode, ordered by frequency
        self.codes = {}            # character -> bit string
        self.reverse_mapping = {}  # bit string -> character

    class HeapNode:
        """Huffman tree node, ordered by frequency so heapq can rank it."""

        def __init__(self,char,frequency):
            self.char = char        # the symbol for a leaf; None for a merged node
            self.freq = frequency
            self.left = None        # child popped first from the heap (lowest frequency)
            self.right = None       # child popped second from the heap

        def __lt__(self, other):
            return self.freq < other.freq

        def __eq__(self, other):
            # The bare name `HeapNode` is not visible from inside this method
            # (a class body is not an enclosing scope for its functions), so
            # the class is reached through the outer class instead.
            if not isinstance(other, huffmancoding.HeapNode):
                return False
            return self.freq == other.freq

    def count_frequency(self,text):
        """Return a dict mapping each character of ``text`` to its count."""
        frequency = {}
        for char in text:
            frequency[char] = frequency.get(char, 0) + 1
        return frequency

    def make_heap(self, frequency):
        """Push one leaf node per (character, count) pair onto ``self.heap``."""
        for key in frequency:
            node = self.HeapNode(key, frequency[key])
            heapq.heappush(self.heap, node)

    def combine_nodes(self):
        """Merge the two lowest-frequency nodes until at most one node remains."""
        while len(self.heap) > 1:
            node1 = heapq.heappop(self.heap)
            node2 = heapq.heappop(self.heap)
            merged_node = self.HeapNode(None, node1.freq + node2.freq)
            merged_node.left = node1
            merged_node.right = node2
            heapq.heappush(self.heap, merged_node)

    def make_codes_helper(self, root, current_code):
        """Walk the tree below ``root`` and record the code of every leaf.

        ``current_code`` is the bit string accumulated on the path so far;
        '0' is appended when descending left and '1' when descending right.
        """
        if root is None:
            return
        if root.char is not None:
            self.codes[root.char] = current_code
            self.reverse_mapping[current_code] = root.char
            return

        self.make_codes_helper(root.left, current_code + "0")
        self.make_codes_helper(root.right, current_code + "1")

    def make_codes(self):
        """Pop the tree root from the heap and fill the two code tables.

        Does nothing when the heap is empty (empty input text). When the
        input had a single distinct character the root is itself a leaf; it
        is given the one-bit code "0", because an empty code would encode
        the whole text as an empty string that cannot be decoded.
        """
        if not self.heap:
            return
        root = heapq.heappop(self.heap)
        start_code = "0" if root.char is not None else ""
        self.make_codes_helper(root, start_code)

    def get_encoded_text(self, text):
        """Return ``text`` with every character replaced by its bit string.

        Raises:
            KeyError: If ``text`` contains a character with no code.
        """
        # join() avoids the quadratic cost of repeated string concatenation.
        return "".join(self.codes[char] for char in text)

    def pad_encoded_text(self, encoded_text):
        """Pad a bit string to a multiple of 8 and prefix the padding length.

        Between 1 and 8 '0' bits are appended (a full 8 when the input length
        is already a multiple of 8), and the number of appended bits is
        prepended as an 8-bit binary field. Not called by ``compress``.
        """
        extra_padding = 8 - len(encoded_text) % 8
        padded_info = "{0:08b}".format(extra_padding)
        return padded_info + encoded_text + "0" * extra_padding

    def get_byte_array(self, padded_encoded_text):
        """Pack a bit string into a bytearray, 8 bits per byte.

        Not called by ``compress``.

        Raises:
            ValueError: If the length of the bit string is not a multiple of 8.
        """
        if len(padded_encoded_text) % 8 != 0:
            # Raise instead of exit(0): exiting would report success and, in
            # a notebook, take the kernel down with it.
            raise ValueError("Encoded text not padded properly")

        return bytearray(
            int(padded_encoded_text[i:i+8], 2)
            for i in range(0, len(padded_encoded_text), 8)
        )

    # compression:
    def compress(self):
        """Encode ``original_path`` and write the bit string to ``compressed_path``.

        The code tables are rebuilt from scratch on every call so that codes
        left over from an earlier run cannot interfere with decoding. Prints
        the size of the original text and of the encoded text.

        Raises:
            OSError: If a file cannot be opened.
        """
        # Plain 'r' is enough: the source file is only read.
        with open(self.original_path,'r') as filein:
            text = filein.read()

        self.heap = []
        self.codes = {}
        self.reverse_mapping = {}

        frequency = self.count_frequency(text)
        self.make_heap(frequency)
        self.combine_nodes()
        self.make_codes()
        compressed_text = self.get_encoded_text(text)

        with open(self.compressed_path,'w') as fileout:
            fileout.write(compressed_text)
        print("size of original text is:",len(text),"letters")
        print("size of compressed text is:",len(compressed_text),"bits")
        print("size of compressed text is:",len(compressed_text)/8,"bytes")

    # reconstruction:
    def decode_text(self, encoded_text):
        """Decode a '0'/'1' string using ``self.reverse_mapping``.

        Bits are accumulated until they match a known code; trailing bits
        that never complete a code are dropped.
        """
        current_code = ""
        decoded = []
        for bit in encoded_text:
            current_code += bit
            if current_code in self.reverse_mapping:
                decoded.append(self.reverse_mapping[current_code])
                current_code = ""

        return "".join(decoded)

    def recon(self):
        """Decode ``compressed_path`` and write the result to ``recon_path``.

        Relies on the code table built by a previous ``compress`` call on
        this instance.

        Raises:
            OSError: If a file cannot be opened.
        """
        with open(self.compressed_path,'r') as filein:
            compressed_text = filein.read().rstrip()

        recon_text = self.decode_text(compressed_text)
        with open(self.recon_path,'w') as fileout:
            fileout.write(recon_text)


In [8]:
# File locations for the round trip: source text, encoded bit string, decoded copy.
original_path, compressed_path, recon_path = (
    'midsummer.txt',
    'compressed.txt',
    'recon.txt',
)

# Encode the source, decode it again, then diff the decoded copy against the source.
huffman = huffmancoding(
    original_path=original_path,
    compressed_path=compressed_path,
    recon_path=recon_path,
)
huffman.compress()
huffman.recon()
compareFile(path1=original_path, path2=recon_path)

size of original text is: 92603
size of compressed text is: 55347.125 bytes
The input file and output file have no difference
