<a href="https://colab.research.google.com/github/pragyaye/File-Compression/blob/main/File_Compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import heapq
import os

class binaryTree:
  """Node of a Huffman tree that is ordered and compared by frequency.

  Attributes:
    value: the symbol stored at this node; internal (merged) nodes are
      created with ``None``.
    freq: occurrence count of the symbol, or the summed count of the
      subtree for an internal node.
    left, right: child nodes, ``None`` until assigned by the tree builder.
  """

  def __init__(self,value,freq):
    self.value=value
    self.freq=freq
    self.left=None
    self.right=None

  def __lt__(self,other):
    """Order nodes by frequency so ``heapq`` pops the rarest node first."""
    if not isinstance(other,binaryTree):
      # Let Python try the reflected operation / raise TypeError instead of
      # failing with an AttributeError on ``other.freq``.
      return NotImplemented
    return self.freq<other.freq

  def __eq__(self,other):
    """Two nodes are equal when their frequencies are equal."""
    if not isinstance(other,binaryTree):
      # Makes ``node == None`` evaluate to False rather than raising.
      return NotImplemented
    return self.freq==other.freq

  # Defining __eq__ already disables hashing implicitly; spell it out so the
  # unhashable status is visible (equality is by frequency, not identity).
  __hash__=None

class huffmanCode:
  """Huffman compressor/decompressor for a text file.

  ``compress()`` reads the file at ``path``, builds a Huffman code from its
  character frequencies and writes the packed bits to ``<name>.bin``.
  ``decompress()`` turns such a file back into text.

  The code table is kept only in memory on this instance (it is not stored
  in the ``.bin`` file), so ``decompress()`` must be called on the same
  instance that produced the file with ``compress()``.

  File layout: one byte holding the number of padding bits, followed by the
  encoded bit stream padded with ``0`` bits up to a whole number of bytes.
  """

  def __init__(self,path):
    self.path = path
    self.__heap=[]
    self.__code={}
    self.__reverseCode={}

  def __freq_from_text(self,text):
    """Return a dict mapping each character of ``text`` to its count."""
    freq_dict = {}
    for char in text:
      freq_dict[char] = freq_dict.get(char,0)+1
    return freq_dict

  def __build_heap(self,frequencyDict):
    """Push one leaf node per symbol onto the min-heap (keyed by frequency)."""
    for key,frequency in frequencyDict.items():
      heapq.heappush(self.__heap , binaryTree(key,frequency))

  def __build_binary_tree(self):
    """Merge the two rarest nodes repeatedly until one root remains."""
    while len(self.__heap)>1:
      bt_node_1=heapq.heappop(self.__heap)
      bt_node_2=heapq.heappop(self.__heap)
      newNode = binaryTree(None,bt_node_1.freq + bt_node_2.freq)
      newNode.left=bt_node_1
      newNode.right=bt_node_2
      heapq.heappush(self.__heap,newNode)

  def __treeCode_helper(self,root,currBits):
    """Walk the tree, recording the bit path to every leaf in both tables."""
    if root is None:
      return
    if root.value is not None:
      # Leaf: internal nodes are created with value None.
      self.__code[root.value]=currBits
      self.__reverseCode[currBits]=root.value
      return
    self.__treeCode_helper(root.left,currBits+'0')
    self.__treeCode_helper(root.right,currBits+'1')

  def __build_tree_code(self):
    """Pop the root from the heap and fill the code tables from it."""
    if not self.__heap:
      # Empty input: there are no symbols, hence nothing to encode.
      return
    root=heapq.heappop(self.__heap)
    if root.value is not None:
      # Only one distinct symbol: the root is itself a leaf. Give it a
      # one-bit code, otherwise its code would be '' and the text would
      # encode to nothing and could not be decoded.
      self.__treeCode_helper(root,'0')
      return
    self.__treeCode_helper(root,'')

  def __build_encoded_text(self,text):
    """Return ``text`` as a string of '0'/'1' using the current code table."""
    return ''.join(self.__code[char] for char in text)

  def __build_padded_text(self,encodedText):
    """Pad the bit string to a multiple of 8 and prefix the pad count.

    The first 8 bits of the result hold the number of '0' bits appended
    (0-7); no padding is added when the stream is already byte-aligned.
    """
    padVal=(8-len(encodedText)%8)%8
    pad_info="{0:08b}".format(padVal)
    return pad_info + encodedText + '0'*padVal

  def __build_byte_array(self,paddedText):
    """Split the bit string into 8-bit groups and return them as ints."""
    return [int(paddedText[i:i+8],2) for i in range(0,len(paddedText),8)]

  def compress(self):
    """Compress the file at ``self.path`` and return the output path.

    Trailing whitespace of the input is stripped before encoding, so it is
    not present in the decompressed result. The output is written next to
    the input with the extension replaced by ``.bin``.
    """
    print('Starting the compression...')
    fileName,fileExtension = os.path.splitext(self.path)
    outputPath=fileName+'.bin'

    # Start from a clean state so a previous compress() on this instance
    # cannot leak stale codes into this run.
    self.__heap=[]
    self.__code={}
    self.__reverseCode={}

    # Read-only access is enough for the input file.
    with open(self.path,'r') as file:
      text=file.read()
    # to remove extra spaces
    text=text.rstrip()

    frequencyDict=self.__freq_from_text(text)
    self.__build_heap(frequencyDict)
    self.__build_binary_tree()
    self.__build_tree_code()
    encodedText=self.__build_encoded_text(text)
    # Padding of encoded text
    paddedText=self.__build_padded_text(encodedText)
    finalBytes=bytes(self.__build_byte_array(paddedText))

    # Open the output only once the data is ready, so a failure above does
    # not leave an empty .bin file behind.
    with open(outputPath,'wb') as output:
      output.write(finalBytes)
    print('Compression Successful!')
    return outputPath

  def __removePadding(self,text):
    """Drop the 8-bit pad-count header and the padding bits it announces."""
    padding_value=int(text[:8],2)
    body=text[8:]
    # Slice by explicit length: ``body[:-0]`` would wrongly yield ''.
    return body[:len(body)-padding_value]

  def __decoded_text(self,text):
    """Decode a bit string by matching prefixes against the reverse table."""
    currentBits=''
    pieces=[]
    for bit in text:
      currentBits+=bit
      if currentBits in self.__reverseCode:
        pieces.append(self.__reverseCode[currentBits])
        currentBits=''
    return ''.join(pieces)

  def decompress(self,inputPath):
    """Decompress ``inputPath`` and return the path of the restored file.

    Uses the code table built by the last ``compress()`` call on this
    instance. The result is written to ``<name>_decompressed.txt``.

    Raises:
      ValueError: if the input file is empty (no pad-count header).
    """
    fileName,fileExtension=os.path.splitext(inputPath)
    outputPath=fileName+'_decompressed'+'.txt'
    # rb because reading binary
    with open(inputPath,'rb') as file:
      data=file.read()
    if not data:
      raise ValueError('compressed file is empty: missing padding header')
    # every byte becomes its zero-filled 8-bit representation
    bit_string=''.join(format(byte,'08b') for byte in data)
    # padding removal
    text_after_padding_removal = self.__removePadding(bit_string)
    actual_text=self.__decoded_text(text_after_padding_removal)
    with open(outputPath,'w') as output:
      output.write(actual_text)
    return outputPath

# Driver: ask for a file path, compress that file, then decompress the result.
# The same huffmanCode instance is used for both steps because decompress()
# decodes with the code table that compress() stored on the instance.
path=input("Enter the path of the file that is to be compressed")
h=huffmanCode(path)
# compress() returns the path of the .bin file it wrote
compressedFile=h.compress()
# decompress() returns the path of the restored text file
h.decompress(compressedFile)

Enter the path of the file that is to be compresseda.txt
Starting the compression...
Compression Successful!


'a_decompressed.txt'