# Nucleotide example
- Strings of nucleotides form genes
- Nucleotides take the values A, C, G, T
- Stored as a character in a string, each nucleotide takes at least 8 bits (one byte)
- 2 bits are enough to represent four different values, so packing nucleotides into a bit string can save roughly 75% of the space (slightly less in practice because of fixed per-object overhead)

In [1]:
# Class to convert genes to binary representations
class CompressedGene:
    """Store a gene (a string of A, C, G, T) packed two bits per nucleotide.

    The packed value lives in ``bit_string``, an ``int`` whose most
    significant set bit is a sentinel ``1``. The sentinel keeps leading
    ``A`` nucleotides (encoded as ``00``) from being lost and makes the
    length recoverable from ``int.bit_length()``. The first nucleotide of
    the gene occupies the two bits directly below the sentinel; the last
    nucleotide occupies the two least significant bits.
    """

    # Two-bit code assigned to each nucleotide.
    _ENCODE = {"A": 0b00, "C": 0b01, "G": 0b10, "T": 0b11}
    # Inverse mapping: the two-bit code is the index into this string.
    _DECODE = "ACGT"

    def __init__(self, gene: str) -> None:
        """Compress ``gene`` (case-insensitive) into ``self.bit_string``.

        Raises:
            ValueError: if ``gene`` contains a character other than
                A, C, G or T (after upper-casing).
        """
        self._compress(gene)

    def _compress(self, gene: str) -> None:
        """Pack ``gene`` into an int and store it as ``self.bit_string``.

        The value is accumulated in a local and assigned only once every
        character has been validated, so a failed call never leaves a
        half-built ``bit_string`` on the instance.

        Raises:
            ValueError: on the first character that is not A, C, G or T.
        """
        # Start with sentinel
        packed = 1
        for nucleotide in gene.upper():
            code = self._ENCODE.get(nucleotide)
            if code is None:
                raise ValueError("Invalid Nucleotide:{}".format(nucleotide))
            # Make room for two new bits, then add them
            packed = (packed << 2) | code
        self.bit_string: int = packed

    def decompress(self) -> str:
        """Return the gene as an upper-case string of A, C, G, T."""
        packed = self.bit_string
        # Walk the two-bit groups from least significant upward, stopping
        # before the sentinel bit; this yields the nucleotides last-to-first.
        nucleotides = [
            self._DECODE[(packed >> shift) & 0b11]
            for shift in range(0, packed.bit_length() - 1, 2)
        ]
        # Restore the original first-to-last order before joining.
        nucleotides.reverse()
        return "".join(nucleotides)

    def __str__(self) -> str:  # string representation for pretty printing
        """Return the decompressed gene."""
        return self.decompress()

In [2]:
# Example usage: compare the size of a gene string with its packed form
from sys import getsizeof

# Build a long gene by repeating one fragment 100 times
fragment = "TAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATATAGGGATTAACCGTTATATATATATAGCCATGGATCGATTATA"
original = fragment * 100
original_size = getsizeof(original)
print("original is {} bytes".format(original_size))

# Pack the gene and measure the int that holds the bits
compressed = CompressedGene(original)
compressed_size = getsizeof(compressed.bit_string)
print("compressed is {} bytes".format(compressed_size))

# Note: print(compressed) would show the full decompressed gene via __str__
# Round-trip check: decompressing must give back the exact input
round_trip_ok = original == compressed.decompress()
print("original and decompressed are the same: {}".format(round_trip_ok))

original is 8649 bytes
compressed is 2320 bytes
original and decompressed are the same: True
