# 1.2 Trivial compression

***

### Python Bitwise Operators Review

OPERATOR | DESCRIPTION  | SYNTAX
---|---|---
&  | AND |x & y
\| | OR | x \| y
~ | NOT | ~x
^ | XOR | x ^ y
\>> | right shift | x >> n
<< | left shift | x << n








In [1]:
# Sample operands: 10 is 1010 and 5 is 0101 in binary
a, b = 10, 5
print(
    f"a = {a} and binary(a) = {bin(a)}",
    f"b = {b} and binary(b) = {bin(b)}",
    sep="\n",
)

a = 10 and binary(a) = 0b1010
b = 5 and binary(b) = 0b101


$a= 1010$

$b= 0101$

In [2]:
# Bitwise AND operator &: a result bit is 1 only where both operands have a 1
print(f"a & b = {a & b} and binary(a & b) = {bin(a & b)}")

a & b = 0 and binary(a & b) = 0b0


$a= 1010$

$\ \ \ \ \ \ \ \ \ \ \&$  AND

$b= 0101$

$r=  0000$

In [3]:
# Bitwise OR operator |: a result bit is 1 where at least one operand has a 1
print(f"a | b = {a | b} and binary(a | b) = {bin(a | b)}")

a | b = 15 and binary(a | b) = 0b1111


$a= 1010$

$\ \ \ \ \ \ \ \ \ \ |$   OR

$b= 0101$

$r=  1111$

In [4]:
# Second pair of operands: 9 is 1001 and 7 is 0111 in binary
x, y = 9, 7
print(
    f"x = {x} and binary(x) = {bin(x)}",
    f"y = {y} and binary(y) = {bin(y)}",
    sep="\n",
)

x = 9 and binary(x) = 0b1001
y = 7 and binary(y) = 0b111


$x= 1001$

$y= 0111$


In [5]:
# Bitwise XOR operator ^: a result bit is 1 only where the operands' bits differ
print(f"x ^ y = {x ^ y} and binary(x ^ y) = {bin(x ^ y)}")

x ^ y = 14 and binary(x ^ y) = 0b1110


$x= 1001$

$\ \ \ \ \ \ \ \ \ \ \oplus$  XOR (`^`)

$y= 0111$

$r=  1110$

In [6]:
# Bitwise NOT operator ~
# Python ints are signed with unlimited precision, so ~x evaluates to -x - 1;
# bin() shows a negative number as a minus sign followed by its magnitude.
print(f"x = {x} and binary(x) = {bin(x)}")
print(f"~x = {~x} and binary(~x) = {bin(~x)}")

x = 9 and binary(x) = 0b1001
~x = -10 and binary(~x) = -0b1010


$x= 1001$

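$\sim x = -1010$ !!! Why did we get `-0b1010` instead of `0110`?

Python integers are signed and have unlimited precision, so `~x` is always `-x - 1` (here `-10`) rather than a fixed-width complement. To fix this problem, combine `~` with `&` to keep only the bits we care about: `~x & 0b1111` gives `0b110`, i.e. $0110$.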

In [7]:
# Bitwise left shift operator <<
# Each shift by one position appends a 0 bit on the right, doubling the value.
print(f"x = {x} and binary(x) = {bin(x)}")
print(f"x << 1 = {x << 1} and binary(x << 1) = {bin(x << 1)}")
print(f"x << 2 = {x << 2} and binary(x << 2) = {bin(x << 2)}")
print(f"x << 3 = {x << 3} and binary(x << 3) = {bin(x << 3)}")
print(f"x << 4 = {x << 4} and binary(x << 4) = {bin(x << 4)}")

x = 9 and binary(x) = 0b1001
x << 1 = 18 and binary(x << 1) = 0b10010
x << 2 = 36 and binary(x << 2) = 0b100100
x << 3 = 72 and binary(x << 3) = 0b1001000
x << 4 = 144 and binary(x << 4) = 0b10010000


In [8]:
# Bitwise right shift operator >>
# Each shift by one position drops the lowest bit, i.e. x >> n == x // 2**n.
print(f"x = {x} and binary(x) = {bin(x)}")
print(f"x >> 1 = {x >> 1} and binary(x >> 1) = {bin(x >> 1)}")
print(f"x >> 2 = {x >> 2} and binary(x >> 2) = {bin(x >> 2)}")
print(f"x >> 3 = {x >> 3} and binary(x >> 3) = {bin(x >> 3)}")
print(f"x >> 4 = {x >> 4} and binary(x >> 4) = {bin(x >> 4)}")

x = 9 and binary(x) = 0b1001
x >> 1 = 4 and binary(x >> 1) = 0b100
x >> 2 = 2 and binary(x >> 2) = 0b10
x >> 3 = 1 and binary(x >> 3) = 0b1
x >> 4 = 0 and binary(x >> 4) = 0b0


Right-shifting by $n$ bits acts like floor division (`//`) by $2^n$, i.e. `x >> n == x // 2**n`:

$x >> n = \lfloor \frac{x}{2^n} \rfloor$

$ x = 9\ \ \ \ {(1001)}_2 $

$x >> 1 = \lfloor \frac{9}{2^1} \rfloor = 4 = {(100)}_2$

$x >> 2 = \lfloor \frac{9}{2^2} \rfloor = 2 = {(10)}_2$

$x >> 3 = \lfloor \frac{9}{2^3} \rfloor = 1 = {(1)}_2$

$x >> 4 = \lfloor \frac{9}{2^4} \rfloor = 0 = {(0)}_2$

***
## Trivial compression
In this section, we want to convert a DNA strand to bits (0 and 1) to reduce the amount of memory used. As you know, a DNA strand consists of 4 bases: **A**, **C**, **G**, and **T**, so each base can be mapped to two bits.  
* **A**: 00
* **C**: 01
* **G**: 10
* **T**: 11

So we write a class that does this conversion. 

We have also stored in a file a piece of human DNA that encodes the **Homo sapiens beta globin chain gene**. 

Source: https://www.ncbi.nlm.nih.gov/nuccore/AY260740.1


In [9]:

# >AY260740.1 Homo sapiens beta globin chain gene, complete cds
# Load the sequence file and glue its lines into one continuous strand
with open("../data/beta_globin_chain_gene.txt", "r") as f:
    original_dna = "".join(row.rstrip("\n") for row in f)
# Display the loaded strand
print(f"Original DNA: {original_dna}")

Original DNA: GGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCATACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCACCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGTTGGTATCAAGGTTACAAGACAGGTTTAAGGAGACCAATAGAAACTGGGCATGTGGAGACAGAGAAGACTCTTGGGTTTCTGATAGGCACTGACTCTCTCTGCCTATTGGTCTATTTTCCCACCCTTAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGGTGAGTCTATGGGACCCTTGATGTTTTCTTTCCCCTTCTTTTCTATGGTTAAGTTCATGTCATAGGAAGGGGAGAAGTAACAGGGTACAGTTTAGAATGGGAAACAGACGAATGATTGCATCAGTGTGGAAGTCTCAGGATCGTTTTAGTTTCTTTTATTTGCTGTTCATAACAATTGTTTTCTTTTGTTTAATTCTTGCTTTCTTTTTTTTTCTTCTCCGCAATTTTTACTATTATACTTAATGCCTTAACATTGTGTATAACAAAAGGAAATA

In [10]:
import sys

# sys.getsizeof reports the memory used by the str object itself, in bytes
print(f"Size of original DNA: {sys.getsizeof(original_dna)} bytes")

Size of original DNA: 1969 bytes


In [11]:
# Convert a DNA strand into a compact binary format (2 bits per nucleotide)
class CompressedDNA:
    """Store a DNA strand packed into one integer, two bits per nucleotide.

    Encoding: A=00, C=01, G=10, T=11. The first nucleotide occupies the most
    significant bit pair of ``binary``. The nucleotide count is kept next to
    the packed integer, because leading ``A`` nucleotides (00) contribute no
    set bits and could not be recovered from the integer alone.
    """

    def __init__(self, dna: str) -> None:
        """Normalize *dna* to uppercase and compress it.

        Args:
            dna: strand made of the letters A, C, G, T (any case).

        Raises:
            ValueError: if *dna* contains any other character.
        """
        self.dna = dna.upper()
        # Compress the normalized strand so lowercase input is accepted too.
        self._compress(self.dna)

    def _compress(self, dna: str) -> None:
        """Pack *dna* into ``self.binary`` and record its length.

        Args:
            dna: strand made of the uppercase letters A, C, G, T.

        Raises:
            ValueError: if *dna* contains any other character.
        """
        nt2bin = {"A": 0b00, "C": 0b01, "G": 0b10, "T": 0b11}
        binary = 0
        for nt in dna:
            if nt not in nt2bin:
                raise ValueError(f"Invalid DNA Nucleotide: {nt}")
            # Make room for two more bits, then store this nucleotide's code.
            binary = (binary << 2) | nt2bin[nt]
        self.binary: int = binary
        # Needed by decompress() to restore leading "A" (00) nucleotides.
        self._length: int = len(dna)

    def decompress(self) -> str:
        """Return the nucleotide string encoded in ``self.binary``."""
        bin2nt = {0b00: "A", 0b01: "C", 0b10: "G", 0b11: "T"}
        bits = self.binary
        nucleotides = []
        # Decode a fixed number of 2-bit groups instead of stopping when the
        # remaining bits are zero, which would drop leading "A" nucleotides.
        for _ in range(self._length):
            nucleotides.append(bin2nt[bits & 0b11])  # lowest 2 bits = last nt
            bits >>= 2
        # Groups were read from last to first, so reverse before joining.
        return "".join(reversed(nucleotides))

    def __repr__(self) -> str:
        """Return a constructor-like representation of the strand."""
        return f'{type(self).__name__}("{self.dna}")'

    def __str__(self) -> str:
        """Return the decompressed strand."""
        return self.decompress()



In [12]:
# Compress the strand, then compare the packed int's footprint with the source string's
compressed_dna = CompressedDNA(original_dna)
o_dna_size = sys.getsizeof(original_dna)  # bytes used by the str object
c_dna_size = sys.getsizeof(compressed_dna.binary)  # bytes used by the packed int object
print(
    f"Size of Compressed DNA: {c_dna_size} bytes",
    f"Size of original DNA: {o_dna_size} bytes",
    f"{((o_dna_size-c_dna_size)/o_dna_size)*100:.2f}% reduction in size",
    sep="\n",
)

Size of Compressed DNA: 536 bytes
Size of original DNA: 1969 bytes
72.78% reduction in size


In [13]:
# Round-trip check: decompressing the packed strand should reproduce original_dna
compressed_dna.decompress() == original_dna

True

In [14]:
# Show the packed integer as a '0b...' bit string (bin() omits leading zero bits)
bin(compressed_dna.binary)

'0b1010110011101010100101000010001000110011001101111100100010101000101010011110001010101111111000001011010100000111010111000010010100101110010100100000100010010100001010000100101011000110100111101101001101000111110010000101110100010101111011101000100101001100010101110010101011111010010100001101110001110101010010100010010010101000101010010010100010010100101010011110101001001100000000101101001010100100100010010100110111001111100111110001001111111001111101111000010001000001111011101111010001110010010000010111010000000100100001000101001110101110010001011110000111010111100010100010000010110111100101101111000111100101011110111010101001000010101110000001101110100011100000101111101011101011100010100101011110101001001010111110101100110100001010111100010000100001001010111111000010100010000101000011001000000001111010100100111011101000100001001000100000100001110111111010101111110111100011001010010001111000011101110111011110010111001111101011011100111111110101010001010111110010100111100111101011101

In [15]:
# Show the string representation; str() calls __str__, which returns decompress()
str(compressed_dna)

'GGTATGGGGCCAAGAGATATATCTTAGAGGGAGGGCTGAGGGTTTGAAGTCCAACTCCTAAGCCAGTGCCAGAAGAGCCAAGGACAGGTACGGCTGTCATCACTTAGACCTCACCCTGTGGAGCCATACCCTAGGGTTGGCCAATCTACTCCCAGGAGCAGGGAGGGCAGGAGCCAGGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTACATTTGCTTCTGACACAACTGTGTTCACTAGCAACCTCAAACAGACACCATGGTGCACCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAGTTGGTGGTGAGGCCCTGGGCAGGTTGGTATCAAGGTTACAAGACAGGTTTAAGGAGACCAATAGAAACTGGGCATGTGGAGACAGAGAAGACTCTTGGGTTTCTGATAGGCACTGACTCTCTCTGCCTATTGGTCTATTTTCCCACCCTTAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGGTGAGTCTATGGGACCCTTGATGTTTTCTTTCCCCTTCTTTTCTATGGTTAAGTTCATGTCATAGGAAGGGGAGAAGTAACAGGGTACAGTTTAGAATGGGAAACAGACGAATGATTGCATCAGTGTGGAAGTCTCAGGATCGTTTTAGTTTCTTTTATTTGCTGTTCATAACAATTGTTTTCTTTTGTTTAATTCTTGCTTTCTTTTTTTTTCTTCTCCGCAATTTTTACTATTATACTTAATGCCTTAACATTGTGTATAACAAAAGGAAATATCTCTGAGATACA